Aleph-w 3.0
A C++ Library for Data Structures and Algorithms
Loading...
Searching...
No Matches
bloom_filter_example.C
Go to the documentation of this file.
1
81#include <iostream>
82#include <iomanip>
83#include <string>
84#include <vector>
85#include <cmath>
86
87#include <tclap/CmdLine.h>
88
89#include <bloom-filter.H>
90
91using namespace std;
92using namespace Aleph;
93
94// =============================================================================
95// Helper functions
96// =============================================================================
97
98void print_section(const string& title)
99{
100 cout << "\n" << string(60, '=') << "\n";
101 cout << " " << title << "\n";
102 cout << string(60, '=') << "\n\n";
103}
104
105void print_subsection(const string& title)
106{
107 cout << "\n--- " << title << " ---\n";
108}
109
110// =============================================================================
111// 1. Basic Usage
112// =============================================================================
113
115{
116 print_section("BASIC BLOOM FILTER USAGE");
117
118 cout << "A Bloom filter tests set membership probabilistically.\n";
119 cout << "May have false positives, but NEVER false negatives.\n\n";
120
121 // Create a Bloom filter for strings
122 // Parameters: bit array size (m), number of hash functions (k)
123 size_t m = 1000; // 1000 bits
124 size_t k = 7; // 7 hash functions
125
127
128 cout << "Created Bloom filter:" << endl;
129 cout << " Bit array size (m): " << m << endl;
130 cout << " Hash functions (k): " << k << endl;
131
132 // Insert some elements
133 print_subsection("Inserting elements");
134 vector<string> words = {"cafe", "arepa", "empanada", "bandeja", "sancocho"};
135
136 for (const auto& w : words)
137 {
138 filter.insert(w);
139 cout << " Inserted: \"" << w << "\"" << endl;
140 }
141
142 // Test membership
143 print_subsection("Testing membership");
144
145 vector<string> test_words = {"cafe", "pizza", "arepa", "hamburguesa", "sancocho"};
146
147 for (const auto& w : test_words)
148 {
149 bool found = filter.contains(w);
150 cout << " \"" << w << "\": "
151 << (found ? "POSSIBLY present" : "DEFINITELY absent") << endl;
152 }
153
154 cout << "\nNote: \"POSSIBLY present\" may be a false positive." << endl;
155 cout << " \"DEFINITELY absent\" is always correct!" << endl;
156}
157
158// =============================================================================
159// 2. False Positive Rate
160// =============================================================================
161
163{
164 print_section("FALSE POSITIVE RATE ANALYSIS");
165
166 cout << "The false positive rate depends on m (bits), k (hashes), n (elements).\n";
167 cout << "Formula: P(FP) ≈ (1 - e^(-kn/m))^k\n\n";
168
169 // Insert known elements
170 size_t n = 100; // Number of elements to insert
171 size_t m = 1000; // Bit array size
172 size_t k = 7; // Number of hash functions
173
175
176 // Insert elements 0 to n-1
177 for (size_t i = 0; i < n; i++)
178 filter.insert(static_cast<int>(i));
179
180 cout << "Configuration:" << endl;
181 cout << " Elements inserted (n): " << n << endl;
182 cout << " Bit array size (m): " << m << endl;
183 cout << " Hash functions (k): " << k << endl;
184
185 // Theoretical false positive rate
186 const double theoretical_fp = pow(1.0 - exp(-static_cast<double>(k * n) / m), k);
187 cout << "\nTheoretical FP rate: " << fixed << setprecision(4)
188 << (theoretical_fp * 100) << "%" << endl;
189
190 // Empirical test: check elements NOT in the filter
191 print_subsection("Empirical test");
192
193 size_t test_count = 10000;
194 size_t false_positives = 0;
195
196 for (size_t i = n; i < n + test_count; i++)
197 {
198 if (filter.contains(static_cast<int>(i)))
200 }
201
202 double empirical_fp = static_cast<double>(false_positives) / test_count;
203
204 cout << "Tested " << test_count << " elements NOT in the filter:" << endl;
205 cout << " False positives: " << false_positives << endl;
206 cout << " Empirical FP rate: " << fixed << setprecision(4)
207 << (empirical_fp * 100) << "%" << endl;
208}
209
210// =============================================================================
211// 3. Optimal Parameters
212// =============================================================================
213
215{
216 print_section("OPTIMAL PARAMETERS");
217
218 cout << "For a target false positive rate p and n elements:\n";
219 cout << " Optimal m = -n * ln(p) / (ln(2))²\n";
220 cout << " Optimal k = (m/n) * ln(2) ≈ 0.693 * m/n\n\n";
221
222 // Example: design for 1% FP rate with 1000 elements
223 size_t n = 1000;
224 double target_fp = 0.01; // 1% false positive rate
225
226 // Calculate optimal parameters
227 double ln2 = log(2);
228 double m_optimal = -static_cast<double>(n) * log(target_fp) / (ln2 * ln2);
229 double k_optimal = (m_optimal / n) * ln2;
230
231 cout << "Target: " << (target_fp * 100) << "% FP rate for " << n << " elements" << endl;
232 cout << " Optimal m: " << static_cast<size_t>(ceil(m_optimal)) << " bits" << endl;
233 cout << " Optimal k: " << static_cast<size_t>(round(k_optimal)) << " hash functions" << endl;
234 cout << " Bits per element: " << fixed << setprecision(2) << (m_optimal / n) << endl;
235
236 // Compare different configurations
237 print_subsection("Comparison of configurations");
238
239 cout << setw(10) << "m" << setw(10) << "k"
240 << setw(15) << "FP Rate (%)" << setw(15) << "Bits/elem" << endl;
241 cout << string(50, '-') << endl;
242
244 {5000, 3}, {5000, 7}, {10000, 7}, {10000, 10}, {15000, 10}
245 };
246
247 for (auto [m, k] : configs)
248 {
249 double fp = pow(1.0 - exp(-static_cast<double>(k * n) / m), k);
250 cout << setw(10) << m << setw(10) << k
251 << setw(15) << fixed << setprecision(4) << (fp * 100)
252 << setw(15) << setprecision(2) << (static_cast<double>(m) / n) << endl;
253 }
254}
255
256// =============================================================================
257// 4. Practical Application: Spell Checker
258// =============================================================================
259
261{
262 print_section("PRACTICAL: Simple Spell Checker");
263
264 cout << "Use a Bloom filter as a fast first-pass spell checker.\n";
265 cout << "If word is 'definitely absent', it's misspelled.\n\n";
266
267 // Create dictionary filter
268 size_t dict_size = 50; // Expected dictionary size
269 size_t m = dict_size * 10; // ~1% FP rate
270 size_t k = 7;
271
273
274 // Add Spanish words to dictionary
276 "hola", "mundo", "casa", "perro", "gato", "agua", "fuego", "tierra",
277 "aire", "sol", "luna", "estrella", "mar", "cielo", "arbol", "flor",
278 "libro", "mesa", "silla", "puerta", "ventana", "calle", "ciudad",
279 "pueblo", "pais", "rio", "montana", "valle", "bosque", "campo",
280 "tiempo", "dia", "noche", "semana", "mes", "ano", "hora", "minuto",
281 "segundo", "mano", "pie", "cabeza", "ojo", "nariz", "boca", "oreja",
282 "corazon", "alma", "vida", "muerte"
283 };
284
285 cout << "Loading dictionary (" << spanish_words.size() << " words)..." << endl;
286 for (const auto& w : spanish_words)
288
289 // Check some text
290 print_subsection("Spell checking");
291
293 "hola", "munod", "casa", "perro", "gatoh", "agua", "xyz", "libro"
294 };
295
296 cout << "Checking words:\n";
297 for (const auto& w : text)
298 {
299 bool found = dictionary.contains(w);
300 cout << " \"" << w << "\": ";
301 if (found)
302 cout << "OK (in dictionary)" << endl;
303 else
304 cout << "MISSPELLED (not in dictionary)" << endl;
305 }
306
307 cout << "\nNote: 'OK' might be a false positive for unknown words." << endl;
308 cout << " 'MISSPELLED' is always correct!" << endl;
309}
310
311// =============================================================================
312// 5. Practical Application: Cache Filter
313// =============================================================================
314
316{
317 print_section("PRACTICAL: Database Cache Filter");
318
319 cout << "Use Bloom filter to avoid expensive database lookups.\n";
320 cout << "If key is 'definitely absent', skip the database query.\n\n";
321
322 // Simulate a cache with some IDs
323 size_t cache_size = 1000;
324 size_t m = cache_size * 10;
325 size_t k = 7;
326
328
329 // Populate cache with even IDs
330 cout << "Cache contains IDs: 0, 2, 4, 6, ..., 1998 (even numbers)" << endl;
331 for (int i = 0; i < 2000; i += 2)
333
334 // Simulate queries
335 print_subsection("Query simulation");
336
337 vector<int> queries = {100, 101, 500, 999, 1000, 1001, 1500, 9999};
338
339 int cache_hits = 0;
340 int db_lookups_saved = 0;
341
342 cout << setw(10) << "Query ID" << setw(20) << "Bloom Result"
343 << setw(20) << "Action" << endl;
344 cout << string(50, '-') << endl;
345
346 for (int id : queries)
347 {
348 bool maybe_cached = cache_filter.contains(id);
349 bool actually_cached = (id < 2000) and (id % 2 == 0); // Ground truth
350
351 cout << setw(10) << id;
352
353 if (maybe_cached)
354 {
355 cout << setw(20) << "Maybe present";
356 if (actually_cached)
357 {
358 cout << setw(20) << "Cache HIT" << endl;
359 cache_hits++;
360 }
361 else
362 cout << setw(20) << "False positive, DB lookup" << endl;
363 }
364 else
365 {
366 cout << setw(20) << "Definitely absent" << setw(20) << "Skip DB (saved!)" << endl;
368 }
369 }
370
371 cout << "\nResults:" << endl;
372 cout << " Cache hits: " << cache_hits << endl;
373 cout << " DB lookups saved: " << db_lookups_saved << endl;
374}
375
376// =============================================================================
377// 6. Space Comparison
378// =============================================================================
379
381{
382 print_section("SPACE EFFICIENCY");
383
384 cout << "Bloom filters trade accuracy for space efficiency.\n\n";
385
386 size_t n = 10000; // Number of elements
387
388 cout << "Storing " << n << " 64-bit integers:\n\n";
389
390 // Hash set: stores actual elements
391 size_t hashset_bytes = n * (sizeof(int64_t) + sizeof(void*) * 2); // Approx
392
393 // Bloom filter for various FP rates
394 cout << setw(15) << "FP Rate" << setw(15) << "Bits/elem"
395 << setw(15) << "Total KB" << setw(15) << "Savings" << endl;
396 cout << string(60, '-') << endl;
397
398 vector<double> fp_rates = {0.10, 0.05, 0.01, 0.001, 0.0001};
399
400 for (double fp : fp_rates)
401 {
402 double ln2 = log(2);
403 double m = -static_cast<double>(n) * log(fp) / (ln2 * ln2);
404 double bits_per_elem = m / n;
405 double bloom_kb = m / 8.0 / 1024.0;
406 double savings = 1.0 - (bloom_kb * 1024.0 / hashset_bytes);
407
408 cout << setw(14) << fixed << setprecision(4) << (fp * 100) << "%"
409 << setw(15) << setprecision(2) << bits_per_elem
410 << setw(15) << setprecision(2) << bloom_kb
411 << setw(14) << setprecision(1) << (savings * 100) << "%" << endl;
412 }
413
414 cout << "\nHash set (approximate): "
415 << fixed << setprecision(2) << (hashset_bytes / 1024.0) << " KB" << endl;
416}
417
418// =============================================================================
419// Main
420// =============================================================================
421
422int main(int argc, char* argv[])
423{
424 try
425 {
426 TCLAP::CmdLine cmd(
427 "Bloom filter example for Aleph-w.\n"
428 "Demonstrates probabilistic set membership testing.",
429 ' ', "1.0"
430 );
431
432 TCLAP::ValueArg<string> sectionArg(
433 "s", "section",
434 "Run only specific section: basic, fp, params, spell, cache, space, or 'all'",
435 false, "all", "section", cmd
436 );
437
438 cmd.parse(argc, argv);
439
440 string section = sectionArg.getValue();
441
442 cout << "\n";
443 cout << "============================================================\n";
444 cout << " ALEPH-W BLOOM FILTER EXAMPLE\n";
445 cout << "============================================================\n";
446
447 if (section == "all" or section == "basic")
448 demo_basic();
449
450 if (section == "all" or section == "fp")
452
453 if (section == "all" or section == "params")
455
456 if (section == "all" or section == "spell")
458
459 if (section == "all" or section == "cache")
461
462 if (section == "all" or section == "space")
464
465 cout << "\n" << string(60, '=') << "\n";
466 cout << "Bloom filter demo completed!\n";
467 cout << string(60, '=') << "\n\n";
468
469 return 0;
470 }
471 catch (TCLAP::ArgException& e)
472 {
473 cerr << "Error: " << e.error() << " for argument " << e.argId() << endl;
474 return 1;
475 }
476 catch (exception& e)
477 {
478 cerr << "Error: " << e.what() << endl;
479 return 1;
480 }
481}
482
int main()
Probabilistic set membership with Bloom filters.
void print_subsection(const string &title)
void demo_optimal_params()
void print_section(const string &title)
void demo_false_positives()
void demo_space_comparison()
void demo_cache_filter()
void demo_basic()
void demo_spell_checker()
long double w
Definition btreepic.C:153
Bloom filter implementation.
T & insert(const T &item)
Insert a new item by copy.
Definition htlist.H:1502
size_t size() const noexcept
Count the number of elements of the list.
Definition htlist.H:1319
__gmp_expr< T, __gmp_unary_expr< __gmp_expr< T, U >, __gmp_exp_function > > exp(const __gmp_expr< T, U > &expr)
Definition gmpfrxx.h:4066
__gmp_expr< T, __gmp_unary_expr< __gmp_expr< T, U >, __gmp_log_function > > log(const __gmp_expr< T, U > &expr)
Definition gmpfrxx.h:4063
__gmp_expr< T, __gmp_unary_expr< __gmp_expr< T, U >, __gmp_ceil_function > > ceil(const __gmp_expr< T, U > &expr)
Definition gmpfrxx.h:4056
__gmp_expr< typename __gmp_resolve_expr< T, V >::value_type, __gmp_binary_expr< __gmp_expr< T, U >, __gmp_expr< V, W >, __gmp_pow_function > > pow(const __gmp_expr< T, U > &expr1, const __gmp_expr< V, W > &expr2)
Definition gmpfrxx.h:4061
Main namespace for Aleph-w library functions.
Definition ah-arena.H:89
Container2< typename Container1::Item_Type > filter(Container1 &container, Operation &operation)
Filter elements that satisfy operation.
DynList< T > maps(const C &c, Op op)
Classic map operation.
STL namespace.