87#include <tclap/CmdLine.h>
100 cout <<
"\n" << string(60,
'=') <<
"\n";
102 cout << string(60,
'=') <<
"\n\n";
118 cout <<
"A Bloom filter tests set membership probabilistically.\n";
119 cout <<
"May have false positives, but NEVER false negatives.\n\n";
128 cout <<
"Created Bloom filter:" <<
endl;
129 cout <<
" Bit array size (m): " << m <<
endl;
130 cout <<
" Hash functions (k): " <<
k <<
endl;
134 vector<string> words = {
"cafe",
"arepa",
"empanada",
"bandeja",
"sancocho"};
136 for (
const auto&
w : words)
139 cout <<
" Inserted: \"" <<
w <<
"\"" <<
endl;
150 cout <<
" \"" <<
w <<
"\": "
151 << (
found ?
"POSSIBLY present" :
"DEFINITELY absent") <<
endl;
154 cout <<
"\nNote: \"POSSIBLY present\" may be a false positive." <<
endl;
155 cout <<
" \"DEFINITELY absent\" is always correct!" <<
endl;
166 cout <<
"The false positive rate depends on m (bits), k (hashes), n (elements).\n";
167 cout <<
"Formula: P(FP) ≈ (1 - e^(-kn/m))^k\n\n";
177 for (
size_t i = 0; i < n; i++)
178 filter.insert(
static_cast<int>(i));
181 cout <<
" Elements inserted (n): " << n <<
endl;
182 cout <<
" Bit array size (m): " << m <<
endl;
183 cout <<
" Hash functions (k): " <<
k <<
endl;
198 if (
filter.contains(
static_cast<int>(i)))
218 cout <<
"For a target false positive rate p and n elements:\n";
219 cout <<
" Optimal m = -n * ln(p) / (ln(2))²\n";
220 cout <<
" Optimal k = (m/n) * ln(2) ≈ 0.693 * m/n\n\n";
231 cout <<
"Target: " << (
target_fp * 100) <<
"% FP rate for " << n <<
" elements" <<
endl;
240 <<
setw(15) <<
"FP Rate (%)" <<
setw(15) <<
"Bits/elem" <<
endl;
244 {5000, 3}, {5000, 7}, {10000, 7}, {10000, 10}, {15000, 10}
249 double fp =
pow(1.0 -
exp(-
static_cast<double>(
k * n) / m),
k);
264 cout <<
"Use a Bloom filter as a fast first-pass spell checker.\n";
265 cout <<
"If word is 'definitely absent', it's misspelled.\n\n";
276 "hola",
"mundo",
"casa",
"perro",
"gato",
"agua",
"fuego",
"tierra",
277 "aire",
"sol",
"luna",
"estrella",
"mar",
"cielo",
"arbol",
"flor",
278 "libro",
"mesa",
"silla",
"puerta",
"ventana",
"calle",
"ciudad",
279 "pueblo",
"pais",
"rio",
"montana",
"valle",
"bosque",
"campo",
280 "tiempo",
"dia",
"noche",
"semana",
"mes",
"ano",
"hora",
"minuto",
281 "segundo",
"mano",
"pie",
"cabeza",
"ojo",
"nariz",
"boca",
"oreja",
282 "corazon",
"alma",
"vida",
"muerte"
293 "hola",
"munod",
"casa",
"perro",
"gatoh",
"agua",
"xyz",
"libro"
296 cout <<
"Checking words:\n";
297 for (
const auto&
w :
text)
300 cout <<
" \"" <<
w <<
"\": ";
302 cout <<
"OK (in dictionary)" <<
endl;
304 cout <<
"MISSPELLED (not in dictionary)" <<
endl;
307 cout <<
"\nNote: 'OK' might be a false positive for unknown words." <<
endl;
308 cout <<
" 'MISSPELLED' is always correct!" <<
endl;
319 cout <<
"Use Bloom filter to avoid expensive database lookups.\n";
320 cout <<
"If key is 'definitely absent', skip the database query.\n\n";
330 cout <<
"Cache contains IDs: 0, 2, 4, 6, ..., 1998 (even numbers)" <<
endl;
331 for (
int i = 0; i < 2000; i += 2)
337 vector<int>
queries = {100, 101, 500, 999, 1000, 1001, 1500, 9999};
342 cout <<
setw(10) <<
"Query ID" <<
setw(20) <<
"Bloom Result"
355 cout <<
setw(20) <<
"Maybe present";
362 cout <<
setw(20) <<
"False positive, DB lookup" <<
endl;
366 cout <<
setw(20) <<
"Definitely absent" <<
setw(20) <<
"Skip DB (saved!)" <<
endl;
384 cout <<
"Bloom filters trade accuracy for space efficiency.\n\n";
388 cout <<
"Storing " << n <<
" 64-bit integers:\n\n";
394 cout <<
setw(15) <<
"FP Rate" <<
setw(15) <<
"Bits/elem"
395 <<
setw(15) <<
"Total KB" <<
setw(15) <<
"Savings" <<
endl;
398 vector<double>
fp_rates = {0.10, 0.05, 0.01, 0.001, 0.0001};
403 double m = -
static_cast<double>(n) *
log(
fp) / (
ln2 *
ln2);
414 cout <<
"\nHash set (approximate): "
427 "Bloom filter example for Aleph-w.\n"
428 "Demonstrates probabilistic set membership testing.",
434 "Run only specific section: basic, fp, params, spell, cache, space, or 'all'",
435 false,
"all",
"section",
cmd
443 cout <<
"============================================================\n";
444 cout <<
" ALEPH-W BLOOM FILTER EXAMPLE\n";
445 cout <<
"============================================================\n";
465 cout <<
"\n" << string(60,
'=') <<
"\n";
466 cout <<
"Bloom filter demo completed!\n";
467 cout << string(60,
'=') <<
"\n\n";
471 catch (TCLAP::ArgException& e)
473 cerr <<
"Error: " << e.error() <<
" for argument " << e.argId() <<
endl;
478 cerr <<
"Error: " << e.what() <<
endl;
Probabilistic set membership with Bloom filters.
void print_subsection(const string &title)
void demo_optimal_params()
void print_section(const string &title)
void demo_false_positives()
void demo_space_comparison()
void demo_spell_checker()
Bloom filter implementation.
T & insert(const T &item)
Insert a new item by copy.
size_t size() const noexcept
Count the number of elements of the list.
__gmp_expr< T, __gmp_unary_expr< __gmp_expr< T, U >, __gmp_exp_function > > exp(const __gmp_expr< T, U > &expr)
__gmp_expr< T, __gmp_unary_expr< __gmp_expr< T, U >, __gmp_log_function > > log(const __gmp_expr< T, U > &expr)
__gmp_expr< T, __gmp_unary_expr< __gmp_expr< T, U >, __gmp_ceil_function > > ceil(const __gmp_expr< T, U > &expr)
__gmp_expr< typename __gmp_resolve_expr< T, V >::value_type, __gmp_binary_expr< __gmp_expr< T, U >, __gmp_expr< V, W >, __gmp_pow_function > > pow(const __gmp_expr< T, U > &expr1, const __gmp_expr< V, W > &expr2)
Main namespace for Aleph-w library functions.
Container2< typename Container1::Item_Type > filter(Container1 &container, Operation &operation)
Filter elements that satisfy operation.
DynList< T > maps(const C &c, Op op)
Classic map operation.