87#include <tclap/CmdLine.h>
100 cout <<
"\n" << string(60,
'=') <<
"\n";
101 cout <<
" " << title <<
"\n";
102 cout << string(60,
'=') <<
"\n\n";
107 cout <<
"\n--- " << title <<
" ---\n";
118 cout <<
"A Bloom filter tests set membership probabilistically.\n";
119 cout <<
"May have false positives, but NEVER false negatives.\n\n";
128 cout <<
"Created Bloom filter:" <<
endl;
129 cout <<
" Bit array size (m): " <<
m <<
endl;
130 cout <<
" Hash functions (k): " <<
k <<
endl;
134 vector<string> words = {
"cafe",
"arepa",
"empanada",
"bandeja",
"sancocho"};
136 for (
const auto&
w : words)
139 cout <<
" Inserted: \"" <<
w <<
"\"" <<
endl;
149 bool found =
filter.contains(
w);
150 cout <<
" \"" <<
w <<
"\": "
151 << (found ?
"POSSIBLY present" :
"DEFINITELY absent") <<
endl;
154 cout <<
"\nNote: \"POSSIBLY present\" may be a false positive." <<
endl;
155 cout <<
" \"DEFINITELY absent\" is always correct!" <<
endl;
166 cout <<
"The false positive rate depends on m (bits), k (hashes), n (elements).\n";
167 cout <<
"Formula: P(FP) ≈ (1 - e^(-kn/m))^k\n\n";
177 for (
size_t i = 0; i < n; i++)
178 filter.insert(
static_cast<int>(i));
180 cout <<
"Configuration:" <<
endl;
181 cout <<
" Elements inserted (n): " << n <<
endl;
182 cout <<
" Bit array size (m): " <<
m <<
endl;
183 cout <<
" Hash functions (k): " <<
k <<
endl;
187 cout <<
"\nTheoretical FP rate: " << fixed << setprecision(4)
198 if (
filter.contains(
static_cast<int>(i)))
204 cout <<
"Tested " <<
test_count <<
" elements NOT in the filter:" <<
endl;
206 cout <<
" Empirical FP rate: " << fixed << setprecision(4)
218 cout <<
"For a target false positive rate p and n elements:\n";
219 cout <<
" Optimal m = -n * ln(p) / (ln(2))²\n";
220 cout <<
" Optimal k = (m/n) * ln(2) ≈ 0.693 * m/n\n\n";
231 cout <<
"Target: " << (
target_fp * 100) <<
"% FP rate for " << n <<
" elements" <<
endl;
232 cout <<
" Optimal m: " <<
static_cast<size_t>(
ceil(
m_optimal)) <<
" bits" <<
endl;
233 cout <<
" Optimal k: " <<
static_cast<size_t>(
round(
k_optimal)) <<
" hash functions" <<
endl;
234 cout <<
" Bits per element: " << fixed << setprecision(2) << (
m_optimal / n) <<
endl;
239 cout << setw(10) <<
"m" << setw(10) <<
"k"
240 << setw(15) <<
"FP Rate (%)" << setw(15) <<
"Bits/elem" <<
endl;
241 cout << string(50,
'-') <<
endl;
244 {5000, 3}, {5000, 7}, {10000, 7}, {10000, 10}, {15000, 10}
249 double fp =
pow(1.0 -
exp(-
static_cast<double>(
k * n) /
m),
k);
250 cout << setw(10) <<
m << setw(10) <<
k
251 << setw(15) << fixed << setprecision(4) << (fp * 100)
252 << setw(15) << setprecision(2) << (
static_cast<double>(
m) / n) <<
endl;
264 cout <<
"Use a Bloom filter as a fast first-pass spell checker.\n";
265 cout <<
"If word is 'definitely absent', it's misspelled.\n\n";
276 "hola",
"mundo",
"casa",
"perro",
"gato",
"agua",
"fuego",
"tierra",
277 "aire",
"sol",
"luna",
"estrella",
"mar",
"cielo",
"arbol",
"flor",
278 "libro",
"mesa",
"silla",
"puerta",
"ventana",
"calle",
"ciudad",
279 "pueblo",
"pais",
"rio",
"montana",
"valle",
"bosque",
"campo",
280 "tiempo",
"dia",
"noche",
"semana",
"mes",
"ano",
"hora",
"minuto",
281 "segundo",
"mano",
"pie",
"cabeza",
"ojo",
"nariz",
"boca",
"oreja",
282 "corazon",
"alma",
"vida",
"muerte"
293 "hola",
"munod",
"casa",
"perro",
"gatoh",
"agua",
"xyz",
"libro"
296 cout <<
"Checking words:\n";
297 for (
const auto&
w :
text)
300 cout <<
" \"" <<
w <<
"\": ";
302 cout <<
"OK (in dictionary)" <<
endl;
304 cout <<
"MISSPELLED (not in dictionary)" <<
endl;
307 cout <<
"\nNote: 'OK' might be a false positive for unknown words." <<
endl;
308 cout <<
" 'MISSPELLED' is always correct!" <<
endl;
319 cout <<
"Use Bloom filter to avoid expensive database lookups.\n";
320 cout <<
"If key is 'definitely absent', skip the database query.\n\n";
330 cout <<
"Cache contains IDs: 0, 2, 4, 6, ..., 1998 (even numbers)" <<
endl;
331 for (
int i = 0; i < 2000; i += 2)
337 vector<int>
queries = {100, 101, 500, 999, 1000, 1001, 1500, 9999};
342 cout << setw(10) <<
"Query ID" << setw(20) <<
"Bloom Result"
343 << setw(20) <<
"Action" <<
endl;
344 cout << string(50,
'-') <<
endl;
351 cout << setw(10) << id;
355 cout << setw(20) <<
"Maybe present";
358 cout << setw(20) <<
"Cache HIT" <<
endl;
362 cout << setw(20) <<
"False positive, DB lookup" <<
endl;
366 cout << setw(20) <<
"Definitely absent" << setw(20) <<
"Skip DB (saved!)" <<
endl;
371 cout <<
"\nResults:" <<
endl;
384 cout <<
"Bloom filters trade accuracy for space efficiency.\n\n";
388 cout <<
"Storing " << n <<
" 64-bit integers:\n\n";
394 cout << setw(15) <<
"FP Rate" << setw(15) <<
"Bits/elem"
395 << setw(15) <<
"Total KB" << setw(15) <<
"Savings" <<
endl;
396 cout << string(60,
'-') <<
endl;
398 vector<double>
fp_rates = {0.10, 0.05, 0.01, 0.001, 0.0001};
403 double m = -
static_cast<double>(n) *
log(fp) / (
ln2 *
ln2);
408 cout << setw(14) << fixed << setprecision(4) << (fp * 100) <<
"%"
410 << setw(15) << setprecision(2) <<
bloom_kb
411 << setw(14) << setprecision(1) << (
savings * 100) <<
"%" <<
endl;
414 cout <<
"\nHash set (approximate): "
427 "Bloom filter example for Aleph-w.\n"
428 "Demonstrates probabilistic set membership testing.",
434 "Run only specific section: basic, fp, params, spell, cache, space, or 'all'",
435 false,
"all",
"section",
cmd
443 cout <<
"============================================================\n";
444 cout <<
" ALEPH-W BLOOM FILTER EXAMPLE\n";
445 cout <<
"============================================================\n";
465 cout <<
"\n" << string(60,
'=') <<
"\n";
466 cout <<
"Bloom filter demo completed!\n";
467 cout << string(60,
'=') <<
"\n\n";
471 catch (TCLAP::ArgException& e)
473 cerr <<
"Error: " << e.error() <<
" for argument " << e.argId() <<
endl;
478 cerr <<
"Error: " << e.what() <<
endl;
Probabilistic set membership with Bloom filters.
void print_subsection(const string &title)
void demo_optimal_params()
void print_section(const string &title)
void demo_false_positives()
void demo_space_comparison()
void demo_spell_checker()
Bloom filter implementation.
__gmp_expr< T, __gmp_unary_expr< __gmp_expr< T, U >, __gmp_exp_function > > exp(const __gmp_expr< T, U > &expr)
__gmp_expr< T, __gmp_unary_expr< __gmp_expr< T, U >, __gmp_log_function > > log(const __gmp_expr< T, U > &expr)
__gmp_expr< T, __gmp_unary_expr< __gmp_expr< T, U >, __gmp_ceil_function > > ceil(const __gmp_expr< T, U > &expr)
__gmp_expr< typename __gmp_resolve_expr< T, V >::value_type, __gmp_binary_expr< __gmp_expr< T, U >, __gmp_expr< V, W >, __gmp_pow_function > > pow(const __gmp_expr< T, U > &expr1, const __gmp_expr< V, W > &expr2)
Main namespace for Aleph-w library functions.
and
Check uniqueness with explicit hash + equality functors.
Container2< typename Container1::Item_Type > filter(Container1 &container, Operation &operation)
Filter elements that satisfy operation.
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
static void section(const string &title)
FooMap m(5, fst_unit_pair_hash, snd_unit_pair_hash)