Aleph-w 3.0
A C++ Library for Data Structures and Algorithms
Loading...
Searching...
No Matches
compact-cuckoo-filter.cc
Go to the documentation of this file.
1
2/*
3 Aleph_w
4
5 Data structures & Algorithms
6 version 2.0.0b
7 https://github.com/lrleon/Aleph-w
8
9 This file is part of Aleph-w library
10
11 Copyright (c) 2002-2026 Leandro Rabindranath Leon
12
13 Permission is hereby granted, free of charge, to any person obtaining a copy
14 of this software and associated documentation files (the "Software"), to deal
15 in the Software without restriction, including without limitation the rights
16 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17 copies of the Software, and to permit persons to whom the Software is
18 furnished to do so, subject to the following conditions:
19
20 The above copyright notice and this permission notice shall be included in all
21 copies or substantial portions of the Software.
22
23 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29 SOFTWARE.
30*/
31
36#include <gtest/gtest.h>
37
38#include <cuckoo-filter.H>
40
41#include <cstddef>
42#include <string>
43#include <set>
44#include <random>
45#include <vector>
46#include <algorithm>
47
48using namespace Aleph;
49using namespace std;
50
51// =====================================================================
52// Construction
53// =====================================================================
54
56{
58 EXPECT_GE(cf.capacity(), 1000u);
59 EXPECT_EQ(cf.size(), 0u);
60 EXPECT_TRUE(cf.is_empty());
61}
62
64{
65 Compact_Cuckoo_Filter<int, 12> cf(1000); // 12-bit fingerprints
66 EXPECT_GE(cf.capacity(), 1000u);
67}
68
70{
71 Compact_Cuckoo_Filter<int, 8, 8> cf(1000); // 8 entries per bucket
72 EXPECT_GE(cf.capacity(), 1000u);
73}
74
76{
77 // Standard: 4 entries × 32 bits = 16 bytes/bucket
78 // Compact with 8-bit FP: 4 entries × 8 bits = 4 bytes/bucket
80
81 size_t num_buckets = cf.capacity() / 4;
82 size_t expected_bytes = (num_buckets * 4 * 8 + 7) / 8; // 4 entries × 8 bits
83
84 EXPECT_LE(cf.memory_usage(), expected_bytes + 100); // Allow small overhead
85
86 // Should be much less than standard (16 bytes/bucket)
87 EXPECT_LT(cf.memory_usage(), num_buckets * 10);
88}
89
90// =====================================================================
91// Basic insert / contains
92// =====================================================================
93
95{
97
98 for (int i = 0; i < 100; ++i)
99 EXPECT_TRUE(cf.insert(i));
100
101 EXPECT_EQ(cf.size(), 100u);
102
103 for (int i = 0; i < 100; ++i)
104 EXPECT_TRUE(cf.contains(i)) << "False negative for i=" << i;
105}
106
108{
110
111 for (int i = 0; i < 100; ++i)
112 cf.insert(i);
113
114 size_t false_positives = 0;
115 for (int i = 1000; i < 2000; ++i)
116 if (cf.contains(i))
118
119 // With 8-bit FP and 4 entries/bucket, FP rate ≈ 3.1%
120 // Testing 1000 elements: expect < 50 false positives
121 EXPECT_LT(false_positives, 60u) << "Too many false positives: " << false_positives;
122}
123
125{
127
128 EXPECT_TRUE(cf.insert(42));
129 EXPECT_EQ(cf.size(), 1u);
130
131 // Cuckoo filter allows duplicates
132 EXPECT_TRUE(cf.insert(42));
133 EXPECT_EQ(cf.size(), 2u);
134
135 EXPECT_TRUE(cf.contains(42));
136}
137
139{
141
142 cf.insert("hello");
143 cf.insert("world");
144 cf.insert("aleph");
145
146 EXPECT_TRUE(cf.contains("hello"));
147 EXPECT_TRUE(cf.contains("world"));
148 EXPECT_TRUE(cf.contains("aleph"));
149
150 EXPECT_EQ(cf.size(), 3u);
151}
152
154{
155 // 4-bit fingerprints → high FP rate but minimal memory
157
158 for (int i = 0; i < 50; ++i)
159 cf.insert(i);
160
161 for (int i = 0; i < 50; ++i)
162 EXPECT_TRUE(cf.contains(i));
163}
164
166{
167 // 16-bit fingerprints → very low FP rate
169
170 for (int i = 0; i < 50; ++i)
171 cf.insert(i);
172
173 for (int i = 0; i < 50; ++i)
174 EXPECT_TRUE(cf.contains(i));
175}
176
177// =====================================================================
178// Deletion
179// =====================================================================
180
182{
184
185 cf.insert(10);
186 cf.insert(20);
187 cf.insert(30);
188 EXPECT_EQ(cf.size(), 3u);
189
190 EXPECT_TRUE(cf.remove(20));
191 EXPECT_EQ(cf.size(), 2u);
192 EXPECT_FALSE(cf.contains(20));
193 EXPECT_TRUE(cf.contains(10));
194 EXPECT_TRUE(cf.contains(30));
195}
196
198{
200
201 cf.insert(1);
202 EXPECT_FALSE(cf.remove(999));
203 EXPECT_EQ(cf.size(), 1u);
204 EXPECT_TRUE(cf.contains(1));
205}
206
208{
210
211 cf.insert(5);
212 cf.insert(5); // duplicate
213 EXPECT_EQ(cf.size(), 2u);
214
215 EXPECT_TRUE(cf.remove(5));
216 EXPECT_EQ(cf.size(), 1u);
217 EXPECT_TRUE(cf.contains(5)); // Still has one copy
218
219 EXPECT_TRUE(cf.remove(5));
220 EXPECT_EQ(cf.size(), 0u);
221 EXPECT_FALSE(cf.contains(5));
222}
223
225{
227
228 cf.insert(5);
229 cf.remove(5);
230 EXPECT_FALSE(cf.contains(5));
231
232 cf.insert(5);
233 EXPECT_TRUE(cf.contains(5));
234 EXPECT_EQ(cf.size(), 1u);
235}
236
238{
240
241 for (int i = 0; i < 100; ++i)
242 cf.insert(i);
243
244 // Remove every other element
245 for (int i = 0; i < 100; i += 2)
246 EXPECT_TRUE(cf.remove(i));
247
248 EXPECT_EQ(cf.size(), 50u);
249
250 // Verify odd elements remain
251 for (int i = 1; i < 100; i += 2)
252 EXPECT_TRUE(cf.contains(i)) << "Lost i=" << i;
253
254 // Verify even elements removed
255 for (int i = 0; i < 100; i += 2)
256 EXPECT_FALSE(cf.contains(i)) << "Still has i=" << i;
257}
258
259// =====================================================================
260// Capacity and load factor
261// =====================================================================
262
264{
266 EXPECT_DOUBLE_EQ(cf.load_factor(), 0.0);
267
268 int inserted = 0;
269 for (int i = 0; i < 50; ++i)
270 if (cf.insert(i))
271 ++inserted;
272
273 double lf = cf.load_factor();
274 EXPECT_GT(lf, 0.3); // At least 30% of capacity
275 EXPECT_LT(lf, 0.6);
276}
277
279{
281
282 std::set<int> inserted_set;
283 // Fill to ~90% capacity
284 for (int i = 0; i < 1000; ++i)
285 if (cf.insert(i))
286 inserted_set.insert(i);
287
288 EXPECT_GT(inserted_set.size(), 800u);
289
290 // All inserted elements should still be found
291 for (int val : inserted_set)
292 EXPECT_TRUE(cf.contains(val)) << "Lost val=" << val;
293}
294
296{
298
299 int successful = 0;
300 for (int i = 0; i < 200; ++i)
301 if (cf.insert(i))
302 ++successful;
303
304 // Should have inserted most but not all
306 EXPECT_LT(successful, 200);
307}
308
309// =====================================================================
310// Introspection
311// =====================================================================
312
314{
316
317 // Should use ~4 bytes/bucket (4 × 8 bits)
318 size_t num_buckets = cf.capacity() / 4;
319 size_t expected = (num_buckets * 4 * 8 + 7) / 8;
320
321 EXPECT_LE(cf.memory_usage(), expected + 100); // Allow small overhead
322}
323
325{
327
328 for (int i = 0; i < 100; ++i)
329 cf.insert(i);
330
331 double lf = cf.load_factor();
332 EXPECT_GT(lf, 0.04);
333 EXPECT_LT(lf, 0.15);
334}
335
336// =====================================================================
337// Copy / Move semantics
338// =====================================================================
339
341{
343 for (int i = 0; i < 30; ++i)
344 cf.insert(i);
345
347 EXPECT_EQ(copy.size(), cf.size());
348 EXPECT_EQ(copy.capacity(), cf.capacity());
349
350 for (int i = 0; i < 30; ++i)
351 EXPECT_TRUE(copy.contains(i));
352}
353
355{
357 for (int i = 0; i < 30; ++i)
358 cf.insert(i);
359
360 size_t orig_size = cf.size();
362
363 EXPECT_EQ(moved.size(), orig_size);
364 for (int i = 0; i < 30; ++i)
365 EXPECT_TRUE(moved.contains(i));
366}
367
368// =====================================================================
369// Clear
370// =====================================================================
371
373{
375 for (int i = 0; i < 50; ++i)
376 cf.insert(i);
377
378 cf.clear();
379 EXPECT_EQ(cf.size(), 0u);
380 EXPECT_TRUE(cf.is_empty());
381
382 for (int i = 0; i < 50; ++i)
383 EXPECT_FALSE(cf.contains(i));
384}
385
387{
389
390 for (int i = 0; i < 20; ++i)
391 cf.insert(i);
392
393 cf.clear();
394
395 for (int i = 100; i < 120; ++i)
396 cf.insert(i);
397
398 EXPECT_EQ(cf.size(), 20u);
399 for (int i = 100; i < 120; ++i)
400 EXPECT_TRUE(cf.contains(i));
401}
402
403// =====================================================================
404// False positive rate (statistical)
405// =====================================================================
406
408{
409 const size_t n = 500;
411
412 for (size_t i = 0; i < n; ++i)
413 cf.insert(static_cast<int>(i));
414
415 size_t test_count = 10000;
416 size_t fps = 0;
417 for (size_t i = n + 10000; i < n + 10000 + test_count; ++i)
418 if (cf.contains(static_cast<int>(i)))
419 ++fps;
420
421 double empirical = static_cast<double>(fps) / test_count;
422
423 // With 8-bit FP and 4 entries/bucket, theoretical FP ≈ 3.1%
424 // Empirical should be reasonably close
425 EXPECT_LT(empirical, 0.10) // < 10%
426 << "Empirical FP rate " << empirical << " too high";
427}
428
430{
432
433 for (int i = 0; i < 200; ++i)
434 cf.insert(i);
435
436 size_t fps = 0;
437 for (int i = 10000; i < 20000; ++i)
438 if (cf.contains(i))
439 ++fps;
440
441 // With 16-bit FP, expect very few false positives
442 EXPECT_LT(fps, 20u);
443}
444
445// =====================================================================
446// Stress tests
447// =====================================================================
448
450{
452 std::mt19937 rng(42);
453 std::uniform_int_distribution<int> dist(0, 1000000);
454
455 std::set<int> inserted;
456
457 // Insert 2000 random elements
458 for (int i = 0; i < 2000; ++i)
459 {
460 int val = dist(rng);
461 if (cf.insert(val))
462 inserted.insert(val);
463 }
464
465 // All inserted elements must be found
466 for (int val : inserted)
467 EXPECT_TRUE(cf.contains(val)) << "Lost value " << val;
468}
469
471{
473
474 for (int cycle = 0; cycle < 5; ++cycle)
475 {
476 cf.clear();
477
478 // Insert 200 elements
479 for (int i = 0; i < 200; ++i)
480 cf.insert(cycle * 1000 + i);
481
482 // Remove half
483 for (int i = 0; i < 100; ++i)
484 cf.remove(cycle * 1000 + i);
485
486 // Verify remaining half
487 for (int i = 100; i < 200; ++i)
488 EXPECT_TRUE(cf.contains(cycle * 1000 + i)) << "Cycle " << cycle << ", lost i=" << i;
489 }
490}
491
493{
495
496 // Insert same element multiple times (some may fail due to cuckoo hashing)
497 int successful_inserts = 0;
498 for (int i = 0; i < 10; ++i)
499 if (cf.insert(42))
501
502 EXPECT_GE(successful_inserts, 5); // At least half should succeed
503 EXPECT_EQ(cf.size(), static_cast<size_t>(successful_inserts));
504 EXPECT_TRUE(cf.contains(42));
505
506 // Remove half
508 for (int i = 0; i < removals; ++i)
509 EXPECT_TRUE(cf.remove(42));
510
511 EXPECT_EQ(cf.size(), static_cast<size_t>(successful_inserts - removals));
513 EXPECT_TRUE(cf.contains(42)); // Still has copies remaining
514}
515
516// =====================================================================
517// Edge cases
518// =====================================================================
519
521{
523
524 cf.insert(1);
525 EXPECT_TRUE(cf.contains(1));
526
527 cf.insert(2);
528 EXPECT_TRUE(cf.contains(1));
529 EXPECT_TRUE(cf.contains(2));
530}
531
533{
534 Compact_Cuckoo_Filter<int, 1> cf(1000); // 1-bit FP
535
536 for (int i = 0; i < 50; ++i)
537 cf.insert(i);
538
539 for (int i = 0; i < 50; ++i)
540 EXPECT_TRUE(cf.contains(i));
541}
542
544{
545 Compact_Cuckoo_Filter<int, 32> cf(1000); // 32-bit FP
546
547 cf.insert(42);
548 EXPECT_TRUE(cf.contains(42));
549 EXPECT_FALSE(cf.contains(43));
550}
551
553{
554 Compact_Cuckoo_Filter<int, 8, 1> cf(1000); // 1 entry/bucket
555
556 for (int i = 0; i < 50; ++i)
557 cf.insert(i);
558
559 for (int i = 0; i < 50; ++i)
560 EXPECT_TRUE(cf.contains(i));
561}
562
563// =====================================================================
564// Comparison with standard Cuckoo_Filter
565// =====================================================================
566
568{
569 Cuckoo_Filter<int> standard(1000);
571
572 std::vector<int> values = {1, 5, 10, 42, 99, 123, 456, 789};
573
574 for (int val : values)
575 {
576 standard.insert(val);
577 compact.insert(val);
578 }
579
580 // Both should find all inserted elements
581 for (int val : values)
582 {
583 EXPECT_TRUE(standard.contains(val));
584 EXPECT_TRUE(compact.contains(val));
585 }
586
587 // Both should have same size
588 EXPECT_EQ(standard.size(), compact.size());
589}
590
592{
594
595 size_t compact_mem = compact.memory_usage();
596
597 // Compact uses ~4 bytes/bucket (4 × 8 bits)
598 // Standard would use ~16 bytes/bucket (4 × 32 bits)
599 size_t num_buckets = compact.capacity() / 4;
600 size_t expected_compact = (num_buckets * 4 * 8 + 7) / 8; // 4 entries × 8 bits
601 size_t expected_standard = num_buckets * 16; // 4 entries × 32 bits
602
603 // Verify compact is close to theoretical minimum
604 EXPECT_LE(compact_mem, expected_compact + 100); // Allow small overhead
605
606 // Verify it's much less than standard would be
607 EXPECT_LT(compact_mem, expected_standard / 3); // < 33% of standard
608}
609
611{
612 // This test just verifies both work correctly under load
613 Cuckoo_Filter<int> standard(5000);
615
616 for (int i = 0; i < 1000; ++i)
617 {
618 standard.insert(i);
619 compact.insert(i);
620 }
621
622 // Both should have similar success rates
623 EXPECT_EQ(standard.size(), 1000u);
624 EXPECT_EQ(compact.size(), 1000u);
625
626 // Both should find all elements
627 for (int i = 0; i < 1000; ++i)
628 {
629 EXPECT_TRUE(standard.contains(i));
630 EXPECT_TRUE(compact.contains(i));
631 }
632}
Memory-optimized Cuckoo Filter with bit-packed fingerprints.
Industrial-grade Cuckoo Filter implementation.
bool contains(const T &item) const
Test membership.
size_t size() const noexcept
Return the number of items currently in the filter.
bool insert(const T &item)
Insert an item into the filter.
Memory-optimized Cuckoo filter with bit-packed fingerprints.
Probabilistic set membership with Cuckoo filters.
#define TEST(name)
static mt19937 rng
Main namespace for Aleph-w library functions.
Definition ah-arena.H:89
Itor2 copy(Itor1 sourceBeg, const Itor1 &sourceEnd, Itor2 destBeg)
Copy elements from one range to another.
Definition ahAlgo.H:584
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
STL namespace.