Aleph-w 3.0
A C++ Library for Data Structures and Algorithms
Loading...
Searching...
No Matches
compact-cuckoo-filter.H
Go to the documentation of this file.
1/*
2 Aleph_w
3
4 Data structures & Algorithms
5 version 2.0.0b
6 https://github.com/lrleon/Aleph-w
7
8 This file is part of Aleph-w library
9
10 Copyright (c) 2002-2026 Leandro Rabindranath Leon
11
12 Permission is hereby granted, free of charge, to any person obtaining a copy
13 of this software and associated documentation files (the "Software"), to deal
14 in the Software without restriction, including without limitation the rights
15 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 copies of the Software, and to permit persons to whom the Software is
17 furnished to do so, subject to the following conditions:
18
19 The above copyright notice and this permission notice shall be included in all
20 copies or substantial portions of the Software.
21
22 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 SOFTWARE.
29*/
30
72#ifndef ALEPH_COMPACT_CUCKOO_FILTER_H
73#define ALEPH_COMPACT_CUCKOO_FILTER_H
74
75#include <cassert>
76#include <cstddef>
77#include <cmath>
78#include <optional>
79#include <random>
80#include <algorithm>
81#include <hash-fct.H>
82#include <ah-errors.H>
83#include <ahUtils.H>
84#include <tpl_array.H>
85#include <bitArray.H>
86
87namespace Aleph {
94template <typename T, size_t FingerprintBits = 8, size_t EntriesPerBucket = 4> class Compact_Cuckoo_Filter
95{
96 static_assert(FingerprintBits > 0 and FingerprintBits <= 32, "Fingerprint bits must be between 1 and 32");
97 static_assert(EntriesPerBucket > 0, "Entries per bucket must be > 0");
98
100
101 struct Bucket
102 {
104
108
109 void clear()
110 {
111 for (size_t i = 0; i < EntriesPerBucket * FingerprintBits; ++i)
112 bits.write_bit(i, 0);
113 }
114
115 [[nodiscard]] Fingerprint get_slot(const size_t idx) const
116 {
117 const size_t bit_offset = idx * FingerprintBits;
118 Fingerprint fp = 0;
119 for (size_t i = 0; i < FingerprintBits; ++i)
120 if (bits.read_bit(bit_offset + i))
121 fp |= (1U << i);
122 return fp;
123 }
124
125 void set_slot(size_t idx, Fingerprint fp)
126 {
127 const size_t bit_offset = idx * FingerprintBits;
128 for (size_t i = 0; i < FingerprintBits; ++i)
129 {
130 if (fp & (1U << i))
131 bits.write_bit(bit_offset + i, 1);
132 else
133 bits.write_bit(bit_offset + i, 0);
134 }
135 }
136
138 {
139 for (size_t i = 0; i < EntriesPerBucket; ++i)
140 if (get_slot(i) == 0)
141 {
142 set_slot(i, fp);
143 return true;
144 }
145 return false;
146 }
147
148 [[nodiscard]] bool contains(const Fingerprint fp) const
149 {
150 for (size_t i = 0; i < EntriesPerBucket; ++i)
151 if (get_slot(i) == fp)
152 return true;
153 return false;
154 }
155
157 {
158 for (size_t i = 0; i < EntriesPerBucket; ++i)
159 if (get_slot(i) == fp)
160 {
161 set_slot(i, 0);
162 return true;
163 }
164 return false;
165 }
166
167 void restore(Fingerprint fp, size_t idx)
168 {
169 set_slot(idx, fp);
170 }
171
172 [[nodiscard]] size_t swap_random_slot(Fingerprint &fp, std::mt19937 &rng)
173 {
174 std::uniform_int_distribution<size_t> dist(0, EntriesPerBucket - 1);
175 const size_t idx = dist(rng);
176 const Fingerprint old_fp = get_slot(idx);
177 set_slot(idx, fp);
178 fp = old_fp;
179 return idx;
180 }
181 };
182
184 struct Victim
185 {
186 size_t index = 0;
188 };
189
191 size_t num_items = 0;
192 size_t num_buckets = 0;
193 size_t bucket_mask = 0;
194 mutable std::mt19937 rng_;
195
196 static constexpr size_t max_kicks = 500;
197 static constexpr size_t stash_size = 4;
198 static constexpr double kTargetLoad = 0.9;
200 size_t stash_count_ = 0;
201
202 // Mask to keep only the desired number of bits for fingerprint
203 static constexpr Fingerprint fp_mask = (FingerprintBits == 32) ? 0xFFFFFFFF : (1U << FingerprintBits) - 1;
204
205 [[nodiscard]] Fingerprint get_fingerprint(const T &item) const
206 {
207 auto fp = static_cast<Fingerprint>(dft_hash_fct(item, 0x12345678) & fp_mask);
208 if (fp == 0)
209 fp = 1; // 0 is reserved for empty slot
210 return fp;
211 }
212
213 [[nodiscard]] size_t index_hash(const T &item) const noexcept
214 {
215 return dft_hash_fct(item) & bucket_mask;
216 }
217
218 [[nodiscard]] size_t alt_index(const size_t i, Fingerprint fp) const
219 {
220 // h2(x) = h1(x) XOR hash(fp)
221 // Both operands are in [0, bucket_mask], so the XOR result
222 // stays in the same range, preserving perfect reversibility.
223 return i ^ (murmur3hash(fp, 0x87654321) & bucket_mask);
224 }
225
226public:
236 explicit Compact_Cuckoo_Filter(size_t capacity, const std::optional<std::uint32_t> seed = std::nullopt)
237 : rng_(seed.has_value() ? std::mt19937(seed.value())
238 : std::mt19937(std::random_device{}()))
239 {
240 const size_t needed_buckets =
241 static_cast<size_t>(std::ceil(capacity / (EntriesPerBucket * kTargetLoad)));
242 // Round up to a power of 2 so that bucket_mask works as a modulo
244 if (num_buckets == 0)
245 num_buckets = 1;
247 buckets.putn(num_buckets);
248 }
249
255 {
256 return num_items;
257 }
258
264 {
265 return num_items == 0;
266 }
267
273 {
274 return empty();
275 }
276
282 {
284 }
285
291 {
292 return 1.0 * num_items / capacity();
293 }
294
300 {
301 // Each bucket uses FingerprintBits * EntriesPerBucket bits
303 const size_t bytes_per_bucket = (bits_per_bucket + 7) / 8;
305 }
306
314 bool insert(const T &item)
315 {
316 const Fingerprint fp = get_fingerprint(item);
317 size_t i1 = index_hash(item);
318 size_t i2 = alt_index(i1, fp);
319
320 if (buckets(i1).insert(fp))
321 {
322 ++num_items;
323 return true;
324 }
325
326 if (buckets(i2).insert(fp))
327 {
328 ++num_items;
329 return true;
330 }
331
332 // Both buckets full, start kicking
333 std::uniform_int_distribution<int> coin(0, 1);
334 size_t curr_idx = coin(rng_) ? i1 : i2;
335 Fingerprint curr_fp = fp;
336 struct Kick_Record
337 {
338 size_t bucket_idx = 0;
339 size_t slot_idx = 0;
341 };
342 Kick_Record history[max_kicks];
343 size_t history_size = 0;
344
345 for (size_t kick = 0; kick < max_kicks; ++kick)
346 {
347 const size_t slot_idx = buckets(curr_idx).swap_random_slot(curr_fp, rng_);
348 history[history_size++] = {curr_idx, slot_idx, curr_fp};
349 curr_idx = alt_index(curr_idx, curr_fp);
350 if (buckets(curr_idx).insert(curr_fp))
351 {
352 ++num_items;
353 return true;
354 }
355 }
356
357 // Kick chain exhausted — save the displaced fingerprint in the stash
359 {
360 stash_[stash_count_++] = {curr_idx, curr_fp};
361 ++num_items;
362 return true;
363 }
364
365 while (history_size > 0)
366 {
367 const Kick_Record & rec = history[--history_size];
368 buckets(rec.bucket_idx).restore(rec.original_fp, rec.slot_idx);
369 }
370
371 return false; // Filter and stash both full
372 }
373
379 bool append(const T &item)
380 {
381 return insert(item);
382 }
383
391 [[nodiscard]] bool contains(const T &item) const
392 {
393 const Fingerprint fp = get_fingerprint(item);
394 const size_t i1 = index_hash(item);
395 const size_t i2 = alt_index(i1, fp);
396
397 if (buckets(i1).contains(fp) or buckets(i2).contains(fp))
398 return true;
399
400 for (size_t i = 0; i < stash_count_; ++i)
401 if (stash_[i].fp == fp and (stash_[i].index == i1 or stash_[i].index == i2))
402 return true;
403
404 return false;
405 }
406
414 bool remove(const T &item)
415 {
416 const Fingerprint fp = get_fingerprint(item);
417 const size_t i1 = index_hash(item);
418 const size_t i2 = alt_index(i1, fp);
419
420 if (buckets(i1).remove(fp) or buckets(i2).remove(fp))
421 {
422 --num_items;
423 return true;
424 }
425
426 // Check the stash
427 for (size_t i = 0; i < stash_count_; ++i)
428 if (stash_[i].fp == fp and (stash_[i].index == i1 or stash_[i].index == i2))
429 {
430 stash_[i] = stash_[--stash_count_]; // swap with last
431 --num_items;
432 return true;
433 }
434
435 return false;
436 }
437
442 void clear()
443 {
444 for (size_t i = 0; i < num_buckets; ++i)
445 buckets(i).clear();
446 num_items = 0;
447 stash_count_ = 0;
448 }
449};
450} // namespace Aleph
451
452#endif // ALEPH_COMPACT_CUCKOO_FILTER_H
Exception handling system with formatted messages for Aleph-w.
General utility functions and helpers.
Space-efficient bit array implementation.
Simple dynamic array with automatic resizing and functional operations.
Definition tpl_array.H:139
Contiguous array of bits.
Definition bitArray.H:189
int read_bit(const size_t i) const
Read bit i.
Definition bitArray.H:377
void write_bit(const size_t i, const unsigned int value)
Write bit i with the value.
Definition bitArray.H:392
Memory-optimized Cuckoo Filter with bit-packed fingerprints.
bool append(const T &item)
Alias for insert().
bool contains(const T &item) const
Test membership.
static constexpr double kTargetLoad
bool is_empty() const noexcept
Synonym for empty() in traditional Aleph style.
size_t size() const noexcept
Return the number of items currently in the filter.
bool remove(const T &item)
Remove an item from the filter.
bool insert(const T &item)
Insert an item into the filter.
double load_factor() const noexcept
Return the load factor of the filter (relative to capacity()).
Compact_Cuckoo_Filter(size_t capacity, const std::optional< std::uint32_t > seed=std::nullopt)
Construct a Compact Cuckoo Filter with a given capacity.
static constexpr size_t stash_size
size_t alt_index(const size_t i, Fingerprint fp) const
static constexpr size_t max_kicks
size_t memory_usage() const noexcept
Return approximate memory usage in bytes.
static constexpr Fingerprint fp_mask
bool empty() const noexcept
Return whether the filter contains no items.
size_t capacity() const noexcept
Return the total number of fingerprint slots (including stash).
Fingerprint get_fingerprint(const T &item) const
size_t index_hash(const T &item) const noexcept
static mt19937 rng
Main namespace for Aleph-w library functions.
Definition ah-arena.H:89
and
Check uniqueness with explicit hash + equality functors.
unsigned long next_power_of_2(unsigned long x)
In x is not exact power of 2, it returns the next power of 2.
Definition ahUtils.H:234
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
std::decay_t< typename HeadC::Item_Type > T
Definition ah-zip.H:105
size_t dft_hash_fct(const Key &key) noexcept
Primary default hash: best speed/quality trade-off.
Definition hash-fct.H:1003
size_t murmur3hash(const Key &key, std::uint32_t seed)
Definition hash-fct.H:334
STL namespace.
Fingerprint get_slot(const size_t idx) const
size_t swap_random_slot(Fingerprint &fp, std::mt19937 &rng)
void restore(Fingerprint fp, size_t idx)
bool contains(const Fingerprint fp) const
void set_slot(size_t idx, Fingerprint fp)
Victim entry: a fingerprint that was displaced during a failed insert.
ValueArg< size_t > seed
Definition testHash.C:48
Dynamic array container with automatic resizing.