Aleph-w 3.0
A C++ Library for Data Structures and Algorithms
Loading...
Searching...
No Matches
cuckoo-filter.H
Go to the documentation of this file.
1/*
2 Aleph_w
3
4 Data structures & Algorithms
5 version 2.0.0b
6 https://github.com/lrleon/Aleph-w
7
8 This file is part of Aleph-w library
9
10 Copyright (c) 2002-2026 Leandro Rabindranath Leon
11
12 Permission is hereby granted, free of charge, to any person obtaining a copy
13 of this software and associated documentation files (the "Software"), to deal
14 in the Software without restriction, including without limitation the rights
15 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 copies of the Software, and to permit persons to whom the Software is
17 furnished to do so, subject to the following conditions:
18
19 The above copyright notice and this permission notice shall be included in all
20 copies or substantial portions of the Software.
21
22 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 SOFTWARE.
29*/
30
77#ifndef ALEPH_CUCKOO_FILTER_H
78#define ALEPH_CUCKOO_FILTER_H
79
80#include <cassert>
81#include <cstddef>
82#include <cmath>
83#include <optional>
84#include <random>
85#include <algorithm>
86#include <hash-fct.H>
87#include <ah-errors.H>
88#include <ahUtils.H>
89#include <tpl_array.H>
90
91namespace Aleph {
98template <typename T, size_t FingerprintBits = 8, size_t EntriesPerBucket = 4> class Cuckoo_Filter
99{
100 static_assert(FingerprintBits > 0 and FingerprintBits <= 32, "Fingerprint bits must be between 1 and 32");
101 static_assert(EntriesPerBucket > 0, "Entries per bucket must be > 0");
102
104
105 struct Bucket
106 {
108
110 {
111 std::fill(slots, slots + EntriesPerBucket, 0);
112 }
113
114 void clear()
115 {
116 std::fill(slots, slots + EntriesPerBucket, 0);
117 }
118
120 {
121 for (size_t i = 0; i < EntriesPerBucket; ++i)
122 if (slots[i] == 0)
123 {
124 slots[i] = fp;
125 return true;
126 }
127 return false;
128 }
129
130 bool contains(Fingerprint fp) const
131 {
132 for (size_t i = 0; i < EntriesPerBucket; ++i)
133 if (slots[i] == fp)
134 return true;
135 return false;
136 }
137
139 {
140 for (size_t i = 0; i < EntriesPerBucket; ++i)
141 if (slots[i] == fp)
142 {
143 slots[i] = 0;
144 return true;
145 }
146 return false;
147 }
148
149 void restore(Fingerprint fp, size_t idx)
150 {
151 slots[idx] = fp;
152 }
153
154 [[nodiscard]] size_t swap_random_slot(Fingerprint &fp, std::mt19937 &rng)
155 {
156 std::uniform_int_distribution<size_t> dist(0, EntriesPerBucket - 1);
157 const size_t idx = dist(rng);
158 std::swap(fp, slots[idx]);
159 return idx;
160 }
161 };
162
164 struct Victim
165 {
166 size_t index = 0;
168 };
169
171 size_t num_items = 0;
172 size_t num_buckets = 0;
173 size_t bucket_mask = 0;
174 mutable std::mt19937 rng_;
175
176 static constexpr size_t max_kicks = 500;
177 static constexpr size_t stash_size = 4;
178 static constexpr double kTargetLoad = 0.9;
180 size_t stash_count_ = 0;
181
182 // Mask to keep only the desired number of bits for fingerprint
183 static constexpr Fingerprint fp_mask = (FingerprintBits == 32) ? 0xFFFFFFFF : (1U << FingerprintBits) - 1;
184
185 [[nodiscard]] Fingerprint get_fingerprint(const T &item) const
186 {
187 auto fp = static_cast<Fingerprint>(dft_hash_fct(item, 0x12345678) & fp_mask);
188 if (fp == 0)
189 fp = 1; // 0 is reserved for empty slot
190 return fp;
191 }
192
193 [[nodiscard]] size_t index_hash(const T &item) const noexcept
194 {
195 return dft_hash_fct(item) & bucket_mask;
196 }
197
198 [[nodiscard]] size_t alt_index(size_t i, Fingerprint fp) const
199 {
200 // h2(x) = h1(x) XOR hash(fp)
201 // Both operands are in [0, bucket_mask], so the XOR result
202 // stays in the same range, preserving perfect reversibility.
203 return i ^ (murmur3hash(fp, 0x87654321) & bucket_mask);
204 }
205
206public:
216 explicit Cuckoo_Filter(size_t capacity,
217 std::optional<std::uint32_t> seed = std::nullopt)
218 : rng_(seed.has_value() ? std::mt19937(seed.value())
219 : std::mt19937(std::random_device{}()))
220 {
221 const size_t needed_buckets =
222 static_cast<size_t>(std::ceil(capacity / (EntriesPerBucket * kTargetLoad)));
223 // Round up to a power of 2 so that bucket_mask works as a modulo
225 if (num_buckets == 0)
226 num_buckets = 1;
228 buckets.putn(num_buckets);
229 }
230
236 {
237 return num_items;
238 }
239
245 {
246 return num_items == 0;
247 }
248
254 {
255 return empty();
256 }
257
263 {
265 }
266
272 {
273 return 1.0 * num_items / capacity();
274 }
275
283 bool insert(const T &item)
284 {
285 const Fingerprint fp = get_fingerprint(item);
286 size_t i1 = index_hash(item);
287 size_t i2 = alt_index(i1, fp);
288
289 if (buckets(i1).insert(fp))
290 {
291 ++num_items;
292 return true;
293 }
294
295 if (buckets(i2).insert(fp))
296 {
297 ++num_items;
298 return true;
299 }
300
301 // Both buckets full, start kicking
302 std::uniform_int_distribution<int> coin(0, 1);
303 size_t curr_idx = coin(rng_) ? i1 : i2;
304 Fingerprint curr_fp = fp;
305 struct Kick_Record
306 {
307 size_t bucket_idx = 0;
308 size_t slot_idx = 0;
310 };
311 Kick_Record history[max_kicks];
312 size_t history_size = 0;
313
314 for (size_t kick = 0; kick < max_kicks; ++kick)
315 {
316 const size_t slot_idx = buckets(curr_idx).swap_random_slot(curr_fp, rng_);
317 history[history_size++] = {curr_idx, slot_idx, curr_fp};
318 curr_idx = alt_index(curr_idx, curr_fp);
319 if (buckets(curr_idx).insert(curr_fp))
320 {
321 ++num_items;
322 return true;
323 }
324 }
325
326 // Kick chain exhausted — save the displaced fingerprint in the stash
328 {
329 stash_[stash_count_++] = {curr_idx, curr_fp};
330 ++num_items;
331 return true;
332 }
333
334 while (history_size > 0)
335 {
336 const Kick_Record & rec = history[--history_size];
337 buckets(rec.bucket_idx).restore(rec.original_fp, rec.slot_idx);
338 }
339
340 return false; // Filter and stash both full
341 }
342
348 bool append(const T &item)
349 {
350 return insert(item);
351 }
352
360 [[nodiscard]] bool contains(const T &item) const
361 {
362 const Fingerprint fp = get_fingerprint(item);
363 const size_t i1 = index_hash(item);
364 const size_t i2 = alt_index(i1, fp);
365
366 if (buckets(i1).contains(fp) or buckets(i2).contains(fp))
367 return true;
368
369 for (size_t i = 0; i < stash_count_; ++i)
370 if (stash_[i].fp == fp and (stash_[i].index == i1 or stash_[i].index == i2))
371 return true;
372
373 return false;
374 }
375
383 bool remove(const T &item)
384 {
385 const Fingerprint fp = get_fingerprint(item);
386 const size_t i1 = index_hash(item);
387 const size_t i2 = alt_index(i1, fp);
388
389 if (buckets(i1).remove(fp) or buckets(i2).remove(fp))
390 {
391 --num_items;
392 return true;
393 }
394
395 for (size_t i = 0; i < stash_count_; ++i)
396 if (stash_[i].fp == fp and (stash_[i].index == i1 or stash_[i].index == i2))
397 {
398 stash_[i] = stash_[--stash_count_];
399 --num_items;
400 return true;
401 }
402
403 return false;
404 }
405
410 void clear()
411 {
412 for (size_t i = 0; i < num_buckets; ++i)
413 buckets(i).clear();
414 num_items = 0;
415 stash_count_ = 0;
416 }
417};
418} // namespace Aleph
419
420#endif // ALEPH_CUCKOO_FILTER_H
Exception handling system with formatted messages for Aleph-w.
General utility functions and helpers.
Simple dynamic array with automatic resizing and functional operations.
Definition tpl_array.H:139
Industrial-grade Cuckoo Filter implementation.
bool contains(const T &item) const
Test membership.
static constexpr Fingerprint fp_mask
Cuckoo_Filter(size_t capacity, std::optional< std::uint32_t > seed=std::nullopt)
Construct a Cuckoo Filter with a given capacity.
double load_factor() const noexcept
Return the load factor of the filter (relative to capacity()).
void clear()
Clear the filter.
bool empty() const noexcept
Return whether the filter contains no items.
bool is_empty() const noexcept
Synonym for empty() in traditional Aleph style.
Fingerprint get_fingerprint(const T &item) const
bool remove(const T &item)
Remove an item from the filter.
size_t size() const noexcept
Return the number of items currently in the filter.
size_t alt_index(size_t i, Fingerprint fp) const
Aleph::Array< Bucket > buckets
static constexpr size_t stash_size
size_t index_hash(const T &item) const noexcept
static constexpr size_t max_kicks
static constexpr double kTargetLoad
Victim stash_[stash_size]
size_t capacity() const noexcept
Return the total number of fingerprint slots (including stash).
bool insert(const T &item)
Insert an item into the filter.
bool append(const T &item)
Alias for insert().
static mt19937 rng
Main namespace for Aleph-w library functions.
Definition ah-arena.H:89
and
Check uniqueness with explicit hash + equality functors.
unsigned long next_power_of_2(unsigned long x)
In x is not exact power of 2, it returns the next power of 2.
Definition ahUtils.H:234
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
std::decay_t< typename HeadC::Item_Type > T
Definition ah-zip.H:105
size_t dft_hash_fct(const Key &key) noexcept
Primary default hash: best speed/quality trade-off.
Definition hash-fct.H:1003
size_t murmur3hash(const Key &key, std::uint32_t seed)
Definition hash-fct.H:334
STL namespace.
bool remove(Fingerprint fp)
size_t swap_random_slot(Fingerprint &fp, std::mt19937 &rng)
Fingerprint slots[EntriesPerBucket]
bool insert(Fingerprint fp)
void restore(Fingerprint fp, size_t idx)
bool contains(Fingerprint fp) const
Victim entry: a fingerprint displaced during a failed kick chain.
ValueArg< size_t > seed
Definition testHash.C:48
Dynamic array container with automatic resizing.