Aleph-w 3.0
A C++ Library for Data Structures and Algorithms
Loading...
Searching...
No Matches
compact-quotient-filter.H
Go to the documentation of this file.
1
2/*
3 Aleph_w
4
5 Data structures & Algorithms
6 version 2.0.0b
7 https://github.com/lrleon/Aleph-w
8
9 This file is part of Aleph-w library
10
11 Copyright (c) 2002-2026 Leandro Rabindranath Leon
12
13 Permission is hereby granted, free of charge, to any person obtaining a copy
14 of this software and associated documentation files (the "Software"), to deal
15 in the Software without restriction, including without limitation the rights
16 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17 copies of the Software, and to permit persons to whom the Software is
18 furnished to do so, subject to the following conditions:
19
20 The above copyright notice and this permission notice shall be included in all
21 copies or substantial portions of the Software.
22
23 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29 SOFTWARE.
30*/
31
66#ifndef COMPACT_QUOTIENT_FILTER_H
67#define COMPACT_QUOTIENT_FILTER_H
68
69#include <cstdint>
70#include <cstddef>
71#include <cmath>
72#include <algorithm>
73#include <utility>
74
75#include <ah-errors.H>
76#include <bitArray.H>
77#include <hash-fct.H>
78
79namespace Aleph {
80
85template <typename T> class Compact_Quotient_Filter
86{
89 size_t num_slots_;
90 size_t num_elems_;
92
93 // Bit-packed storage: each slot uses (3 + r) bits.
94 // Layout per slot:
95 // bit 0: is_occupied
96 // bit 1: is_continuation
97 // bit 2: is_shifted
98 // bits 3..(r+2): remainder
100
102 {
103 return 3 + r_;
104 }
105
106 [[nodiscard]] size_t slot_offset(const size_t i) const noexcept
107 {
108 return i * bits_per_slot();
109 }
110
112 {
113 return (1ULL << r_) - 1;
114 }
115
116 [[nodiscard]] bool get_bit_at(const size_t slot, const size_t bit_idx) const noexcept
117 {
118 return bits_.read_bit(slot_offset(slot) + bit_idx) != 0;
119 }
120
121 void set_bit_at(const size_t slot, const size_t bit_idx, const bool v) noexcept
122 {
123 bits_.write_bit(slot_offset(slot) + bit_idx, v ? 1 : 0);
124 }
125
126 [[nodiscard]] bool get_occ(const size_t i) const noexcept
127 {
128 return get_bit_at(i, 0);
129 }
130
131 [[nodiscard]] bool get_cont(const size_t i) const noexcept
132 {
133 return get_bit_at(i, 1);
134 }
135
136 [[nodiscard]] bool get_shft(const size_t i) const noexcept
137 {
138 return get_bit_at(i, 2);
139 }
140
141 [[nodiscard]] bool slot_empty(const size_t i) const noexcept
142 {
143 const size_t off = slot_offset(i);
144 for (size_t b = 0; b < bits_per_slot(); ++b)
145 if (bits_.read_bit(off + b))
146 return false;
147 return true;
148 }
149
150 [[nodiscard]] bool has_element(const size_t i) const noexcept
151 {
152 return get_occ(i) or get_cont(i) or get_shft(i);
153 }
154
155 [[nodiscard]] bool is_run_start(const size_t i) const noexcept
156 {
157 return has_element(i) and not get_cont(i);
158 }
159
160 void set_occ(const size_t i, const bool v) noexcept
161 {
162 set_bit_at(i, 0, v);
163 }
164
165 void set_cont(const size_t i, const bool v) noexcept
166 {
167 set_bit_at(i, 1, v);
168 }
169
170 void set_shft(const size_t i, const bool v) noexcept
171 {
172 set_bit_at(i, 2, v);
173 }
174
175 [[nodiscard]] uint64_t get_rem(const size_t i) const noexcept
176 {
177 uint64_t rem = 0;
178 const size_t off = slot_offset(i) + 3;
179 for (size_t b = 0; b < r_; ++b)
180 if (bits_.read_bit(off + b))
181 rem |= (1ULL << b);
182 return rem;
183 }
184
185 void set_rem(const size_t i, const uint64_t rem) noexcept
186 {
187 const size_t off = slot_offset(i) + 3;
188 for (size_t b = 0; b < r_; ++b)
189 bits_.write_bit(off + b, (rem & (1ULL << b)) ? 1 : 0);
190 }
191
192 void set_element(const size_t i, const bool cont, const bool shft, const uint64_t rem) noexcept
193 {
194 const bool occ = get_occ(i);
195 set_cont(i, cont);
196 set_shft(i, shft);
197 set_rem(i, rem);
198 set_occ(i, occ);
199 }
200
201 void clear_element(const size_t i) noexcept
202 {
203 const bool occ = get_occ(i);
204 set_cont(i, false);
205 set_shft(i, false);
206 set_rem(i, 0);
207 set_occ(i, occ);
208 }
209
210 [[nodiscard]] size_t incr(const size_t i) const noexcept
211 {
212 return (i + 1) & (num_slots_ - 1);
213 }
214
215 [[nodiscard]] size_t decr(const size_t i) const noexcept
216 {
217 return (i - 1) & (num_slots_ - 1);
218 }
219
220 [[nodiscard]] std::pair<size_t, uint64_t> fingerprint(const T &item) const
221 {
222 const size_t h = murmur3hash(item, seed_);
223 const size_t fq = h & (num_slots_ - 1);
224 const uint64_t fr = (static_cast<uint64_t>(h) >> q_) & rem_mask();
225 return {fq, fr};
226 }
227
228 [[nodiscard]] size_t find_run_start(const size_t fq) const noexcept
229 {
230 size_t j = fq;
231 while (get_shft(j))
232 j = decr(j);
233
234 size_t i = j;
235 while (j != fq)
236 {
237 do
238 i = incr(i);
239 while (get_cont(i));
240
241 do
242 j = incr(j);
243 while (not get_occ(j));
244 }
245 return i;
246 }
247
248 [[nodiscard]] size_t find_first_unused(const size_t start) const noexcept
249 {
250 size_t i = start;
251 while (has_element(i))
252 i = incr(i);
253 return i;
254 }
255
256 void shift_elements_right(const size_t pos, size_t empty) noexcept
257 {
258 while (empty != pos)
259 {
260 const size_t prev = decr(empty);
261 const bool occ_at_empty = get_occ(empty);
262
263 const bool prev_cont = get_cont(prev);
264 const uint64_t prev_rem = get_rem(prev);
265
266 set_element(empty, prev_cont, true, prev_rem);
267 set_occ(empty, occ_at_empty);
268
269 empty = prev;
270 }
271 }
272
273 bool do_insert(const size_t fq, const uint64_t fr)
274 {
275 if (num_elems_ >= num_slots_)
276 return false;
277
278 if (slot_empty(fq))
279 {
280 set_occ(fq, true);
281 set_rem(fq, fr);
282 ++num_elems_;
283 return true;
284 }
285
286 const bool was_occupied = get_occ(fq);
287 set_occ(fq, true);
288
289 if (was_occupied)
290 {
291 const size_t s = find_run_start(fq);
292 size_t pos = s;
293
294 do
295 {
296 uint64_t cur = get_rem(pos);
297 if (cur == fr)
298 return false;
299 if (cur > fr)
300 break;
301 pos = incr(pos);
302 } while (get_cont(pos));
303
304 const size_t empty = find_first_unused(pos);
305 shift_elements_right(pos, empty);
306
307 const bool at_run_start = (pos == s);
308 set_element(pos, not at_run_start, pos != fq, fr);
309
310 if (at_run_start)
311 set_cont(incr(pos), true);
312
313 ++num_elems_;
314 return true;
315 }
316
317 const size_t s = find_run_start(fq);
318 const size_t empty = find_first_unused(s);
319 shift_elements_right(s, empty);
320
321 set_element(s, false, s != fq, fr);
322 ++num_elems_;
323 return true;
324 }
325
326 void shift_elements_left(const size_t pos) noexcept
327 {
328 size_t curr = pos;
329 size_t nxt = incr(curr);
330
331 while (has_element(nxt) and get_shft(nxt))
332 {
333 const bool occ_at_curr = get_occ(curr);
334
335 const bool nxt_cont = get_cont(nxt);
336 const bool nxt_shft = get_shft(nxt);
337 const uint64_t nxt_rem = get_rem(nxt);
338
340 set_occ(curr, occ_at_curr);
341
342 if (not get_cont(curr) and get_occ(curr))
343 set_shft(curr, false);
344
345 curr = nxt;
346 nxt = incr(nxt);
347 }
348
349 clear_element(curr);
350 }
351
352public:
361 : q_(q), r_(r), num_slots_(0), num_elems_(0), seed_(seed), bits_()
362 {
363 ah_domain_error_if(q < 1 or q > 32) << "Compact_Quotient_Filter: q must be in [1, 32] (got " << static_cast<int>(q)
364 << ")";
365 ah_domain_error_if(r < 1 or r > 60) << "Compact_Quotient_Filter: r must be in [1, 60] (got " << static_cast<int>(r)
366 << ")";
367 ah_domain_error_if(q + r > 64) << "Compact_Quotient_Filter: q + r must be <= 64 (got " << static_cast<int>(q + r)
368 << ")";
369
370 num_slots_ = size_t{1} << q;
371 const size_t total_bits = num_slots_ * bits_per_slot();
373 }
374
385 {
386 ah_domain_error_if(expected_n == 0) << "Compact_Quotient_Filter::from_capacity: expected_n must be > 0";
388 << "Compact_Quotient_Filter::from_capacity: fp_rate must be in (0, 1) (got " << fp_rate << ")";
389
390 const double slots_needed = static_cast<double>(expected_n) / 0.75;
391 const auto q_d = std::max(1.0, std::ceil(std::log2(slots_needed)));
392 const auto r_d = std::max(1.0, std::ceil(-std::log2(fp_rate)));
393
394 int q_v = static_cast<int>(q_d);
395 int r_v = static_cast<int>(r_d);
396
398 << "Compact_Quotient_Filter::from_capacity: requested fp_rate requires r = " << r_v
399 << " remainder bits, which is outside the supported range [1, 60]";
400
401 if (q_v + r_v > 64)
402 q_v = 64 - r_v;
403
405 << "Compact_Quotient_Filter::from_capacity: requested (expected_n, fp_rate) leads to q = " << q_v
406 << ", which is outside the supported range [1, 32]";
407
408 return Compact_Quotient_Filter(static_cast<uint8_t>(q_v), static_cast<uint8_t>(r_v), seed);
409 }
410
419 {
421 << "Compact_Quotient_Filter::insert: filter is full (" << num_elems_ << "/" << num_slots_ << ")";
422
423 auto [fq, fr] = fingerprint(item);
424 do_insert(fq, fr);
425 return *this;
426 }
427
432 [[nodiscard]] bool contains(const T &item) const noexcept
433 {
434 auto [fq, fr] = fingerprint(item);
435
436 if (not get_occ(fq))
437 return false;
438
439 size_t s = find_run_start(fq);
440 do
441 {
442 uint64_t cur = get_rem(s);
443 if (cur == fr)
444 return true;
445 if (cur > fr)
446 return false;
447 s = incr(s);
448 } while (get_cont(s));
449
450 return false;
451 }
452
457 bool remove(const T &item) noexcept
458 {
459 auto [fq, fr] = fingerprint(item);
460
461 if (not get_occ(fq))
462 return false;
463
464 size_t s = find_run_start(fq);
465 size_t pos = s;
466
467 do
468 {
469 if (get_rem(pos) == fr)
470 goto found;
471 if (get_rem(pos) > fr)
472 return false;
473 pos = incr(pos);
474 } while (get_cont(pos));
475 return false;
476
477 found: {
478 const bool was_run_start = not get_cont(pos);
479 const size_t nxt = incr(pos);
480 const bool nxt_is_cont = has_element(nxt) and get_cont(nxt);
481
483 set_cont(nxt, false);
484
486 set_occ(fq, false);
487
489 --num_elems_;
490 return true;
491 }
492 }
493
496 {
497 return num_elems_;
498 }
499
502 {
503 return num_slots_;
504 }
505
508 {
509 return static_cast<double>(num_elems_) / num_slots_;
510 }
511
514 {
515 return num_elems_ == 0;
516 }
517
520 {
521 return q_;
522 }
523
526 {
527 return r_;
528 }
529
532 {
533 return std::ldexp(1.0, -static_cast<int>(r_));
534 }
535
538 {
539 return seed_;
540 }
541
544 {
545 return (num_slots_ * bits_per_slot() + 7) / 8;
546 }
547
550 {
551 const size_t total_bits = num_slots_ * bits_per_slot();
552 for (size_t i = 0; i < total_bits; ++i)
553 bits_.write_bit(i, 0);
554 num_elems_ = 0;
555 }
556
561 {
562 ah_domain_error_if(q_ != other.q_ or r_ != other.r_) << "Compact_Quotient_Filter::merge: parameter mismatch";
563 ah_domain_error_if(seed_ != other.seed_) << "Compact_Quotient_Filter::merge: seed mismatch";
564
565 for (size_t canonical = 0; canonical < other.num_slots_; ++canonical)
566 {
567 if (not other.get_occ(canonical))
568 continue;
569
570 size_t s = other.find_run_start(canonical);
571 do
572 {
573 do_insert(canonical, other.get_rem(s));
574 s = other.incr(s);
575 } while (other.get_cont(s));
576 }
577 }
578
580 [[nodiscard]] static std::pair<uint8_t, uint8_t> estimate(size_t n, double fp_rate)
581 {
582 ah_domain_error_if(n == 0) << "n must be > 0";
583 ah_domain_error_if(fp_rate <= 0.0 or fp_rate >= 1.0) << "fp_rate must be in (0, 1)";
584 const double slots = static_cast<double>(n) / 0.75;
585 auto qv = static_cast<uint8_t>(std::max(1.0, std::ceil(std::log2(slots))));
586 auto rv = static_cast<uint8_t>(std::max(1.0, std::ceil(-std::log2(fp_rate))));
587 if (qv + rv > 64)
588 qv = static_cast<uint8_t>(64 - rv);
589 return {qv, rv};
590 }
591
594 {
595 std::swap(q_, other.q_);
596 std::swap(r_, other.r_);
597 std::swap(num_slots_, other.num_slots_);
598 std::swap(num_elems_, other.num_elems_);
599 std::swap(seed_, other.seed_);
600 bits_.swap(other.bits_);
601 }
602};
603
604} // end namespace Aleph
605
606#endif // COMPACT_QUOTIENT_FILTER_H
Exception handling system with formatted messages for Aleph-w.
#define ah_overflow_error_if(C)
Throws std::overflow_error if condition holds.
Definition ah-errors.H:463
#define ah_domain_error_if(C)
Throws std::domain_error if condition holds.
Definition ah-errors.H:522
Space-efficient bit array implementation.
long double h
Definition btreepic.C:154
Contiguous array of bits.
Definition bitArray.H:189
int read_bit(const size_t i) const
Read bit i.
Definition bitArray.H:377
void write_bit(const size_t i, const unsigned int value)
Write bit i with the value.
Definition bitArray.H:392
void reserve(const size_t dim)
Reserve memory in advance for the bit array dim dimension.
Definition bitArray.H:328
void swap(BitArray &array) noexcept
Definition bitArray.H:485
Memory-optimized quotient filter with bit-packed slots.
bool is_run_start(const size_t i) const noexcept
size_t find_run_start(const size_t fq) const noexcept
void set_occ(const size_t i, const bool v) noexcept
bool get_occ(const size_t i) const noexcept
double load_factor() const noexcept
Current load factor.
size_t find_first_unused(const size_t start) const noexcept
void set_cont(const size_t i, const bool v) noexcept
size_t memory_usage() const noexcept
Memory usage in bytes.
void swap(Compact_Quotient_Filter &other) noexcept
Swap in O(1).
uint32_t seed() const noexcept
Hash seed.
Compact_Quotient_Filter(uint8_t q, uint8_t r, uint32_t seed=0x5F3759DF)
Construct a compact quotient filter.
void clear_element(const size_t i) noexcept
static std::pair< uint8_t, uint8_t > estimate(size_t n, double fp_rate)
Estimate (q, r) for n elements and target FP rate.
uint8_t q() const noexcept
Quotient bits.
size_t capacity() const noexcept
Number of slots (2^q).
void shift_elements_left(const size_t pos) noexcept
void set_bit_at(const size_t slot, const size_t bit_idx, const bool v) noexcept
static Compact_Quotient_Filter from_capacity(size_t expected_n, double fp_rate, uint32_t seed=0x5F3759DF)
Construct from desired capacity and false positive rate.
bool slot_empty(const size_t i) const noexcept
bool is_empty() const noexcept
True if empty.
size_t decr(const size_t i) const noexcept
size_t incr(const size_t i) const noexcept
void clear() noexcept
Remove all elements.
Compact_Quotient_Filter & insert(const T &item)
Insert an element.
bool get_cont(const size_t i) const noexcept
size_t size() const noexcept
Number of elements.
bool get_bit_at(const size_t slot, const size_t bit_idx) const noexcept
void merge(const Compact_Quotient_Filter &other)
Merge another filter (same q, r, seed).
bool remove(const T &item) noexcept
Remove an element.
void set_shft(const size_t i, const bool v) noexcept
uint8_t r() const noexcept
Remainder bits.
std::pair< size_t, uint64_t > fingerprint(const T &item) const
size_t slot_offset(const size_t i) const noexcept
void shift_elements_right(const size_t pos, size_t empty) noexcept
bool do_insert(const size_t fq, const uint64_t fr)
bool contains(const T &item) const noexcept
Test membership.
void set_rem(const size_t i, const uint64_t rem) noexcept
uint64_t get_rem(const size_t i) const noexcept
void set_element(const size_t i, const bool cont, const bool shft, const uint64_t rem) noexcept
bool get_shft(const size_t i) const noexcept
bool has_element(const size_t i) const noexcept
double false_positive_rate() const noexcept
Theoretical FP probability ~ 2^{-r}.
Main namespace for Aleph-w library functions.
Definition ah-arena.H:89
and
Check uniqueness with explicit hash + equality functors.
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
std::decay_t< typename HeadC::Item_Type > T
Definition ah-zip.H:105
size_t murmur3hash(const Key &key, std::uint32_t seed)
Definition hash-fct.H:334