Aleph-w 3.0
A C++ Library for Data Structures and Algorithms
Loading...
Searching...
No Matches
bloom-filter.H
Go to the documentation of this file.
1
2/*
3 Aleph_w
4
5 Data structures & Algorithms
6 version 2.0.0b
7 https://github.com/lrleon/Aleph-w
8
9 This file is part of Aleph-w library
10
11 Copyright (c) 2002-2026 Leandro Rabindranath Leon
12
13 Permission is hereby granted, free of charge, to any person obtaining a copy
14 of this software and associated documentation files (the "Software"), to deal
15 in the Software without restriction, including without limitation the rights
16 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17 copies of the Software, and to permit persons to whom the Software is
18 furnished to do so, subject to the following conditions:
19
20 The above copyright notice and this permission notice shall be included in all
21 copies or substantial portions of the Software.
22
23 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29 SOFTWARE.
30*/
31
32
101# ifndef BLOOM_FILTER_H
102# define BLOOM_FILTER_H
103
104# include <cassert>
105# include <cstddef>
106# include <memory>
107# include <tuple>
108# include <vector>
109# include <limits>
110# include <ctime>
111# include <utility>
112# include <gsl/gsl_rng.h>
113# include <cmath>
114# include <iostream>
115# include <ahFunctional.H>
116# include <bitArray.H>
117# include <hash-fct.H>
118# include <tpl_dynSetHash.H>
119# include <ah-errors.H>
120
121namespace Aleph
122{
123 template <typename T>
138 {
139 public:
140 using Hash_Fct = size_t (*)(const T &, unsigned long seed);
141
142 private:
144
146 size_t num_hash = 0;
147 std::vector<unsigned long> seeds;
148 size_t num_ins = 0;
149
150 bool have_same_hashes(const Bloom_Filter & f) const noexcept
151 {
152 return get_m() == f.get_m() and get_k() == f.get_k() and
153 hash_fct == f.hash_fct and seeds == f.seeds;
154 }
155
156 public:
170 static std::tuple<size_t, size_t> estimate(const size_t n, const double p)
171 {
172 assert(n > 0);
173 assert(p > 0 and p < 1);
174 const double log2 = std::log(2);
175 const auto m = static_cast<size_t>(std::ceil(-(n * std::log(p)) / (log2 * log2)));
176 const auto k = static_cast<size_t>(std::ceil((1.0 * m / n) * log2));
177 return std::make_tuple(m, k);
178 }
179
185 size_t get_m() const noexcept { return bits.size(); }
186
192 size_t get_k() const noexcept { return num_hash; }
193
199 size_t get_n() const noexcept { return num_ins; }
200
207 {
208 size_t x = 0;
209 for (size_t i = 0; i < bits.size(); ++i)
210 if (bits(i) == 1)
211 ++x;
212 return x;
213 }
214
222 size_t size() const noexcept { return get_n(); }
223
231 size_t capacity() const noexcept { return get_m(); }
232
245 Bloom_Filter(const size_t dim, size_t __num_hash,
247 const unsigned long seed = static_cast<unsigned long>(std::time(nullptr)))
250 {
251 ah_invalid_argument_if(dim == 0) << "Bloom filter dimension must be > 0";
252 ah_invalid_argument_if(num_hash == 0) << "Bloom filter number of hashes must be > 0";
253 ah_invalid_argument_if(hash_fct == nullptr) << "Bloom filter hash function is null";
254
256 const std::unique_ptr<gsl_rng, void(*)(gsl_rng *)>
258 ah_bad_alloc_if(r.get() == nullptr);
259 gsl_rng_set(r.get(), seed % gsl_rng_max(r.get()));
260
261 for (auto & s: seeds)
262 s = gsl_rng_get(r.get());
263 }
264
275 Bloom_Filter(const size_t n, const double p,
276 unsigned long seed = static_cast<unsigned long>(std::time(nullptr)),
278 : Bloom_Filter(std::get<0>(estimate(n, p)), std::get<1>(estimate(n, p)),
279 __hash_fct, seed)
280 {
281 // empty
282 }
283
289 void swap(Bloom_Filter & f) noexcept
290 {
291 bits.swap(f.bits);
292 std::swap(hash_fct, f.hash_fct);
293 std::swap(num_hash, f.num_hash);
294 seeds.swap(f.seeds);
295 std::swap(num_ins, f.num_ins);
296 }
297
307 {
308 // empty
309 }
310
317 {
318 swap(f);
319 }
320
326 auto &operator =(const Bloom_Filter & f)
327 {
328 Bloom_Filter tmp(f);
329 swap(tmp);
330 return *this;
331 }
332
338 auto &operator =(Bloom_Filter && f) noexcept
339 {
340 swap(f);
341 return *this;
342 }
343
345 virtual ~Bloom_Filter() = default;
346
353 auto &insert(const T & item) noexcept
354 {
355 const size_t & m = bits.size();
356 for (const auto & seed: seeds)
357 bits.fast_write((*hash_fct)(item, seed) % m, 1);
358
359 ++num_ins;
360
361 return *this;
362 }
363
370 auto &append(const T & item) noexcept { return insert(item); }
371
377 bool contains(const T & item) const noexcept
378 {
379 const size_t & m = bits.size();
380 for (const auto & seed: seeds)
381 if (not bits.fast_read((*hash_fct)(item, seed) % m))
382 return false;
383
384 return true;
385 }
386
393 {
395 for (const auto & seed: seeds)
396 ret.append(seed);
397 return ret;
398 }
399
405 DynList<size_t> hashes(const T & item) const
406 {
408 const size_t & m = bits.size();
409 for (const auto & seed: seeds)
410 ret.append((*hash_fct)(item, seed) % m);
411 return ret;
412 }
413
421 DynList<size_t> common_hashes(const T & i1, const T & i2) const
422 {
423 return intercept(hashes(i1), hashes(i2));
424 }
425
432 {
434 for (size_t i = 0; i < bits.size(); ++i)
435 if (bits(i) == 1)
436 ret.append(i);
437
438 return ret;
439 }
440
450 size_t expected_size(const size_t x) const noexcept
451 {
452 const auto m = static_cast<double>(capacity());
453 const auto k = static_cast<double>(get_k());
454 if (m <= 0 or k <= 0)
455 return 0;
456
457 const double p0 = 1.0 - (static_cast<double>(x) / m);
458 if (p0 <= 0)
459 return std::numeric_limits<size_t>::max();
460
461 const double n = std::ceil(-(m * std::log(p0)) / k);
462 if (not std::isfinite(n) or n < 0)
463 return std::numeric_limits<size_t>::max();
464
465 return static_cast<size_t>(n);
466 }
467
474 {
475 return expected_size(get_x());
476 }
477
484 auto &operator |=(const Bloom_Filter & f)
485 {
487 << "Bloom filter have different hashes";
488
489 bits |= f.bits;
490
491 const auto x = get_x();
492
494
495 return *this;
496 }
497
503 auto &operator &=(const Bloom_Filter & f)
504 {
506 << "Bloom filter have different hashes";
507
508 bits &= f.bits;
509
510 const auto est = expected_size();
511 const auto sum = num_ins + f.num_ins;
512 num_ins = sum >= est ? sum - est : 0;
513
514 return *this;
515 }
516
518 friend Bloom_Filter
520 {
521 Bloom_Filter ret = f1;
522 ret |= f2;
523 return ret;
524 }
525
527 friend Bloom_Filter
529 {
530 Bloom_Filter ret = f1;
531 ret &= f2;
532 return ret;
533 }
534 };
535} // end namespace Aleph
536
537# endif // BLOOM_FILTER_H
Exception handling system with formatted messages for Aleph-w.
#define ah_domain_error_if(C)
Throws std::domain_error if condition holds.
Definition ah-errors.H:522
#define ah_invalid_argument_if(C)
Throws std::invalid_argument if condition holds.
Definition ah-errors.H:639
#define ah_bad_alloc_if(C)
Throws std::bad_alloc if condition holds.
Definition ah-errors.H:429
Functional programming utilities for Aleph-w containers.
Space-efficient bit array implementation.
Contiguous array of bits.
Definition bitArray.H:189
void fast_write(const size_t i, const unsigned int value)
Definition bitArray.H:443
int fast_read(const size_t i) const noexcept
Definition bitArray.H:438
void reserve(const size_t dim)
Reserve memory in advance for the bit array dim dimension.
Definition bitArray.H:328
constexpr size_t size() const noexcept
Returns the dimension of the bit array.
Definition bitArray.H:334
void swap(BitArray &array) noexcept
Definition bitArray.H:485
Bloom filter implementation.
friend Bloom_Filter operator|(const Bloom_Filter &f1, const Bloom_Filter &f2)
Return the union of two compatible Bloom filters.
size_t size() const noexcept
Return the number of insertions.
auto & operator&=(const Bloom_Filter &f)
In-place intersection.
DynList< size_t > common_hashes(const T &i1, const T &i2) const
Return the intersection of hashes(i1) and hashes(i2).
Bloom_Filter(const size_t n, const double p, unsigned long seed=static_cast< unsigned long >(std::time(nullptr)), Hash_Fct __hash_fct=dft_hash_fct)
Construct a Bloom filter using a desired capacity in insertions.
friend Bloom_Filter operator&(const Bloom_Filter &f1, const Bloom_Filter &f2)
Return the intersection of two compatible Bloom filters.
size_t expected_size() const noexcept
Convenience overload.
bool have_same_hashes(const Bloom_Filter &f) const noexcept
static std::tuple< size_t, size_t > estimate(const size_t n, const double p)
Estimate the parameters of a Bloom filter.
size_t expected_size(const size_t x) const noexcept
Estimate the number of inserted items from the number of bits set.
std::vector< unsigned long > seeds
size_t get_n() const noexcept
Return the number of insertions performed on the filter.
DynList< size_t > set_bits() const
Return the indexes of bits set to 1.
bool contains(const T &item) const noexcept
Test membership.
size_t get_m() const noexcept
Return the number of bits of the filter.
auto & operator|=(const Bloom_Filter &f)
In-place union.
size_t(*)(const T &, unsigned long seed) Hash_Fct
Bloom_Filter(const size_t dim, size_t __num_hash, Hash_Fct __hash_fct=dft_hash_fct, const unsigned long seed=static_cast< unsigned long >(std::time(nullptr)))
Construct a Bloom filter with explicit parameters.
DynList< size_t > hash_seeds() const
Return the internally generated per-hash seeds.
virtual ~Bloom_Filter()=default
Destructor.
Bloom_Filter(const Bloom_Filter &f)
Copy constructor.
Bloom_Filter(Bloom_Filter &&f) noexcept
Move constructor.
size_t capacity() const noexcept
Return the capacity in bits.
DynList< size_t > hashes(const T &item) const
Return the bit positions used by item.
void swap(Bloom_Filter &f) noexcept
Swap this with f in O(1).
size_t get_k() const noexcept
Return the number of hash functions used by the filter.
size_t get_x() const noexcept
Return the number of bits set to 1.
auto & operator=(const Bloom_Filter &f)
Copy assignment.
auto & insert(const T &item) noexcept
Insert an item.
auto & append(const T &item) noexcept
Alias of insert().
Dynamic singly linked list with functional programming support.
Definition htlist.H:1423
T & append(const T &item)
Append a new item by copy.
Definition htlist.H:1562
__gmp_expr< T, __gmp_unary_expr< __gmp_expr< T, U >, __gmp_log2_function > > log2(const __gmp_expr< T, U > &expr)
Definition gmpfrxx.h:4064
__gmp_expr< typename __gmp_resolve_expr< T, V >::value_type, __gmp_binary_expr< __gmp_expr< T, U >, __gmp_expr< V, W >, __gmp_dim_function > > dim(const __gmp_expr< T, U > &expr1, const __gmp_expr< V, W > &expr2)
Definition gmpfrxx.h:4052
Collection of general-purpose hash functions.
Main namespace for Aleph-w library functions.
Definition ah-arena.H:89
DynList< T > intercept(const Container< T > &c1, const Container< T > &c2)
std::decay_t< typename HeadC::Item_Type > T
Definition ah-zip.H:107
size_t dft_hash_fct(const Key &key) noexcept
Definition hash-fct.H:931
DynList< T > maps(const C &c, Op op)
Classic map operation.
T sum(const Container &container, const T &init=T{})
Compute sum of all elements.
STL namespace.
Dynamic set implementations based on hash tables.