Aleph-w 3.0
A C++ Library for Data Structures and Algorithms
Loading...
Searching...
No Matches
bloom-filter.H
Go to the documentation of this file.
1
2/*
3 Aleph_w
4
5 Data structures & Algorithms
6 version 2.0.0b
7 https://github.com/lrleon/Aleph-w
8
9 This file is part of Aleph-w library
10
11 Copyright (c) 2002-2026 Leandro Rabindranath Leon
12
13 Permission is hereby granted, free of charge, to any person obtaining a copy
14 of this software and associated documentation files (the "Software"), to deal
15 in the Software without restriction, including without limitation the rights
16 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17 copies of the Software, and to permit persons to whom the Software is
18 furnished to do so, subject to the following conditions:
19
20 The above copyright notice and this permission notice shall be included in all
21 copies or substantial portions of the Software.
22
23 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29 SOFTWARE.
30*/
31
32
101# ifndef BLOOM_FILTER_H
102# define BLOOM_FILTER_H
103
104# include <cstddef>
105# include <memory>
106# include <tuple>
107# include <vector>
108# include <limits>
109# include <ctime>
110# include <gsl/gsl_rng.h>
111# include <cmath>
112# include <ahFunctional.H>
113# include <bitArray.H>
114# include <hash-fct.H>
115# include <tpl_dynSetHash.H>
116# include <ah-errors.H>
117
118namespace Aleph
119{
133 template <typename T>
135 {
136 public:
137 using Hash_Fct = size_t (*)(const T &, unsigned long seed);
138
139 private:
141
143 size_t num_hash = 0;
144 std::vector<unsigned long> seeds;
145 size_t num_ins = 0;
146
147 bool have_same_hashes(const Bloom_Filter & f) const noexcept
148 {
149 return get_m() == f.get_m() and get_k() == f.get_k() and
150 hash_fct == f.hash_fct and seeds == f.seeds;
151 }
152
154 Bloom_Filter(std::tuple<size_t, size_t> mk,
155 Hash_Fct hash_fn, unsigned long seed)
156 : Bloom_Filter(std::get<0>(mk), std::get<1>(mk), hash_fn, seed)
157 {
158 }
159
160 public:
174 [[nodiscard]] static std::tuple<size_t, size_t> estimate(const size_t n, const double p)
175 {
176 ah_invalid_argument_if(n == 0) << "n must be > 0";
177 ah_invalid_argument_if(p <= 0 or p >= 1) << "p must be in (0, 1)";
178 const double log2 = std::log(2);
179 const auto m = static_cast<size_t>(std::ceil(-(n * std::log(p)) / (log2 * log2)));
180 const auto k = static_cast<size_t>(std::ceil((1.0 * m / n) * log2));
181 return std::make_tuple(m, k);
182 }
183
189 [[nodiscard]] size_t get_m() const noexcept { return bits.size(); }
190
196 [[nodiscard]] size_t get_k() const noexcept { return num_hash; }
197
203 [[nodiscard]] size_t get_n() const noexcept { return num_ins; }
204
211 {
212 size_t x = 0;
213 for (size_t i = 0; i < bits.size(); ++i)
214 if (bits(i) == 1)
215 ++x;
216 return x;
217 }
218
226 [[nodiscard]] size_t size() const noexcept { return get_n(); }
227
235 [[nodiscard]] size_t capacity() const noexcept { return get_m(); }
236
249 Bloom_Filter(const size_t dim, size_t num_hashes,
251 const unsigned long seed = static_cast<unsigned long>(std::time(nullptr)))
254 {
255 ah_invalid_argument_if(dim == 0) << "Bloom filter dimension must be > 0";
256 ah_invalid_argument_if(num_hash == 0) << "Bloom filter number of hashes must be > 0";
257 ah_invalid_argument_if(hash_fct == nullptr) << "Bloom filter hash function is null";
258
260 const std::unique_ptr<gsl_rng, void(*)(gsl_rng *)>
262 ah_bad_alloc_if(r.get() == nullptr);
263 gsl_rng_set(r.get(), seed % gsl_rng_max(r.get()));
264
265 for (auto & s: seeds)
266 s = gsl_rng_get(r.get());
267 }
268
279 Bloom_Filter(const size_t n, const double p,
280 unsigned long seed = static_cast<unsigned long>(std::time(nullptr)),
283 {
284 }
285
291 void swap(Bloom_Filter & f) noexcept
292 {
293 bits.swap(f.bits);
294 std::swap(hash_fct, f.hash_fct);
295 std::swap(num_hash, f.num_hash);
296 seeds.swap(f.seeds);
297 std::swap(num_ins, f.num_ins);
298 }
299
309 {
310 // empty
311 }
312
319 {
320 swap(f);
321 }
322
328 auto &operator =(const Bloom_Filter & f)
329 {
330 Bloom_Filter tmp(f);
331 swap(tmp);
332 return *this;
333 }
334
340 auto &operator =(Bloom_Filter && f) noexcept
341 {
342 swap(f);
343 return *this;
344 }
345
347 ~Bloom_Filter() = default;
348
355 auto &insert(const T & item)
356 {
357 const size_t m = bits.size();
358 for (const auto & seed: seeds)
359 bits.fast_write((*hash_fct)(item, seed) % m, 1);
360
361 ++num_ins;
362
363 return *this;
364 }
365
372 auto &append(const T & item) { return insert(item); }
373
379 [[nodiscard]] bool contains(const T & item) const
380 {
381 const size_t m = bits.size();
382 for (const auto & seed: seeds)
383 if (not bits.fast_read((*hash_fct)(item, seed) % m))
384 return false;
385
386 return true;
387 }
388
395 {
397 for (const auto & seed: seeds)
398 ret.append(seed);
399 return ret;
400 }
401
407 [[nodiscard]] DynList<size_t> hashes(const T & item) const
408 {
410 const size_t m = bits.size();
411 for (const auto & seed: seeds)
412 ret.append((*hash_fct)(item, seed) % m);
413 return ret;
414 }
415
423 [[nodiscard]] DynList<size_t> common_hashes(const T & i1, const T & i2) const
424 {
425 return intercept(hashes(i1), hashes(i2));
426 }
427
434 {
436 for (size_t i = 0; i < bits.size(); ++i)
437 if (bits(i) == 1)
438 ret.append(i);
439
440 return ret;
441 }
442
452 [[nodiscard]] size_t expected_size(const size_t x) const noexcept
453 {
454 const auto m = static_cast<double>(capacity());
455 const auto k = static_cast<double>(get_k());
456 if (m == 0 or k == 0)
457 return 0;
458
459 const double p0 = 1.0 - (static_cast<double>(x) / m);
460 if (p0 <= 0)
461 return std::numeric_limits<size_t>::max();
462
463 const double n = std::ceil(-(m * std::log(p0)) / k);
464 if (not std::isfinite(n) or n < 0)
465 return std::numeric_limits<size_t>::max();
466
467 return static_cast<size_t>(n);
468 }
469
476 {
477 return expected_size(get_x());
478 }
479
486 auto &operator |=(const Bloom_Filter & f)
487 {
489 << "Bloom filter have different hashes";
490
491 bits |= f.bits;
492
493 const auto x = get_x();
494
496
497 return *this;
498 }
499
505 auto &operator &=(const Bloom_Filter & f)
506 {
508 << "Bloom filter have different hashes";
509
510 bits &= f.bits;
511
513
514 return *this;
515 }
516
518 friend Bloom_Filter
519 operator |(const Bloom_Filter & f1, const Bloom_Filter & f2)
520 {
521 Bloom_Filter ret = f1;
522 ret |= f2;
523 return ret;
524 }
525
527 friend Bloom_Filter
528 operator &(const Bloom_Filter & f1, const Bloom_Filter & f2)
529 {
530 Bloom_Filter ret = f1;
531 ret &= f2;
532 return ret;
533 }
534 };
535} // end namespace Aleph
536
537# endif // BLOOM_FILTER_H
Exception handling system with formatted messages for Aleph-w.
#define ah_domain_error_if(C)
Throws std::domain_error if condition holds.
Definition ah-errors.H:522
#define ah_invalid_argument_if(C)
Throws std::invalid_argument if condition holds.
Definition ah-errors.H:639
#define ah_bad_alloc_if(C)
Throws std::bad_alloc if condition holds.
Definition ah-errors.H:429
Functional programming utilities for Aleph-w containers.
Space-efficient bit array implementation.
Contiguous array of bits.
Definition bitArray.H:189
void fast_write(const size_t i, const unsigned int value)
Definition bitArray.H:443
int fast_read(const size_t i) const noexcept
Definition bitArray.H:438
void reserve(const size_t dim)
Reserve memory in advance for the bit array dim dimension.
Definition bitArray.H:328
constexpr size_t size() const noexcept
Returns the dimension of the bit array.
Definition bitArray.H:334
void swap(BitArray &array) noexcept
Definition bitArray.H:485
Bloom filter implementation.
friend Bloom_Filter operator|(const Bloom_Filter &f1, const Bloom_Filter &f2)
Return the union of two compatible Bloom filters.
~Bloom_Filter()=default
Destructor.
size_t size() const noexcept
Return the number of insertions.
auto & operator&=(const Bloom_Filter &f)
In-place intersection.
DynList< size_t > common_hashes(const T &i1, const T &i2) const
Return the intersection of hashes(i1) and hashes(i2).
Bloom_Filter(const size_t n, const double p, unsigned long seed=static_cast< unsigned long >(std::time(nullptr)), Hash_Fct hash_fn=dft_seed_hash_ptr_fct< T >)
Construct a Bloom filter using a desired capacity in insertions.
friend Bloom_Filter operator&(const Bloom_Filter &f1, const Bloom_Filter &f2)
Return the intersection of two compatible Bloom filters.
size_t expected_size() const noexcept
Convenience overload.
bool have_same_hashes(const Bloom_Filter &f) const noexcept
static std::tuple< size_t, size_t > estimate(const size_t n, const double p)
Estimate the parameters of a Bloom filter.
size_t expected_size(const size_t x) const noexcept
Estimate the number of inserted items from the number of bits set.
auto & append(const T &item)
Alias of insert().
std::vector< unsigned long > seeds
Bloom_Filter(const size_t dim, size_t num_hashes, Hash_Fct hash_fn=dft_seed_hash_ptr_fct< T >, const unsigned long seed=static_cast< unsigned long >(std::time(nullptr)))
Construct a Bloom filter with explicit parameters.
size_t get_n() const noexcept
Return the number of insertions performed on the filter.
DynList< size_t > set_bits() const
Return the indexes of bits set to 1.
size_t get_m() const noexcept
Return the number of bits of the filter.
auto & operator|=(const Bloom_Filter &f)
In-place union.
size_t(*)(const T &, unsigned long seed) Hash_Fct
bool contains(const T &item) const
Test membership.
DynList< size_t > hash_seeds() const
Return the internally generated per-hash seeds.
auto & insert(const T &item)
Insert an item.
Bloom_Filter(std::tuple< size_t, size_t > mk, Hash_Fct hash_fn, unsigned long seed)
Private delegating constructor to avoid calling estimate() twice.
Bloom_Filter(const Bloom_Filter &f)
Copy constructor.
Bloom_Filter(Bloom_Filter &&f) noexcept
Move constructor.
size_t capacity() const noexcept
Return the capacity in bits.
DynList< size_t > hashes(const T &item) const
Return the bit positions used by item.
void swap(Bloom_Filter &f) noexcept
Swap this with f in O(1).
size_t get_k() const noexcept
Return the number of hash functions used by the filter.
size_t get_x() const noexcept
Return the number of bits set to 1.
auto & operator=(const Bloom_Filter &f)
Copy assignment.
Dynamic singly linked list with functional programming support.
Definition htlist.H:1155
T & append(const T &item)
Definition htlist.H:1271
__gmp_expr< T, __gmp_unary_expr< __gmp_expr< T, U >, __gmp_log2_function > > log2(const __gmp_expr< T, U > &expr)
Definition gmpfrxx.h:4064
__gmp_expr< typename __gmp_resolve_expr< T, V >::value_type, __gmp_binary_expr< __gmp_expr< T, U >, __gmp_expr< V, W >, __gmp_dim_function > > dim(const __gmp_expr< T, U > &expr1, const __gmp_expr< V, W > &expr2)
Definition gmpfrxx.h:4052
Main namespace for Aleph-w library functions.
Definition ah-arena.H:89
and
Check uniqueness with explicit hash + equality functors.
DynList< T > intercept(const Container< T > &c1, const Container< T > &c2)
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
std::decay_t< typename HeadC::Item_Type > T
Definition ah-zip.H:105
STL namespace.
FooMap m(5, fst_unit_pair_hash, snd_unit_pair_hash)
ValueArg< size_t > seed
Definition testHash.C:48
static int * k
gsl_rng * r
Dynamic set implementations based on hash tables.