Aleph-w 3.0
A C++ Library for Data Structures and Algorithms
Loading...
Searching...
No Matches
count-min-sketch.H
Go to the documentation of this file.
1/*
2 Aleph_w
3
4 Data structures & Algorithms
5 version 2.0.0b
6 https://github.com/lrleon/Aleph-w
7
8 This file is part of Aleph-w library
9
10 Copyright (c) 2002-2026 Leandro Rabindranath Leon
11
12 Permission is hereby granted, free of charge, to any person obtaining a copy
13 of this software and associated documentation files (the "Software"), to deal
14 in the Software without restriction, including without limitation the rights
15 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 copies of the Software, and to permit persons to whom the Software is
17 furnished to do so, subject to the following conditions:
18
19 The above copyright notice and this permission notice shall be included in all
20 copies or substantial portions of the Software.
21
22 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 SOFTWARE.
29*/
30
31# ifndef COUNT_MIN_SKETCH_H
32# define COUNT_MIN_SKETCH_H
33
44# include <cmath>
45# include <algorithm>
46# include <concepts>
47# include <cstdint>
48# include <limits>
49
50# include <ah-errors.H>
51# include <tpl_array.H>
52# include <hash-fct.H>
53
54namespace Aleph
55{
56
69 template <typename T, std::unsigned_integral CounterT = size_t>
71 {
72 size_t w_;
73 size_t d_;
78 public:
83 Count_Min_Sketch(const size_t w, const size_t d)
84 : w_(w), d_(d)
85 {
86 ah_domain_error_if(w == 0 or d == 0)
87 << "Count_Min_Sketch: dimensions must be positive";
88
89 ah_domain_error_if(w > std::numeric_limits<size_t>::max() / d)
90 << "Count_Min_Sketch: product of dimensions w*d overflows size_t (got "
91 << w << " * " << d << ")";
92
95
96 for (size_t i = 0; i < d_; ++i)
97 seeds_.append(static_cast<uint32_t>(dft_hash_fct(i, 0x9e3779b9)));
98 }
99
110 static Count_Min_Sketch from_error_bounds(const double epsilon, const double delta)
111 {
112 ah_domain_error_if(epsilon <= 0.0 or not std::isfinite(epsilon))
113 << "Count_Min_Sketch::from_error_bounds: epsilon must be positive and finite (got " << epsilon << ")";
114 ah_domain_error_if(delta <= 0.0 or delta >= 1.0 or not std::isfinite(delta))
115 << "Count_Min_Sketch::from_error_bounds: delta must be in (0, 1) and finite (got " << delta << ")";
116
117 const double w_dbl = std::ceil(std::exp(1.0) / epsilon);
118 const double d_dbl = std::ceil(std::log(1.0 / delta));
119 const double max_sz = static_cast<double>(std::numeric_limits<size_t>::max());
120
122 << "Count_Min_Sketch::from_error_bounds: calculated dimensions exceed size_t limits";
123
124 return Count_Min_Sketch(static_cast<size_t>(w_dbl), static_cast<size_t>(d_dbl));
125 }
126
131 void update(const T & val, CounterT count = 1)
132 {
133 if (count == 0) return;
134
135 constexpr CounterT max_val = std::numeric_limits<CounterT>::max();
136
137 for (size_t i = 0; i < d_; ++i)
138 {
139 const size_t h = murmur3hash(val, seeds_[i]) % w_;
140 const size_t idx = i * w_ + h;
141 if (max_val - table_[idx] < count)
142 table_(idx) = max_val;
143 else
144 table_(idx) += count;
145 }
146
147 if (max_val - total_count_ < count)
148 total_count_ = max_val;
149 else
151 }
152
156 [[nodiscard]] CounterT estimate(const T & val) const
157 {
158 CounterT min_val = std::numeric_limits<CounterT>::max();
159 for (size_t i = 0; i < d_; ++i)
160 {
161 const size_t h = murmur3hash(val, seeds_[i]) % w_;
162 min_val = std::min(min_val, table_[i * w_ + h]);
163 }
164 return min_val;
165 }
166
171 {
172 ah_domain_error_if(w_ != other.w_ or d_ != other.d_)
173 << "Count_Min_Sketch::merge: dimension mismatch";
174
175 for (size_t i = 0; i < d_; ++i)
176 ah_domain_error_if(seeds_(i) != other.seeds_[i])
177 << "Count_Min_Sketch::merge: hash seed mismatch at index " << i;
178
179 constexpr CounterT max_val = std::numeric_limits<CounterT>::max();
180
181 for (size_t i = 0; i < table_.size(); ++i)
182 {
183 if (max_val - table_[i] < other.table_[i])
184 table_(i) = max_val;
185 else
186 table_(i) += other.table_[i];
187 }
188
189 if (max_val - total_count_ < other.total_count_)
190 total_count_ = max_val;
191 else
192 total_count_ += other.total_count_;
193 }
194
196 void clear()
197 {
198 for (size_t i = 0; i < table_.size(); ++i)
199 table_(i) = 0;
200 total_count_ = 0;
201 }
202
208 [[nodiscard]] size_t width() const noexcept { return w_; }
209
215 [[nodiscard]] size_t depth() const noexcept { return d_; }
216
223 };
224
225} // namespace Aleph
226
227# endif // COUNT_MIN_SKETCH_H
Exception handling system with formatted messages for Aleph-w.
#define ah_domain_error_if(C)
Throws std::domain_error if condition holds.
Definition ah-errors.H:522
long double h
Definition btreepic.C:154
long double w
Definition btreepic.C:153
Simple dynamic array with automatic resizing and functional operations.
Definition tpl_array.H:139
T & append(const T &data)
Append a copy of data
Definition tpl_array.H:245
void reserve(size_t cap)
Reserves cap cells into the array.
Definition tpl_array.H:315
Count-Min Sketch frequency estimator.
void merge(const Count_Min_Sketch &other)
Merge another sketch into this one.
size_t w_
Width of the sketch.
size_t d_
Depth (number of hash functions).
size_t depth() const noexcept
Number of hash functions (independent counter rows).
Array< CounterT > table_
Flat array for counters [d_ x w_].
CounterT estimate(const T &val) const
Estimate frequency of an element.
CounterT total_count() const noexcept
Sum of all event weights inserted so far.
void clear()
Clear all counters.
static Count_Min_Sketch from_error_bounds(const double epsilon, const double delta)
Create a sketch with target error bounds.
void update(const T &val, CounterT count=1)
Increment count for an element.
size_t width() const noexcept
Number of counter columns (hash buckets per row).
CounterT total_count_
Total updates processed.
Count_Min_Sketch(const size_t w, const size_t d)
Construct with explicit dimensions.
Array< uint32_t > seeds_
Random seeds for hash functions.
Main namespace for Aleph-w library functions.
Definition ah-arena.H:89
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
std::decay_t< typename HeadC::Item_Type > T
Definition ah-zip.H:105
size_t dft_hash_fct(const Key &key) noexcept
Primary default hash: best speed/quality trade-off.
Definition hash-fct.H:1003
size_t murmur3hash(const Key &key, std::uint32_t seed)
Definition hash-fct.H:334
Itor::difference_type count(const Itor &beg, const Itor &end, const T &value)
Count elements equal to a value.
Definition ahAlgo.H:127
Dynamic array container with automatic resizing.