Aleph-w 3.0
A C++ Library for Data Structures and Algorithms
Loading...
Searching...
No Matches
hyperloglog.H
Go to the documentation of this file.
1/*
2 Aleph_w
3
4 Data structures & Algorithms
5 version 2.0.0b
6 https://github.com/lrleon/Aleph-w
7
8 This file is part of Aleph-w library
9
10 Copyright (c) 2002-2026 Leandro Rabindranath Leon
11
12 Permission is hereby granted, free of charge, to any person obtaining a copy
13 of this software and associated documentation files (the "Software"), to deal
14 in the Software without restriction, including without limitation the rights
15 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 copies of the Software, and to permit persons to whom the Software is
17 furnished to do so, subject to the following conditions:
18
19 The above copyright notice and this permission notice shall be included in all
20 copies or substantial portions of the Software.
21
22 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 SOFTWARE.
29*/
30
31# ifndef HYPERLOGLOG_H
32# define HYPERLOGLOG_H
33
44# include <cmath>
45# include <algorithm>
46# include <bit>
47# include <cstdint>
48
49# include <ah-errors.H>
50# include <tpl_array.H>
51# include <hash-fct.H>
52
53namespace Aleph
54{
// HyperLogLog cardinality estimator, parameterized on the element type T.
// NOTE(review): the class-head line (listing line 68) and the data members
// b_ (listing line 70) and registers_ (listing line 73) that the methods in
// this class use do not appear in this listing -- confirm against the
// upstream header.
67 template <typename T>
69 {
// Number of registers (2^b_).
71 size_t m_;
// Bias correction constant applied to the raw estimate.
72 double alpha_m_;
78 [[nodiscard]] static uint64_t hash64(const T & val) noexcept
79 {
80 static_assert(sizeof(size_t) >= 8,
81 "HyperLogLog requires 64-bit size_t; murmur3hash is 32-bit on this platform");
82 return static_cast<uint64_t>(murmur3hash(val, 0x12345678));
83 }
84
88 [[nodiscard]] static size_t compute_m_or_throw(const uint8_t b)
89 {
91 << "HyperLogLog: b must be in range [4, 16] (got " << static_cast<int>(b) << ")";
92 return static_cast<size_t>(1) << b;
93 }
94
/**
 * Bias-correction constant alpha_m for a given register count.
 *
 * Uses tabulated values for m = 16, 32 and 64, and the closed form
 * 0.7213 / (1 + 1.079 / m) for every other m.
 *
 * @param m number of registers.
 * @return the constant alpha_m.
 */
[[nodiscard]] static double compute_alpha(const size_t m) noexcept
{
  switch (m)
    {
    case 16: return 0.673;
    case 32: return 0.697;
    case 64: return 0.709;
    default:
      {
        const double register_count = static_cast<double>(m);
        return 0.7213 / (1.0 + 1.079 / register_count);
      }
    }
}
105
106 public:
// Construct an estimator with precision parameter b (default 12).
// b_ is set to b, and m_ to compute_m_or_throw(b), which also validates b.
// NOTE(review): listing lines 115-116 are absent here, so the initializers
// that follow the trailing comma (presumably for alpha_m_ and registers_)
// are not visible -- confirm against the upstream header.
113 explicit HyperLogLog(const uint8_t b = 12)
114 : b_(b), m_(compute_m_or_throw(b)),
117 {}
118
126 void update(const T & val)
127 {
128 const uint64_t x = hash64(val);
129 const size_t j = x >> (64 - b_); // upper b bits for index
130 const uint64_t w = x << b_ | (static_cast<uint64_t>(1) << (b_ - 1)); // remaining bits
131
132 // count leading zeros in the hash part not used for indexing
133
134 if (const auto rho = static_cast<uint8_t>(std::countl_zero(w) + 1); rho > registers_[j])
135 registers_(j) = rho;
136 }
137
147 {
148 double z = 0.0;
149 size_t v = 0; // count of empty registers for Linear Counting
150 for (size_t i = 0; i < m_; ++i)
151 {
152 // More efficient and stable than std::pow(2.0, -registers_[i])
153 z += std::ldexp(1.0, -static_cast<int>(registers_[i]));
154 if (registers_[i] == 0)
155 ++v;
156 }
157
158 // Raw estimate
159 double e = alpha_m_ * (static_cast<double>(m_) * static_cast<double>(m_)) / z;
160
161 // Small range correction (Linear Counting)
162 if (e <= 2.5 * static_cast<double>(m_))
163 {
164 if (v > 0)
165 e = static_cast<double>(m_) * std::log(static_cast<double>(m_) / static_cast<double>(v));
166 }
167 // Large range correction: use log1p for numerical stability; saturate when ratio >= 1
168 else
169 {
170 static const double two_to_64 = std::ldexp(1.0, 64);
171 if (e > (1.0 / 30.0) * two_to_64)
172 {
173 const double ratio = e / two_to_64;
174 if (ratio >= 1.0)
175 return two_to_64; // saturate at hash-space limit
176 e = -two_to_64 * std::log1p(-ratio);
177 }
178 }
179
180 return e;
181 }
182
194 void merge(const HyperLogLog & other)
195 {
196 ah_domain_error_if(b_ != other.b_) << "HyperLogLog::merge: precision mismatch";
197
198 for (size_t i = 0; i < m_; ++i)
199 registers_(i) = std::max(registers_[i], other.registers_[i]);
200 }
201
208 void clear()
209 {
210 for (size_t i = 0; i < m_; ++i)
211 registers_(i) = 0;
212 }
213
220 [[nodiscard]] size_t num_registers() const noexcept { return m_; }
221 };
222} // namespace Aleph
223
224# endif // HYPERLOGLOG_H
Exception handling system with formatted messages for Aleph-w.
#define ah_domain_error_if(C)
Throws std::domain_error if condition holds.
Definition ah-errors.H:522
long double w
Definition btreepic.C:153
Simple dynamic array with automatic resizing and functional operations.
Definition tpl_array.H:139
HyperLogLog cardinality estimator.
Definition hyperloglog.H:69
double estimate() const noexcept
Estimate current cardinality.
Array< uint8_t > registers_
Maximum rank (leading-zero count + 1) observed per bucket.
Definition hyperloglog.H:73
HyperLogLog(const uint8_t b=12)
Construct with precision parameter.
static size_t compute_m_or_throw(const uint8_t b)
Validate b and compute m = 2^b.
Definition hyperloglog.H:88
void clear()
Reset all registers to zero.
double alpha_m_
Bias correction constant.
Definition hyperloglog.H:72
static uint64_t hash64(const T &val) noexcept
Compute 64-bit hash using MurmurHash3.
Definition hyperloglog.H:78
static double compute_alpha(const size_t m) noexcept
Compute bias-correction constant alpha_m from Flajolet et al.
Definition hyperloglog.H:98
void merge(const HyperLogLog &other)
Merge another HyperLogLog into this one (union of sets).
void update(const T &val)
Add an element to the set.
size_t m_
Number of registers (2^b_).
Definition hyperloglog.H:71
size_t num_registers() const noexcept
Number of HyperLogLog registers (2^b).
uint8_t b_
Number of bits for bucket index.
Definition hyperloglog.H:70
Main namespace for Aleph-w library functions.
Definition ah-arena.H:89
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
std::decay_t< typename HeadC::Item_Type > T
Definition ah-zip.H:105
size_t murmur3hash(const Key &key, std::uint32_t seed)
Definition hash-fct.H:334
FooMap m(5, fst_unit_pair_hash, snd_unit_pair_hash)
Dynamic array container with automatic resizing.