Aleph-w 3.0
A C++ Library for Data Structures and Algorithms
Loading...
Searching...
No Matches
streaming_algorithms_test.cc
Go to the documentation of this file.
1/*
2 Aleph_w
3
4 Data structures & Algorithms
5 version 2.0.0b
6 https://github.com/lrleon/Aleph-w
7
8 This file is part of Aleph-w library
9
10 Copyright (c) 2002-2026 Leandro Rabindranath Leon
11
12 Permission is hereby granted, free of charge, to any person obtaining a copy
13 of this software and associated documentation files (the "Software"), to deal
14 in the Software without restriction, including without limitation the rights
15 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 copies of the Software, and to permit persons to whom the Software is
17 furnished to do so, subject to the following conditions:
18
19 The above copyright notice and this permission notice shall be included in all
20 copies or substantial portions of the Software.
21
22 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 SOFTWARE.
29*/
30
36# include <gtest/gtest.h>
37# include <string>
38# include <random>
39# include <vector>
40
41# include <tpl_dynArray.H>
42# include <tpl_hash.H>
43# include <tpl_dynList.H>
44# include <reservoir-sampling.H>
45# include <count-min-sketch.H>
46# include <hyperloglog.H>
47# include <minhash.H>
48# include <simhash.H>
49
50using namespace Aleph;
51
53{
54 DynArray<int> data;
55 for (int i = 0; i < 1000; ++i) data.append(i);
56
57 const size_t k = 50;
58 auto sample = reservoir_sample(data.begin(), data.end(), k, 12345);
59
60 ASSERT_EQ(sample.size(), k);
61
62 // All elements should be in the original range
63 for (size_t i = 0; i < sample.size(); ++i)
64 {
65 EXPECT_GE(sample[i], 0);
66 EXPECT_LT(sample[i], 1000);
67 }
68
69 // Check unique elements (mostly, since it's random sampling without replacement from unique input)
71 for (size_t i = 0; i < sample.size(); ++i)
72 unique_vals.insert(sample[i]);
73 EXPECT_EQ(unique_vals.size(), k);
74}
75
77{
79
80 cms.update("apple", 10);
81 cms.update("banana", 5);
82 cms.update("cherry", 1);
83
84 for (int i = 0; i < 100; ++i) cms.update("apple"); // total apple = 110
85
86 EXPECT_GE(cms.estimate("apple"), 110u);
87 EXPECT_GE(cms.estimate("banana"), 5u);
88 EXPECT_GE(cms.estimate("cherry"), 1u);
89 // Theoretical bound: f + epsilon * total_count = 1 + 0.01 * 121 = 2.21
90 // We use a safe bound of 10u to account for small total_count noise.
91 EXPECT_LE(cms.estimate("cherry"), 10u);
92
93 // Deterministic property check
94 const size_t before = cms.estimate("dragonfruit");
95 cms.update("dragonfruit", 5);
96 EXPECT_GE(cms.estimate("dragonfruit"), before + 5);
97
98 // Validation tests
99 EXPECT_THROW(Count_Min_Sketch<int>::from_error_bounds(0.0, 0.1), std::domain_error);
100 EXPECT_THROW(Count_Min_Sketch<int>::from_error_bounds(0.1, 0.0), std::domain_error);
101 EXPECT_THROW(Count_Min_Sketch<int>::from_error_bounds(0.1, 1.0), std::domain_error);
102}
103
105{
106 HyperLogLog<int> hll(10); // 1024 registers, ~3.2% error
107
108 const int n = 5000;
109 for (int i = 0; i < n; ++i)
110 hll.update(i % 1000); // 1000 unique elements
111
112 double est = hll.estimate();
113 EXPECT_NEAR(est, 1000.0, 1000.0 * 0.05); // allow 5% error
114
115 // Validation tests
116 EXPECT_THROW(HyperLogLog<int>(3), std::domain_error);
117 EXPECT_THROW(HyperLogLog<int>(17), std::domain_error);
118}
119
121{
122 EXPECT_EQ(mh.size(), expected_k);
123 const auto & sig = mh.get_signature();
124 EXPECT_EQ(sig.size(), expected_k);
125
126 // clear() behavior
127 mh.clear();
128 EXPECT_EQ(mh.size(), expected_k);
129 for (size_t i = 0; i < expected_k; ++i)
130 EXPECT_EQ(sig[i], std::numeric_limits<uint64_t>::max());
131}
132
134{
135 constexpr size_t K = 256;
136 MinHash<int> mh1(K);
137 MinHash<int> mh2(K);
138 MinHash<int> mh3(K);
140
141 // Set 1: [0..99]
142 // Set 2: [50..149] -> Intersection 50, Union 150 -> Jaccard = 1/3
143 // Set 3: [200..299] -> Jaccard with 1 = 0
145 set1.reserve(100);
146 set2.reserve(100);
147 set3.reserve(100);
148
149 for (int i = 0; i < 100; ++i)
150 {
151 set1.append(i);
152 set2.append(i + 50);
153 set3.append(i + 200);
154 }
155
156 for (int i = 0; i < 100; ++i) mh1.update(i);
157 for (int i = 50; i < 150; ++i) mh2.update(i);
158 for (int i = 200; i < 300; ++i) mh3.update(i);
159 mh1_range.update(set1.begin(), set1.end()); // range overload
160
161 EXPECT_EQ(mh1.size(), K);
162 EXPECT_EQ(mh2.size(), K);
163 EXPECT_EQ(mh3.size(), K);
164 EXPECT_EQ(mh1_range.size(), K);
165
166 // update(Itor, Itor) must be equivalent to repeated point updates.
167 EXPECT_DOUBLE_EQ(mh1.similarity(mh1_range), 1.0);
168 const auto & sig1 = mh1.get_signature();
169 const auto & sig1_range = mh1_range.get_signature();
170 const auto * sig_ptr = &mh1_range.get_signature();
171 EXPECT_NE(sig_ptr, nullptr);
172 EXPECT_EQ(sig_ptr, &mh1_range.get_signature());
173 ASSERT_EQ(sig1.size(), K);
174 ASSERT_EQ(sig1_range.size(), K);
175
176 bool changed_after_update = false;
177 for (size_t i = 0; i < K; ++i)
178 {
179 EXPECT_EQ(sig1[i], sig1_range[i]);
180 if (sig1_range[i] != std::numeric_limits<uint64_t>::max())
182 }
184
185 double sim12 = mh1.similarity(mh2);
186 double sim13 = mh1.similarity(mh3);
187
188 EXPECT_NEAR(sim12, 1.0/3.0, 0.1);
189 EXPECT_NEAR(sim13, 0.0, 0.05);
190
191 // Merge contract: returns *this and corresponds to union of sets.
194 union12.reserve(150);
195 for (int i = 0; i < 150; ++i)
196 union12.append(i);
197 mh_union_expected.update(union12.begin(), union12.end());
198
202 EXPECT_EQ(mh_merged.size(), K);
204
205 const auto & sig2 = mh2.get_signature();
206 const auto & sig_merged = mh_merged.get_signature();
207 const auto & sig_union = mh_union_expected.get_signature();
208 ASSERT_EQ(sig_merged.size(), K);
209 ASSERT_EQ(sig_union.size(), K);
210 for (size_t i = 0; i < K; ++i)
211 {
212 EXPECT_EQ(sig_merged[i], std::min(sig1[i], sig2[i]));
214 }
215
216 // Use helper to verify common contracts (size, get_signature, clear).
221}
222
224{
225 // update(Itor, Itor) should produce the same signature as repeated update(val).
228
230 for (int i = 0; i < 50; ++i)
231 elems.append(i);
232
233 for (int i = 0; i < 50; ++i)
234 mhLoop.update(i);
235 mhRange.update(elems.begin(), elems.end());
236
237 EXPECT_DOUBLE_EQ(mhLoop.similarity(mhRange), 1.0);
238
241
242 // merge() combines two signatures: union of [0..49] and [25..74]
243 // should give same result as building from [0..74] directly.
244 MinHash<int> mhA(64), mhB(64), mhUnion(64);
245 for (int i = 0; i < 50; ++i) mhA.update(i);
246 for (int i = 25; i < 75; ++i) mhB.update(i);
247 for (int i = 0; i < 75; ++i) mhUnion.update(i);
248
249 mhA.merge(mhB);
250 // Merged should be close to union; similarity should be high.
251 EXPECT_GT(mhA.similarity(mhUnion), 0.8);
252
253 // merge() returns *this (chaining works).
254 MinHash<int> mhC(64), mhD(64);
255 mhC.update(1);
256 mhD.update(2);
258 EXPECT_EQ(&ref, &mhC);
259
260 // size() is stable after merge and clear.
261 EXPECT_EQ(mhA.size(), 64u);
262 mhA.clear();
263 EXPECT_EQ(mhA.size(), 64u);
264}
265
267{
268 // Constructor precondition: k must be > 0.
269 EXPECT_THROW((MinHash<int>(0)), std::domain_error);
270
271 // similarity() and merge() require matching signature sizes.
272 MinHash<int> mh64(64);
273 MinHash<int> mh32(32);
274 mh64.update(1);
275 mh32.update(1);
276
277 EXPECT_THROW((mh64.similarity(mh32)), std::domain_error);
278 EXPECT_THROW((mh32.similarity(mh64)), std::domain_error);
279 EXPECT_THROW(mh64.merge(mh32), std::domain_error);
280 EXPECT_THROW(mh32.merge(mh64), std::domain_error);
281}
282
284{
286
287 const DynArray<std::string> doc1 = {"this", "is", "a", "test", "document", "with", "some", "words"};
288 const DynArray<std::string> doc2 = {"this", "is", "a", "test", "document", "with", "other", "words"}; // one word diff
289 const DynArray<std::string> doc3 = {"completely", "unrelated", "content", "nothing", "matches", "here", "at", "all"};
290
291 for (size_t i = 0; i < doc1.size(); ++i) sh1.update(doc1[i]);
292 for (size_t i = 0; i < doc2.size(); ++i) sh2.update(doc2[i]);
293 for (size_t i = 0; i < doc3.size(); ++i) sh3.update(doc3[i]);
294
295 uint64_t f1 = sh1.get_fingerprint();
296 uint64_t f2 = sh2.get_fingerprint();
297 uint64_t f3 = sh3.get_fingerprint();
298
301
302 EXPECT_GT(sim12, 0.8);
303 EXPECT_LT(sim13, 0.7);
304}
305
307{
308 // 1. Reservoir sample with k=0, k=1 and empty input
310 EXPECT_EQ(reservoir_sample(empty_list.begin(), empty_list.end(), 0).size(), 0u);
311 EXPECT_EQ(reservoir_sample(empty_list.begin(), empty_list.end(), 5).size(), 0u);
312
313 DynList<int> singleton = {42};
314 auto s1 = reservoir_sample(singleton.begin(), singleton.end(), 1, 123);
315 ASSERT_EQ(s1.size(), 1u);
316 EXPECT_EQ(s1[0], 42);
317
318 // 2. Count-Min Sketch: invalid params and empty/singleton
319 EXPECT_THROW(Count_Min_Sketch<int>::from_error_bounds(0, 0.01), std::domain_error);
320
322 EXPECT_GE(cms.estimate(42), 0u); // empty: 0
323 cms.update(42, 10);
324 EXPECT_EQ(cms.estimate(42), 10u);
325
326 Count_Min_Sketch<int> cms_other(50, 5); // different width
327 EXPECT_THROW(cms.merge(cms_other), std::domain_error);
328
329 // 3. HyperLogLog: empty and singleton
331 EXPECT_NEAR(hll.estimate(), 0.0, 0.1);
332 hll.update(42);
333 EXPECT_NEAR(hll.estimate(), 1.0, 0.5);
334
335 // 4. MinHash: empty and singletons
336 MinHash<int> mh1(64), mh2(64);
337 EXPECT_EQ(mh1.similarity(mh2), 1.0); // empty vs empty
338 mh1.update(100);
339 EXPECT_LT(mh1.similarity(mh2), 0.1); // singleton vs empty
340 mh2.update(100);
341 EXPECT_EQ(mh1.similarity(mh2), 1.0); // same singleton
342
343 // 5. SimHash: empty and singletons
345 EXPECT_EQ(sh1.get_fingerprint(), 0u);
346 EXPECT_EQ(SimHash<int>::similarity(sh1.get_fingerprint(), sh2.get_fingerprint()), 1.0);
347 sh1.update(42);
348 sh2.update(42);
349 EXPECT_EQ(sh1.get_fingerprint(), sh2.get_fingerprint());
350 EXPECT_EQ(SimHash<int>::similarity(sh1.get_fingerprint(), sh2.get_fingerprint()), 1.0);
351}
352
354{
355 // Keep internal state consistent: for n_seen_ >= k, the reservoir must be full.
356 Reservoir_Sampler<int> s(5, 42);
357 for (int i = 0; i < 5; ++i)
358 s.update(i);
359 s.set_n_seen_for_testing(std::numeric_limits<size_t>::max());
360 EXPECT_THROW(s.update(1), std::overflow_error);
361
362 // Iterator overload delegates to update(), so the same guard fires.
364 for (int i = 0; i < 5; ++i)
365 s2.update(i);
366 s2.set_n_seen_for_testing(std::numeric_limits<size_t>::max());
367 DynArray<int> one = {99};
368 EXPECT_THROW(s2.update(one.begin(), one.end()), std::overflow_error);
369}
370
372{
373 // RNG range exhaustion fires when n_seen_ >= rng_range_, but only after
374 // the reservoir is already full (n_seen_ >= k). Seed the reservoir first
375 // via legitimate updates, then jump n_seen_ to rng_range_.
376 Reservoir_Sampler<int> s(3, 42);
377 for (int i = 0; i < 3; ++i)
378 s.update(i); // fill reservoir: n_seen_ == 3
379
380 const uint64_t rng_range = s.rng_range();
381 s.set_n_seen_for_testing(rng_range); // n_seen_ == rng_range_ >= k
382 EXPECT_THROW(s.update(99), std::runtime_error);
383
384 // Same path through the iterator overload.
386 for (int i = 0; i < 3; ++i)
387 s2.update(i);
388 s2.set_n_seen_for_testing(rng_range);
389 DynArray<int> one2 = {99};
390 EXPECT_THROW(s2.update(one2.begin(), one2.end()), std::runtime_error);
391}
392
394{
395 Count_Min_Sketch<int> cms1(size_t(100), size_t(5));
396 Count_Min_Sketch<int> cms2(size_t(100), size_t(5));
397
398 cms1.update(42, 10);
399 cms2.update(42, 5);
400 cms2.update(7, 3);
401
402 cms1.merge(cms2);
403 EXPECT_GE(cms1.estimate(42), 15u);
404 EXPECT_GE(cms1.estimate(7), 3u);
405}
Count-Min Sketch frequency estimator.
static Count_Min_Sketch from_error_bounds(const double epsilon, const double delta)
Create a sketch with target error bounds.
T & append()
Allocate a new entry to the end of array.
void reserve(const size_t l, const size_t r)
Allocate a range of entries.
Dynamic singly linked list with functional programming support.
Definition htlist.H:1155
HyperLogLog cardinality estimator.
Definition hyperloglog.H:69
MinHash signature generator.
Definition minhash.H:65
MinHash & merge(const MinHash &other)
Merge another MinHash signature into this one.
Definition minhash.H:170
Incremental reservoir sampler (Algorithm R).
void update(const T &val)
Process a new element from the stream.
uint64_t rng_range() const noexcept
RNG output range (max - min + 1).
void set_n_seen_for_testing(const size_t n)
Forcibly set the internal stream counter (for testing only).
SimHash fingerprint generator.
Definition simhash.H:66
static double similarity(const uint64_t f1, const uint64_t f2) noexcept
Estimate similarity with another fingerprint.
Definition simhash.H:152
iterator end() noexcept
Return an STL-compatible end iterator.
iterator begin() noexcept
Return an STL-compatible iterator to the first element.
Count-Min Sketch for frequency estimation in streams.
#define TEST(name)
HyperLogLog for approximate cardinality estimation.
MinHash for estimating Jaccard similarity between sets.
Main namespace for Aleph-w library functions.
Definition ah-arena.H:89
auto reservoir_sample(Itor beg, Sent end, size_t k, unsigned long seed=static_cast< unsigned long >(std::chrono::high_resolution_clock::now().time_since_epoch().count()))
Convenience function to sample k elements from a range.
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
Reservoir sampling for random selection from data streams.
SimHash for estimating cosine similarity between sets of features.
static void assertMinHashContracts(MinHash< int > &mh, size_t expected_k)
static int * k
Lazy and scalable dynamic array implementation.
Alias for htlist.H (DynList implementation).
Unified hash table interface.