36# include <gtest/gtest.h>
55 for (
int i = 0; i < 1000; ++i) data.
append(i);
63 for (
size_t i = 0; i < sample.size(); ++i)
71 for (
size_t i = 0; i < sample.size(); ++i)
80 cms.update(
"apple", 10);
81 cms.update(
"banana", 5);
82 cms.update(
"cherry", 1);
84 for (
int i = 0; i < 100; ++i)
cms.update(
"apple");
94 const size_t before =
cms.estimate(
"dragonfruit");
95 cms.update(
"dragonfruit", 5);
109 for (
int i = 0; i < n; ++i)
110 hll.update(i % 1000);
112 double est =
hll.estimate();
123 const auto &
sig =
mh.get_signature();
130 EXPECT_EQ(
sig[i], std::numeric_limits<uint64_t>::max());
135 constexpr size_t K = 256;
149 for (
int i = 0; i < 100; ++i)
153 set3.append(i + 200);
156 for (
int i = 0; i < 100; ++i)
mh1.update(i);
157 for (
int i = 50; i < 150; ++i)
mh2.update(i);
158 for (
int i = 200; i < 300; ++i)
mh3.update(i);
168 const auto &
sig1 =
mh1.get_signature();
177 for (
size_t i = 0; i < K; ++i)
180 if (
sig1_range[i] != std::numeric_limits<uint64_t>::max())
195 for (
int i = 0; i < 150; ++i)
205 const auto &
sig2 =
mh2.get_signature();
210 for (
size_t i = 0; i < K; ++i)
230 for (
int i = 0; i < 50; ++i)
233 for (
int i = 0; i < 50; ++i)
245 for (
int i = 0; i < 50; ++i)
mhA.update(i);
246 for (
int i = 25; i < 75; ++i)
mhB.update(i);
247 for (
int i = 0; i < 75; ++i)
mhUnion.update(i);
291 for (
size_t i = 0; i <
doc1.size(); ++i)
sh1.update(
doc1[i]);
292 for (
size_t i = 0; i <
doc2.size(); ++i)
sh2.update(
doc2[i]);
293 for (
size_t i = 0; i <
doc3.size(); ++i)
sh3.update(
doc3[i]);
357 for (
int i = 0; i < 5; ++i)
364 for (
int i = 0; i < 5; ++i)
366 s2.set_n_seen_for_testing(std::numeric_limits<size_t>::max());
377 for (
int i = 0; i < 3; ++i)
386 for (
int i = 0; i < 3; ++i)
388 s2.set_n_seen_for_testing(rng_range);
Count-Min Sketch frequency estimator.
static Count_Min_Sketch from_error_bounds(const double epsilon, const double delta)
Create a sketch with target error bounds.
T & append()
Allocate a new entry to the end of array.
void reserve(const size_t l, const size_t r)
Allocate a range of entries.
Dynamic singly linked list with functional programming support.
HyperLogLog cardinality estimator.
MinHash signature generator.
MinHash & merge(const MinHash &other)
Merge another MinHash signature into this one.
Incremental reservoir sampler (Algorithm R).
void update(const T &val)
Process a new element from the stream.
uint64_t rng_range() const noexcept
RNG output range (max - min + 1).
void set_n_seen_for_testing(const size_t n)
Forcibly set the internal stream counter (for testing only).
SimHash fingerprint generator.
static double similarity(const uint64_t f1, const uint64_t f2) noexcept
Estimate similarity with another fingerprint.
iterator end() noexcept
Return an STL-compatible end iterator.
iterator begin() noexcept
Return an STL-compatible iterator to the first element.
Count-Min Sketch for frequency estimation in streams.
HyperLogLog for approximate cardinality estimation.
MinHash for estimating Jaccard similarity between sets.
Main namespace for Aleph-w library functions.
auto reservoir_sample(Itor beg, Sent end, size_t k, unsigned long seed=static_cast< unsigned long >(std::chrono::high_resolution_clock::now().time_since_epoch().count()))
Convenience function to sample k elements from a range.
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
Reservoir sampling for random selection from data streams.
SimHash for estimating cosine similarity between sets of features.
static void assertMinHashContracts(MinHash< int > &mh, size_t expected_k)
Lazy and scalable dynamic array implementation.
Alias for htlist.H (DynList implementation).
Unified hash table interface.