Aleph-w 3.0
A C++ Library for Data Structures and Algorithms
Loading...
Searching...
No Matches
reservoir-sampling.H
Go to the documentation of this file.
1/*
2 Aleph_w
3
4 Data structures & Algorithms
5 version 2.0.0b
6 https://github.com/lrleon/Aleph-w
7
8 This file is part of Aleph-w library
9
10 Copyright (c) 2002-2026 Leandro Rabindranath Leon
11
12 Permission is hereby granted, free of charge, to any person obtaining a copy
13 of this software and associated documentation files (the "Software"), to deal
14 in the Software without restriction, including without limitation the rights
15 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 copies of the Software, and to permit persons to whom the Software is
17 furnished to do so, subject to the following conditions:
18
19 The above copyright notice and this permission notice shall be included in all
20 copies or substantial portions of the Software.
21
22 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 SOFTWARE.
29*/
30
31# ifndef RESERVOIR_SAMPLING_H
32# define RESERVOIR_SAMPLING_H
33
45# include <chrono>
46# include <cstdint>
47# include <limits>
48# include <memory>
49# include <iterator>
50# include <gsl/gsl_rng.h>
51
52# include <ah-errors.H>
53# include <tpl_array.H>
54
55namespace Aleph
56{
57
71 template <typename T>
73 {
74 using GslRngPtr = std::unique_ptr<gsl_rng, void(*)(gsl_rng *)>;
75
76 size_t k_;
77 size_t n_seen_ = 0;
78 unsigned long seed_;
85 public:
93 explicit Reservoir_Sampler(const size_t k,
94 const unsigned long seed = static_cast<unsigned long>(
95 std::chrono::high_resolution_clock::now().time_since_epoch().count()))
96 : k_(k), seed_(seed),
98 {
99 ah_runtime_error_unless(rng_ != nullptr) << "Reservoir_Sampler: GSL RNG allocation failed";
100 gsl_rng_set(rng_.get(), seed_);
101 rng_min_ = static_cast<uint64_t>(gsl_rng_min(rng_.get()));
102 rng_max_ = static_cast<uint64_t>(gsl_rng_max(rng_.get()));
103 rng_range_ = rng_max_ - rng_min_ + 1ULL;
104 if (k_ > 0)
105 reservoir_.reserve(k_);
106 }
107
117 void update(const T & val)
118 {
119 ah_overflow_error_if(n_seen_ == std::numeric_limits<size_t>::max())
120 << "Reservoir_Sampler::update: stream length exceeded size_t capacity";
121
122 if (k_ == 0)
123 {
124 ++n_seen_;
125 return;
126 }
127
128 if (n_seen_ < k_)
129 reservoir_.append(val);
130 else
131 {
132 // Probability of replacement is k / (n_seen + 1)
133 ah_overflow_error_if(n_seen_ >= std::numeric_limits<unsigned long>::max())
134 << "Reservoir_Sampler::update: stream length exceeds unsigned long capacity";
135
137 << "Reservoir_Sampler::update: stream length exceeds RNG generator range (" << rng_range_ << ")";
138
139 // Exclusive upper bound for the uniform random index in [0, n_seen_]
140 const auto upper_bound = static_cast<unsigned long>(n_seen_ + 1);
141 const size_t j = gsl_rng_uniform_int(rng_.get(), upper_bound);
142 if (j < k_)
143 reservoir_(j) = val;
144 }
145 ++n_seen_;
146 }
147
157 template <std::input_iterator Itor, std::sentinel_for<Itor> Sent>
158 void update(Itor beg, Sent end)
159 {
160 while (beg != end)
161 {
162 update(*beg);
163 ++beg;
164 }
165 }
166
172 {
173 return reservoir_;
174 }
175
180 [[nodiscard]] size_t total_seen() const noexcept { return n_seen_; }
181
186 [[nodiscard]] size_t sample_size() const noexcept { return k_; }
187
193
201 void set_n_seen_for_testing(const size_t n)
202 {
203 const size_t expected = (n < k_) ? n : k_;
205 << "set_n_seen_for_testing: inconsistent state: "
206 << "n=" << n << ", k=" << k_
207 << ", reservoir.size()=" << reservoir_.size()
208 << " (expected " << expected << ")";
209 n_seen_ = n;
210 }
211
218 void clear()
219 {
220 reservoir_.clear();
221 n_seen_ = 0;
222 gsl_rng_set(rng_.get(), seed_);
223 }
224 };
225
226
243 template <std::input_iterator Itor, std::sentinel_for<Itor> Sent>
244 [[nodiscard]] inline auto
245 reservoir_sample(Itor beg, Sent end, size_t k,
246 unsigned long seed = static_cast<unsigned long>(
247 std::chrono::high_resolution_clock::now().time_since_epoch().count()))
248 {
249 using T = typename std::iterator_traits<Itor>::value_type;
251 sampler.update(beg, end);
252 return Array<T>(sampler.get_sample());
253 }
254
255} // namespace Aleph
256
257# endif // RESERVOIR_SAMPLING_H
Exception handling system with formatted messages for Aleph-w.
#define ah_runtime_error_unless(C)
Throws std::runtime_error if condition does NOT hold.
Definition ah-errors.H:250
#define ah_overflow_error_if(C)
Throws std::overflow_error if condition holds.
Definition ah-errors.H:463
#define ah_invalid_argument_if(C)
Throws std::invalid_argument if condition holds.
Definition ah-errors.H:639
Simple dynamic array with automatic resizing and functional operations.
Definition tpl_array.H:139
Incremental reservoir sampler (Algorithm R).
size_t sample_size() const noexcept
Target sample size.
uint64_t rng_range_
Cached range of the RNG (max - min + 1).
void update(const T &val)
Process a new element from the stream.
size_t n_seen_
Total elements seen so far.
void clear()
Reset the sampler while keeping k and seed.
const Array< T > & get_sample() const noexcept
Returns the current random sample.
Reservoir_Sampler(const size_t k, const unsigned long seed=static_cast< unsigned long >(std::chrono::high_resolution_clock::now().time_since_epoch().count()))
Initialize sampler.
void update(Itor beg, Sent end)
Process multiple elements.
uint64_t rng_min_
Minimum value produced by RNG.
uint64_t rng_range() const noexcept
RNG output range (max - min + 1).
GslRngPtr rng_
Random number generator.
Array< T > reservoir_
Current random sample.
uint64_t rng_max_
Maximum value produced by RNG.
void set_n_seen_for_testing(const size_t n)
Forcibly set the internal stream counter (for testing only).
std::unique_ptr< gsl_rng, void(*)(gsl_rng *)> GslRngPtr
size_t total_seen() const noexcept
Total elements processed so far.
unsigned long seed_
Stored seed for RNG reset.
Main namespace for Aleph-w library functions.
Definition ah-arena.H:89
auto reservoir_sample(Itor beg, Sent end, size_t k, unsigned long seed=static_cast< unsigned long >(std::chrono::high_resolution_clock::now().time_since_epoch().count()))
Convenience function to sample k elements from a range.
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
std::decay_t< typename HeadC::Item_Type > T
Definition ah-zip.H:105
Itor upper_bound(Itor beg, Itor end, const T &value)
Find upper bound in a sorted range.
Definition ahAlgo.H:1239
Itor::difference_type count(const Itor &beg, const Itor &end, const T &value)
Count elements equal to a value.
Definition ahAlgo.H:127
ValueArg< size_t > seed
Definition testHash.C:48
static int * k
Dynamic array container with automatic resizing.