Aleph-w 3.0
A C++ Library for Data Structures and Algorithms
Loading...
Searching...
No Matches
hash_statistical_test.cc
Go to the documentation of this file.
1/*
2 Aleph_w
3
4 Data structures & Algorithms
5 version 2.0.0b
6 https://github.com/lrleon/Aleph-w
7
8 This file is part of Aleph-w library
9
10 Copyright (c) 2002-2026 Leandro Rabindranath Leon
11
12 Permission is hereby granted, free of charge, to any person obtaining a copy
13 of this software and associated documentation files (the "Software"), to deal
14 in the Software without restriction, including without limitation the rights
15 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 copies of the Software, and to permit persons to whom the Software is
17 furnished to do so, subject to the following conditions:
18
19 The above copyright notice and this permission notice shall be included in all
20 copies or substantial portions of the Software.
21
22 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 SOFTWARE.
29*/
30
31#include <gtest/gtest.h>
32#include <hash-fct.H>
33
34#include <algorithm>
35#include <array>
36#include <bit>
37#include <chrono>
38#include <cmath>
39#include <cstdint>
40#include <functional>
41#include <iomanip>
42#include <iostream>
43#include <limits>
44#include <numeric>
45#include <random>
46#include <sstream>
47#include <string>
48#include <unordered_set>
49#include <utility>
50
51#include <tpl_array.H>
52#include <ahSort.H>
53
54using namespace Aleph;
55
56namespace
57{
58
59enum class HashProfile
60{
61 Weak,
62 General,
63 Strong
64};
65
66struct Thresholds
67{
68 double min_uniqueness = 0.0;
69 double max_random_chi = std::numeric_limits<double>::infinity();
70 double max_prefix_chi = std::numeric_limits<double>::infinity();
71 double max_sequential_chi = std::numeric_limits<double>::infinity();
72 double min_avalanche = 0.0;
73 double max_bit_bias = std::numeric_limits<double>::infinity();
74};
75
76struct Candidate
77{
78 const char * name = "";
79 HashProfile profile = HashProfile::Weak;
80 // Used for all statistical tests (correctness, not timing-sensitive).
81 std::function<size_t(const std::string &)> hash;
82 // Used only for throughput measurement. Must be a non-capturing lambda
83 // (implicitly convertible to a plain function pointer so the compiler sees
84 // a single constant call target, the CPU branch predictor can cache it,
85 // and -- unlike std::function -- there is no heap allocation or virtual
86 // dispatch on every call).
87 size_t (* fast_hash)(const std::string &) = nullptr;
88};
89
90struct Metrics
91{
92 double uniqueness = 0.0;
93 double random_chi = 0.0;
94 double prefix_chi = 0.0;
95 double sequential_chi = 0.0;
96 double avalanche = 0.0;
97 double max_bit_bias = 0.0;
98 double throughput_mib_s = 0.0;
99 size_t full_collisions = 0;
100 size_t max_random_bucket = 0;
101 uint64_t total_calls = 0;
102 double latency_ns = 0.0;
103 size_t output_bits = 0;
104};
105
106Thresholds thresholds_for(HashProfile profile)
107{
108 switch (profile)
109 {
110 case HashProfile::Weak:
111 return {};
112 case HashProfile::General:
113 return {.min_uniqueness = 0.970,
114 .max_random_chi = 4.25,
115 .max_prefix_chi = 6.50,
116 .max_sequential_chi = 5.50,
117 .min_avalanche = 0.34,
118 .max_bit_bias = 0.15};
119 case HashProfile::Strong:
120 return {.min_uniqueness = 0.995,
121 .max_random_chi = 2.60,
122 .max_prefix_chi = 3.40,
123 .max_sequential_chi = 3.20,
124 .min_avalanche = 0.42,
125 .max_bit_bias = 0.09};
126 }
127
128 return {};
129}
130
132 size_t min_len,
133 size_t max_len,
134 std::uint64_t seed)
135{
136 std::mt19937_64 rng(seed);
137 std::uniform_int_distribution<size_t> len_dist(min_len, max_len);
138 std::uniform_int_distribution<int> byte_dist(0, 255);
141 std::unordered_set<std::string> seen;
142 seen.reserve(count * 2);
143
144 while (out.size() < count)
145 {
146 const size_t len = len_dist(rng);
147 std::string sample(len, '\0');
148 for (size_t j = 0; j < len; ++j)
149 sample[j] = static_cast<char>(byte_dist(rng));
150 if (seen.insert(sample).second)
151 out.append(std::move(sample));
152 }
153
154 return out;
155}
156
158 size_t prefix_len,
159 size_t suffix_len)
160{
163 const std::string prefix(prefix_len, 'A');
164
165 for (size_t i = 0; i < count; ++i)
166 {
167 std::string sample = prefix;
168 // Use a 64-bit counter so shifts up to 56 bits are well-defined on
169 // 32-bit platforms where size_t is only 32 bits.
170 const std::uint64_t v = static_cast<std::uint64_t>(i);
171 for (size_t j = 0; j < suffix_len; ++j)
172 sample.push_back(static_cast<char>((v >> (j * 8)) & 0xff));
173 out.append(std::move(sample));
174 }
175
176 return out;
177}
178
180{
183
184 for (size_t i = 0; i < count; ++i)
185 {
186 std::string sample(sizeof(std::uint64_t), '\0');
187 std::uint64_t value = static_cast<std::uint64_t>(i);
188 for (size_t j = 0; j < sizeof(value); ++j)
189 sample[j] = static_cast<char>((value >> (j * 8)) & 0xff);
190 out.append(std::move(sample));
191 }
192
193 return out;
194}
195
197 std::uint64_t seed)
198{
199 return make_random_binary_samples(count, len, len, seed);
200}
201
202std::pair<double, size_t> reduced_chi_square(const Candidate & candidate,
203 const Array<std::string> & samples,
204 size_t buckets)
205{
206 Array<size_t> counts(buckets, 0);
207 for (const auto & sample : samples)
208 ++counts[candidate.hash(sample) % buckets];
209
210 const double expected = static_cast<double>(samples.size()) / buckets;
211 double chi = 0.0;
212 size_t max_bucket = 0;
213 for (const size_t value : counts)
214 {
215 const double diff = static_cast<double>(value) - expected;
216 chi += (diff * diff) / expected;
217 max_bucket = std::max(max_bucket, value);
218 }
219
220 return {chi / static_cast<double>(buckets - 1), max_bucket};
221}
222
223double uniqueness_ratio(const Candidate & candidate,
224 const Array<std::string> & samples,
225 size_t & collisions)
226{
227 std::unordered_set<size_t> unique;
228 unique.reserve(samples.size() * 2);
229
230 for (const auto & sample : samples)
231 unique.insert(candidate.hash(sample));
232
233 collisions = samples.size() - unique.size();
234 return static_cast<double>(unique.size()) / samples.size();
235}
236
237size_t effective_output_bits(const Candidate & candidate,
238 const Array<std::string> & samples)
239{
240 size_t observed_mask = 0;
241
242 for (const auto & sample : samples)
243 observed_mask |= candidate.hash(sample);
244
245 const size_t bits = std::bit_width(observed_mask);
246 return std::max<size_t>(bits, 1);
247}
248
249double max_output_bit_bias(const Candidate & candidate,
250 const Array<std::string> & samples)
251{
252 const size_t bits = effective_output_bits(candidate, samples);
253 std::array<size_t, sizeof(size_t) * 8> ones = {};
254
255 for (const auto & sample : samples)
256 {
257 const size_t hash = candidate.hash(sample);
258 for (size_t bit = 0; bit < bits; ++bit)
259 ones[bit] += (hash >> bit) & size_t{1};
260 }
261
262 // Only check the bits that are actually used by this hash function.
263 // Iterating beyond `bits` would see ones[bit]==0 for all unused entries,
264 // which gives ratio=0 → |0-0.5|=0.5 — a spurious maximum bias for every
265 // hash function whose output is narrower than 64 bits.
266 double max_bias = 0.0;
267 for (size_t bit = 0; bit < bits; ++bit)
268 {
269 const double ratio = static_cast<double>(ones[bit]) / samples.size();
270 max_bias = std::max(max_bias, std::abs(ratio - 0.5));
271 }
272
273 return max_bias;
274}
275
276double avalanche_ratio(const Candidate & candidate,
277 const Array<std::string> & samples)
278{
279 // effective_output_bits is computed from the same sample set used for
280 // flipping so it captures the true output width (32 for SuperFastHash,
281 // 64 for xxhash64/wyhash/etc.) without inflating comparisons for
282 // 32-bit hashes or deflating them for 64-bit ones.
283 const size_t output_bits = effective_output_bits(candidate, samples);
284 std::uint64_t changed_bits = 0;
285 std::uint64_t comparisons = 0;
286
287 for (const auto & base : samples)
288 {
289 const size_t original = candidate.hash(base);
290 const size_t flip_bits = std::min<size_t>(base.size() * 8, 64);
291
292 for (size_t bit = 0; bit < flip_bits; ++bit)
293 {
294 std::string mutated = base;
295 mutated[bit / 8] ^= static_cast<char>(1u << (bit % 8));
296 const size_t hashed = candidate.hash(mutated);
297 changed_bits += std::popcount(original ^ hashed);
298 comparisons += output_bits;
299 }
300 }
301
302 return comparisons == 0 ? 0.0
303 : static_cast<double>(changed_bits) / comparisons;
304}
305
306struct ThroughputResult
307{
308 double mib_s = 0.0;
309 uint64_t calls = 0;
310 double latency_ns = 0.0;
311};
312
313ThroughputResult throughput_metrics(const Candidate & candidate,
314 const Array<std::string> & samples)
315{
316 // Use fast_hash (plain function pointer) when available to avoid the
317 // ~10-15 ns std::function dispatch overhead on every call.
318 // Fall back to hash (std::function) if fast_hash was not set.
319 const bool use_fast = (candidate.fast_hash != nullptr);
320
321 constexpr std::uint64_t target_bytes = 128ULL * 1024 * 1024;
322 std::uint64_t processed = 0;
323 uint64_t calls = 0;
324 volatile size_t sink = 0;
325
326 const auto start = std::chrono::steady_clock::now();
327 while (processed < target_bytes)
328 {
329 for (const auto & sample : samples)
330 {
331 sink ^= use_fast ? candidate.fast_hash(sample) : candidate.hash(sample);
332 processed += sample.size();
333 calls++;
334 if (processed >= target_bytes)
335 break;
336 }
337 }
338 const auto stop = std::chrono::steady_clock::now();
339 const std::chrono::duration<double> elapsed = stop - start;
340 (void) sink; // prevent dead-store elimination; correctness is not guaranteed
341
342 ThroughputResult res;
343 res.mib_s = static_cast<double>(processed) / (1024.0 * 1024.0) / elapsed.count();
344 res.calls = calls;
345 res.latency_ns = (std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count()) / static_cast<double>(calls);
346 return res;
347}
348
349Metrics measure_metrics(const Candidate & candidate,
354{
355 Metrics metrics;
357 metrics.full_collisions);
358 metrics.total_calls += random_samples.size();
359
360 const auto [random_chi, max_random_bucket]
362 metrics.random_chi = random_chi;
363 metrics.max_random_bucket = max_random_bucket;
364 metrics.total_calls += random_samples.size();
365
366 metrics.prefix_chi = reduced_chi_square(candidate, prefix_samples, 1024).first;
367 metrics.total_calls += prefix_samples.size();
368
369 metrics.sequential_chi = reduced_chi_square(candidate, sequential_samples, 1024).first;
370 metrics.total_calls += sequential_samples.size();
371
373 const size_t flip_bits = std::min<size_t>(avalanche_samples[0].size() * 8, 64);
374 metrics.total_calls += avalanche_samples.size() * (1 + flip_bits);
375
378 metrics.total_calls += random_samples.size();
379
381 metrics.throughput_mib_s = tp.mib_s;
382 metrics.latency_ns = tp.latency_ns;
383 metrics.total_calls += tp.calls;
384
385 return metrics;
386}
387
388// Helper: non-capturing lambdas used both as std::function and as plain
389// function pointers (fast_hash). They MUST remain non-capturing so that the
390// implicit conversion to size_t(*)(const std::string &) compiles.
392{
393 size_t add (const std::string & s) { return add_hash(s.data(), s.size()); }
394 size_t xor_ (const std::string & s) { return xor_hash(s.data(), s.size()); }
395 size_t rot (const std::string & s) { return rot_hash(s.data(), s.size()); }
396 size_t djb (const std::string & s) { return djb_hash(s.data(), s.size()); }
397 size_t sax (const std::string & s) { return sax_hash(s.data(), s.size()); }
398 size_t fnv (const std::string & s) { return fnv_hash(s.data(), s.size()); }
399 size_t oat (const std::string & s) { return oat_hash(s.data(), s.size()); }
400 size_t jsw (const std::string & s) { return jsw_hash(s.data(), s.size()); }
401 size_t elf (const std::string & s) { return elf_hash(s.data(), s.size()); }
402 size_t jen (const std::string & s)
403 { return jen_hash(static_cast<const void *>(s.data()), s.size(), Default_Hash_Seed); }
404 size_t sfh (const std::string & s) { return SuperFastHash(s); }
405 size_t murmur3 (const std::string & s) { return murmur3hash(s, 42ul); }
406 size_t xxh64 (const std::string & s) { return xxhash64_hash(s.data(), s.size(), 42); }
407 size_t wyh (const std::string & s) { return wyhash_hash(s.data(), s.size(), 42); }
408 size_t sip24 (const std::string & s) { return siphash24_hash(s.data(), s.size()); }
409} // namespace CandidateFns
410
412{
413 using namespace CandidateFns;
414 return {
415 {"add_hash", HashProfile::Weak, add, add},
416 {"xor_hash", HashProfile::Weak, xor_, xor_},
417 {"rot_hash", HashProfile::Weak, rot, rot},
418 {"djb_hash", HashProfile::General, djb, djb},
419 {"sax_hash", HashProfile::Weak, sax, sax},
420 {"fnv_hash", HashProfile::General, fnv, fnv},
421 {"oat_hash", HashProfile::General, oat, oat},
422 {"jsw_hash", HashProfile::Weak, jsw, jsw},
423 {"elf_hash", HashProfile::Weak, elf, elf},
424 {"jen_hash", HashProfile::Weak, jen, jen},
425 {"SuperFastHash", HashProfile::General, sfh, sfh},
426 {"murmur3hash", HashProfile::Strong, murmur3, murmur3},
427 {"xxhash64_hash", HashProfile::Strong, xxh64, xxh64},
428 {"wyhash_hash", HashProfile::Strong, wyh, wyh},
429 {"siphash24_hash", HashProfile::Strong, sip24, sip24},
430 };
431}
432
433std::string format_metrics_table(const Array<std::pair<std::string, Metrics>> & rows)
434{
435 std::ostringstream out;
436 out << "\nHash statistics summary\n";
437 out << std::left << std::setw(16) << "name"
438 << std::right << std::setw(5) << "bits"
439 << std::setw(8) << "uniq"
440 << std::setw(8) << "chi-r"
441 << std::setw(8) << "chi-p"
442 << std::setw(8) << "chi-s"
443 << std::setw(8) << "avalan"
444 << std::setw(8) << "bias"
445 << std::setw(10) << "MiB/s"
446 << std::setw(10) << "lat-ns"
447 << std::setw(8) << "coll"
448 << '\n';
449
450 for (const auto & [name, metrics] : rows)
451 {
452 out << std::left << std::setw(16) << name
453 << std::right
454 << std::setw(5) << metrics.output_bits
455 << std::fixed << std::setprecision(3)
456 << std::setw(8) << metrics.uniqueness
457 << std::setw(8) << metrics.random_chi
458 << std::setw(8) << metrics.prefix_chi
459 << std::setw(8) << metrics.sequential_chi
460 << std::setw(8) << metrics.avalanche
461 << std::setw(8) << metrics.max_bit_bias
462 << std::setw(10) << std::setprecision(1) << metrics.throughput_mib_s
463 << std::setw(10) << std::setprecision(2) << metrics.latency_ns
464 << std::setw(8) << metrics.full_collisions
465 << '\n';
466 }
467
468 return out.str();
469}
470
471class HashStatisticalTest : public ::testing::Test
472{
473protected:
474 static void SetUpTestSuite()
475 {
476 init_jsw(42);
477 }
478
479 static const Array<std::string> & random_samples()
480 {
481 static const auto samples = make_random_binary_samples(100000, 1, 64, 0x12345678ULL);
482 return samples;
483 }
484
485 static const Array<std::string> & prefix_samples()
486 {
487 static const auto samples = make_prefix_samples(100000, 28, 8);
488 return samples;
489 }
490
492 {
493 static const auto samples = make_sequential_samples(100000);
494 return samples;
495 }
496
498 {
499 static const auto samples = make_avalanche_samples(1024, 32, 0xabcdefULL);
500 return samples;
501 }
502};
503
505{
507
508 for (const auto & candidate : all_candidates())
509 {
511 const Metrics metrics = measure_metrics(candidate,
516 rows.append(std::make_pair(std::string(candidate.name), metrics));
517
518 if (candidate.profile == HashProfile::Weak)
519 {
520 EXPECT_GT(metrics.throughput_mib_s, 0.0);
521 continue;
522 }
523
524 const Thresholds thresholds = thresholds_for(candidate.profile);
525 EXPECT_GE(metrics.uniqueness, thresholds.min_uniqueness)
526 << candidate.name << " uniqueness too low";
527 EXPECT_LE(metrics.random_chi, thresholds.max_random_chi)
528 << candidate.name << " random chi-square too high";
529 EXPECT_LE(metrics.prefix_chi, thresholds.max_prefix_chi)
530 << candidate.name << " shared-prefix chi-square too high";
531 EXPECT_LE(metrics.sequential_chi, thresholds.max_sequential_chi)
532 << candidate.name << " sequential chi-square too high";
533 EXPECT_GE(metrics.avalanche, thresholds.min_avalanche)
534 << candidate.name << " avalanche ratio too low";
535 EXPECT_LE(metrics.max_bit_bias, thresholds.max_bit_bias)
536 << candidate.name << " output-bit bias too high";
537 }
538
539 std::cout << format_metrics_table(rows) << std::flush;
540
541 auto dispersion_score = [] (const Metrics & m)
542 {
543 return std::abs(1.0 - m.uniqueness) +
544 std::abs(1.0 - m.random_chi) +
545 std::abs(1.0 - m.prefix_chi) +
546 std::abs(1.0 - m.sequential_chi) +
547 std::abs(0.5 - m.avalanche) +
548 m.max_bit_bias;
549 };
550
551 auto dispersion_rows = rows;
553 [&] (const auto & a, const auto & b)
554 {
555 return dispersion_score(a.second) < dispersion_score(b.second);
556 });
557
558 std::cout << "\nRanking by Dispersion (lower is better):\n";
559 for (size_t i = 0; i < dispersion_rows.size(); ++i)
560 std::cout << std::setw(2) << i + 1 << ". "
561 << std::left << std::setw(16) << dispersion_rows[i].first
562 << " Score: " << std::fixed << std::setprecision(4)
563 << dispersion_score(dispersion_rows[i].second) << "\n";
564
565 auto speed_rows = rows;
567 [] (const auto & a, const auto & b)
568 {
569 return a.second.throughput_mib_s > b.second.throughput_mib_s;
570 });
571
572 std::cout << "\nRanking by Speed (higher is better):\n";
573 for (size_t i = 0; i < speed_rows.size(); ++i)
574 std::cout << std::setw(2) << i + 1 << ". "
575 << std::left << std::setw(16) << speed_rows[i].first
576 << " Speed: " << std::fixed << std::setprecision(1) << std::right << std::setw(8)
577 << speed_rows[i].second.throughput_mib_s << " MiB/s ("
578 << std::fixed << std::setprecision(2) << std::setw(6)
579 << speed_rows[i].second.latency_ns << " ns/op)\n";
580}
581
583{
586
587 for (std::uint64_t i = 1; i <= 4096; ++i)
588 {
589 const auto lhs = std::make_pair(i, i + 1);
590 const auto rhs = std::make_pair(i + 1, i);
595 }
596
597 std::cout << "\npair_dft swapped collisions: " << swapped_pair_dft_collisions
598 << "\npair_snd swapped collisions: " << swapped_pair_snd_collisions
599 << std::endl;
600
601 // Both combiners now use hash_combine (non-commutative), so swapped-pair
602 // collisions should be near zero for both (<= 1% of the sample).
603 EXPECT_LE(swapped_pair_dft_collisions, 41u); // <= 1 % of 4096
605}
606
608{
609 const std::pair<std::string, int> item{"aleph", 42};
610 const auto dft = map_hash_fct<std::string, int>(
611 [] (const std::string & key) { return dft_hash_fct(key); }, item);
613 [] (const std::string & key) { return snd_hash_fct(key); }, item);
615 [] (const std::string & key) { return xxhash64_hash(key); }, item);
616
617 EXPECT_EQ(dft, dft_hash_fct(item.first));
618 EXPECT_EQ(snd, snd_hash_fct(item.first));
619 EXPECT_EQ(xxh, xxhash64_hash(item.first));
620}
621
622// ---------------------------------------------------------------------------
623// CollisionScalingByBitWidth
624//
625// Demonstrates that 32-bit hash output causes birthday-paradox collisions at
626// table sizes well below 4 G entries, while 64-bit output stays collision-free
627// up to ~4 × 10^9 entries.
628//
629// Strategy:
630// 1. Generate N unique strings and hash them with wyhash_hash (64-bit).
631// 2. Count collisions for the full 64-bit value and for the low 32 bits.
632// 3. Compare empirical counts with the birthday-paradox prediction:
633// E[collisions] ≈ N² / (2 × 2^bits)
634// 4. Extrapolate to 4 G entries and assert the 32-bit rate is unacceptable.
635//
636// We cannot allocate 4 B strings, but even at N = 1 M the 32-bit collision
637// rate is already measurable; the formula gives ~2 B collisions at N = 4 G.
638// ---------------------------------------------------------------------------
639TEST_F(HashStatisticalTest, CollisionScalingByBitWidth)
640{
641 // Table sizes we can test in reasonable time.
642 const std::array<size_t, 6> sizes = {1'000, 10'000, 65'536, 100'000,
643 500'000, 1'000'000};
644
645 // Birthday-paradox formula: expected collisions for N items in 2^b slots.
646 auto expected_collisions = [] (size_t n, size_t bits) -> double
647 {
648 // E ≈ N*(N-1) / (2 * 2^bits). Use log-space to avoid overflow at 64 bits.
649 const double log2_space = static_cast<double>(bits);
650 const double log2_N2 = std::log2(static_cast<double>(n)) +
651 std::log2(static_cast<double>(n - 1));
652 const double log2_expected = log2_N2 - 1.0 - log2_space;
653 return (log2_expected > 50.0) ? std::pow(2.0, std::min(log2_expected, 60.0))
654 : std::pow(2.0, log2_expected);
655 };
656
657 std::cout << "\nCollision scaling: 32-bit vs 64-bit output (wyhash_hash)\n"
658 << std::left << std::setw(10) << "N"
659 << std::right << std::setw(14) << "32b-collisions"
660 << std::setw(14) << "32b-expected"
661 << std::setw(14) << "64b-collisions"
662 << std::setw(14) << "64b-expected"
663 << '\n';
664
665 for (const size_t n : sizes)
666 {
667 auto samples = make_random_binary_samples(n, 8, 32, 0xdeadbeefULL + n);
668
669 std::unordered_set<std::uint32_t> h32;
670 std::unordered_set<std::uint64_t> h64;
671 h32.reserve(n * 2);
672 h64.reserve(n * 2);
673
674 for (const auto & s : samples)
675 {
676 const std::uint64_t v = wyhash_hash(s.data(), s.size(), 42);
677 h32.insert(static_cast<std::uint32_t>(v)); // keep only low 32 bits
678 h64.insert(v);
679 }
680
681 const size_t coll32 = n - h32.size();
682 const size_t coll64 = n - h64.size();
683 const double exp32 = expected_collisions(n, 32);
684 const double exp64 = expected_collisions(n, 64);
685
686 std::cout << std::left << std::setw(10) << n
687 << std::right
688 << std::setw(14) << coll32
689 << std::setw(14) << std::fixed << std::setprecision(2) << exp32
690 << std::setw(14) << coll64
691 << std::setw(14) << std::fixed << std::setprecision(6) << exp64
692 << '\n';
693
694 // 64-bit collision check is only meaningful when size_t is 64 bits.
695 // On 32-bit platforms wyhash_hash() returns a 32-bit size_t truncated
696 // to the same width as the 32-bit column, making coll64 == coll32.
697 if constexpr (sizeof(size_t) >= 8)
698 EXPECT_EQ(coll64, 0u) << "64-bit hash unexpectedly collided at N=" << n;
699 }
700
701 // Theoretical projection to N = 4 × 10^9 (representative "large table").
702 const double N4G = 4e9;
703 const double exp32_4G = N4G * (N4G - 1.0) / (2.0 * std::pow(2.0, 32));
704 const double exp64_4G = N4G * (N4G - 1.0) / (2.0 * std::pow(2.0, 64));
705
706 std::cout << "\nExtrapolation to N = 4 × 10^9:\n"
707 << " 32-bit expected collisions : " << std::scientific
708 << std::setprecision(3) << exp32_4G << " (~50 % of N)\n"
709 << " 64-bit expected collisions : " << exp64_4G
710 << " (negligible)\n";
711
712 // At 4 G entries a 32-bit hash has roughly N/2 collisions — clearly unusable.
713 EXPECT_GT(exp32_4G, N4G * 0.1)
714 << "32-bit hash should show >10 % collision rate at 4 G entries";
715 EXPECT_LT(exp64_4G, 1.0)
716 << "64-bit hash should show < 1 expected collision at 4 G entries";
717}
718
719} // namespace
TEST_F(StaticArenaFixture, simple_fail)
Definition ah-arena.cc:59
High-level sorting functions for Aleph containers.
Simple dynamic array with automatic resizing and functional operations.
Definition tpl_array.H:139
constexpr size_t size() const noexcept
Return the number of elements stored in the stack.
Definition tpl_array.H:351
T & append(const T &data)
Append a copy of data
Definition tpl_array.H:245
void reserve(size_t cap)
Reserves cap cells into the array.
Definition tpl_array.H:315
static mt19937 rng
__gmp_expr< typename __gmp_resolve_expr< T, V >::value_type, __gmp_binary_expr< __gmp_expr< T, U >, __gmp_expr< V, W >, __gmp_pow_function > > pow(const __gmp_expr< T, U > &expr1, const __gmp_expr< V, W > &expr2)
Definition gmpfrxx.h:4061
size_t sax_hash(const void *key, const size_t len) noexcept
Shift-Add-XOR hash (SAX hash)
Definition hash-fct.H:183
size_t SuperFastHash(const void *key, size_t len, std::uint32_t seed=0) noexcept
Paul Hsieh super fast hash function.
Definition hash-fct.H:408
size_t add_hash(const void *key, const size_t len) noexcept
Additive hash.
Definition hash-fct.H:101
size_t jsw_hash(const void *key, size_t len) noexcept
JSW hash (Julienne Walker)
Definition hash-fct.C:247
size_t xxhash64_hash(const void *key, size_t len, std::uint64_t seed) noexcept
xxHash64 from the xxHash family.
Definition hash-fct.C:611
size_t djb_hash(const void *key, const size_t len) noexcept
Bernstein's hash (DJB hash)
Definition hash-fct.H:163
size_t jen_hash(const void *key, size_t length, unsigned initval) noexcept
Jenkins hash (lookup3)
Definition hash-fct.C:325
size_t xor_hash(const void *key, const size_t len) noexcept
XOR hash.
Definition hash-fct.H:122
size_t fnv_hash(const void *key, const size_t len) noexcept
FNV-1a hash.
Definition hash-fct.H:203
size_t siphash24_hash(const void *key, size_t len, std::uint64_t key0, std::uint64_t key1) noexcept
SipHash-2-4 keyed hash.
Definition hash-fct.C:766
void init_jsw() noexcept
Initializes the randomized lookup table used by jsw_hash().
Definition hash-fct.C:219
size_t rot_hash(const void *key, const size_t len) noexcept
Rotating hash.
Definition hash-fct.H:143
size_t wyhash_hash(const void *key, size_t len, std::uint64_t seed) noexcept
wyhash non-cryptographic hash.
Definition hash-fct.C:694
size_t elf_hash(const void *key, const size_t len) noexcept
ELF hash.
Definition hash-fct.H:279
size_t oat_hash(const void *key, const size_t len) noexcept
One-at-a-Time hash (OAT hash)
Definition hash-fct.H:223
Main namespace for Aleph-w library functions.
Definition ah-arena.H:89
size_t pair_snd_hash_fct(const std::pair< K1, K2 > &p) noexcept
Definition hash-fct.H:1134
Itor unique(Itor __first, Itor __last, BinaryPredicate __binary_pred=BinaryPredicate())
Remove consecutive duplicates in place.
Definition ahAlgo.H:1058
size_t size(Node *root) noexcept
size_t snd_hash_fct(const Key &key) noexcept
Secondary default hash: different distribution from dft_hash_fct.
Definition hash-fct.H:1081
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
DynArray< T > & in_place_sort(DynArray< T > &c, Cmp cmp=Cmp())
Sorts a DynArray in place.
Definition ahSort.H:321
static void prefix(Node *root, DynList< Node * > &acc)
bool diff(const C1 &c1, const C2 &c2, Eq e=Eq())
Check if two containers differ.
size_t dft_hash_fct(const Key &key) noexcept
Primary default hash: best speed/quality trade-off.
Definition hash-fct.H:1003
size_t pair_dft_hash_fct(const std::pair< K1, K2 > &p) noexcept
Definition hash-fct.H:1121
const unsigned Default_Hash_Seed
Definition hash-fct.C:45
size_t murmur3hash(const Key &key, std::uint32_t seed)
Definition hash-fct.H:334
Itor::difference_type count(const Itor &beg, const Itor &end, const T &value)
Count elements equal to a value.
Definition ahAlgo.H:127
STL namespace.
FooMap m(5, fst_unit_pair_hash, snd_unit_pair_hash)
ValueArg< size_t > seed
Definition testHash.C:48
Dynamic array container with automatic resizing.