31#include <gtest/gtest.h>
48#include <unordered_set>
68 double min_uniqueness = 0.0;
69 double max_random_chi = std::numeric_limits<double>::infinity();
70 double max_prefix_chi = std::numeric_limits<double>::infinity();
71 double max_sequential_chi = std::numeric_limits<double>::infinity();
72 double min_avalanche = 0.0;
73 double max_bit_bias = std::numeric_limits<double>::infinity();
78 const char * name =
"";
79 HashProfile profile = HashProfile::Weak;
81 std::function<size_t(
const std::string &)> hash;
87 size_t (* fast_hash)(
const std::string &) =
nullptr;
92 double uniqueness = 0.0;
93 double random_chi = 0.0;
94 double prefix_chi = 0.0;
95 double sequential_chi = 0.0;
96 double avalanche = 0.0;
97 double max_bit_bias = 0.0;
98 double throughput_mib_s = 0.0;
99 size_t full_collisions = 0;
100 size_t max_random_bucket = 0;
102 double latency_ns = 0.0;
103 size_t output_bits = 0;
110 case HashProfile::Weak:
112 case HashProfile::General:
113 return {.min_uniqueness = 0.970,
114 .max_random_chi = 4.25,
115 .max_prefix_chi = 6.50,
116 .max_sequential_chi = 5.50,
117 .min_avalanche = 0.34,
118 .max_bit_bias = 0.15};
119 case HashProfile::Strong:
120 return {.min_uniqueness = 0.995,
121 .max_random_chi = 2.60,
122 .max_prefix_chi = 3.40,
123 .max_sequential_chi = 3.20,
124 .min_avalanche = 0.42,
125 .max_bit_bias = 0.09};
138 std::uniform_int_distribution<int>
byte_dist(0, 255);
141 std::unordered_set<std::string>
seen;
147 std::string sample(len,
'\0');
148 for (
size_t j = 0; j < len; ++j)
150 if (
seen.insert(sample).second)
151 out.append(std::move(sample));
165 for (
size_t i = 0; i <
count; ++i)
167 std::string sample =
prefix;
170 const std::uint64_t v =
static_cast<std::uint64_t
>(i);
172 sample.push_back(
static_cast<char>((v >> (j * 8)) & 0xff));
173 out.append(std::move(sample));
184 for (
size_t i = 0; i <
count; ++i)
186 std::string sample(
sizeof(std::uint64_t),
'\0');
187 std::uint64_t value =
static_cast<std::uint64_t
>(i);
188 for (
size_t j = 0; j <
sizeof(value); ++j)
189 sample[j] =
static_cast<char>((value >> (j * 8)) & 0xff);
190 out.append(std::move(sample));
207 for (
const auto & sample : samples)
210 const double expected =
static_cast<double>(samples.
size()) / buckets;
213 for (
const size_t value :
counts)
215 const double diff =
static_cast<double>(value) -
expected;
227 std::unordered_set<size_t>
unique;
230 for (
const auto & sample : samples)
234 return static_cast<double>(
unique.size()) / samples.
size();
242 for (
const auto & sample : samples)
246 return std::max<size_t>(bits, 1);
253 std::array<size_t,
sizeof(size_t) * 8>
ones = {};
255 for (
const auto & sample : samples)
257 const size_t hash =
candidate.hash(sample);
269 const double ratio =
static_cast<double>(
ones[
bit]) / samples.
size();
287 for (
const auto & base : samples)
290 const size_t flip_bits = std::min<size_t>(base.size() * 8, 64);
306struct ThroughputResult
310 double latency_ns = 0.0;
321 constexpr std::uint64_t
target_bytes = 128ULL * 1024 * 1024;
324 volatile size_t sink = 0;
326 const auto start = std::chrono::steady_clock::now();
329 for (
const auto & sample : samples)
338 const auto stop = std::chrono::steady_clock::now();
339 const std::chrono::duration<double> elapsed = stop - start;
342 ThroughputResult
res;
343 res.mib_s =
static_cast<double>(
processed) / (1024.0 * 1024.0) / elapsed.count();
345 res.latency_ns = (std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count()) /
static_cast<double>(calls);
360 const auto [random_chi, max_random_bucket]
362 metrics.random_chi = random_chi;
363 metrics.max_random_bucket = max_random_bucket;
381 metrics.throughput_mib_s = tp.mib_s;
382 metrics.latency_ns = tp.latency_ns;
383 metrics.total_calls += tp.calls;
393 size_t add (
const std::string & s) {
return add_hash(s.data(), s.size()); }
394 size_t xor_ (
const std::string & s) {
return xor_hash(s.data(), s.size()); }
395 size_t rot (
const std::string & s) {
return rot_hash(s.data(), s.size()); }
396 size_t djb (
const std::string & s) {
return djb_hash(s.data(), s.size()); }
397 size_t sax (
const std::string & s) {
return sax_hash(s.data(), s.size()); }
398 size_t fnv (
const std::string & s) {
return fnv_hash(s.data(), s.size()); }
399 size_t oat (
const std::string & s) {
return oat_hash(s.data(), s.size()); }
400 size_t jsw (
const std::string & s) {
return jsw_hash(s.data(), s.size()); }
401 size_t elf (
const std::string & s) {
return elf_hash(s.data(), s.size()); }
402 size_t jen (
const std::string & s)
406 size_t xxh64 (
const std::string & s) {
return xxhash64_hash(s.data(), s.size(), 42); }
407 size_t wyh (
const std::string & s) {
return wyhash_hash(s.data(), s.size(), 42); }
415 {
"add_hash", HashProfile::Weak, add, add},
416 {
"xor_hash", HashProfile::Weak,
xor_,
xor_},
417 {
"rot_hash", HashProfile::Weak,
rot,
rot},
418 {
"djb_hash", HashProfile::General,
djb,
djb},
419 {
"sax_hash", HashProfile::Weak,
sax,
sax},
420 {
"fnv_hash", HashProfile::General,
fnv,
fnv},
421 {
"oat_hash", HashProfile::General,
oat,
oat},
422 {
"jsw_hash", HashProfile::Weak,
jsw,
jsw},
423 {
"elf_hash", HashProfile::Weak,
elf,
elf},
424 {
"jen_hash", HashProfile::Weak,
jen,
jen},
425 {
"SuperFastHash", HashProfile::General,
sfh,
sfh},
427 {
"xxhash64_hash", HashProfile::Strong,
xxh64,
xxh64},
428 {
"wyhash_hash", HashProfile::Strong,
wyh,
wyh},
429 {
"siphash24_hash", HashProfile::Strong,
sip24,
sip24},
435 std::ostringstream
out;
436 out <<
"\nHash statistics summary\n";
437 out << std::left << std::setw(16) <<
"name"
438 << std::right << std::setw(5) <<
"bits"
439 << std::setw(8) <<
"uniq"
440 << std::setw(8) <<
"chi-r"
441 << std::setw(8) <<
"chi-p"
442 << std::setw(8) <<
"chi-s"
443 << std::setw(8) <<
"avalan"
444 << std::setw(8) <<
"bias"
445 << std::setw(10) <<
"MiB/s"
446 << std::setw(10) <<
"lat-ns"
447 << std::setw(8) <<
"coll"
450 for (
const auto & [name,
metrics] : rows)
452 out << std::left << std::setw(16) << name
454 << std::setw(5) <<
metrics.output_bits
455 << std::fixed << std::setprecision(3)
456 << std::setw(8) <<
metrics.uniqueness
457 << std::setw(8) <<
metrics.random_chi
458 << std::setw(8) <<
metrics.prefix_chi
459 << std::setw(8) <<
metrics.sequential_chi
460 << std::setw(8) <<
metrics.avalanche
461 << std::setw(8) <<
metrics.max_bit_bias
462 << std::setw(10) << std::setprecision(1) <<
metrics.throughput_mib_s
463 << std::setw(10) << std::setprecision(2) <<
metrics.latency_ns
464 << std::setw(8) <<
metrics.full_collisions
471class HashStatisticalTest :
public ::testing::Test
474 static void SetUpTestSuite()
518 if (
candidate.profile == HashProfile::Weak)
526 <<
candidate.name <<
" uniqueness too low";
528 <<
candidate.name <<
" random chi-square too high";
530 <<
candidate.name <<
" shared-prefix chi-square too high";
532 <<
candidate.name <<
" sequential chi-square too high";
534 <<
candidate.name <<
" avalanche ratio too low";
536 <<
candidate.name <<
" output-bit bias too high";
543 return std::abs(1.0 -
m.uniqueness) +
544 std::abs(1.0 -
m.random_chi) +
545 std::abs(1.0 -
m.prefix_chi) +
546 std::abs(1.0 -
m.sequential_chi) +
547 std::abs(0.5 -
m.avalanche) +
553 [&] (
const auto & a,
const auto & b)
558 std::cout <<
"\nRanking by Dispersion (lower is better):\n";
560 std::cout << std::setw(2) << i + 1 <<
". "
562 <<
" Score: " << std::fixed << std::setprecision(4)
567 [] (
const auto & a,
const auto & b)
569 return a.second.throughput_mib_s > b.second.throughput_mib_s;
572 std::cout <<
"\nRanking by Speed (higher is better):\n";
573 for (
size_t i = 0; i <
speed_rows.size(); ++i)
574 std::cout << std::setw(2) << i + 1 <<
". "
575 << std::left << std::setw(16) <<
speed_rows[i].first
576 <<
" Speed: " << std::fixed << std::setprecision(1) << std::right << std::setw(8)
577 <<
speed_rows[i].second.throughput_mib_s <<
" MiB/s ("
578 << std::fixed << std::setprecision(2) << std::setw(6)
579 <<
speed_rows[i].second.latency_ns <<
" ns/op)\n";
587 for (std::uint64_t i = 1; i <= 4096; ++i)
589 const auto lhs = std::make_pair(i, i + 1);
590 const auto rhs = std::make_pair(i + 1, i);
609 const std::pair<std::string, int> item{
"aleph", 42};
611 [] (
const std::string & key) {
return dft_hash_fct(key); }, item);
613 [] (
const std::string & key) {
return snd_hash_fct(key); }, item);
615 [] (
const std::string & key) {
return xxhash64_hash(key); }, item);
642 const std::array<size_t, 6> sizes = {1'000, 10'000, 65'536, 100'000,
649 const double log2_space =
static_cast<double>(bits);
650 const double log2_N2 = std::log2(
static_cast<double>(n)) +
651 std::log2(
static_cast<double>(n - 1));
657 std::cout <<
"\nCollision scaling: 32-bit vs 64-bit output (wyhash_hash)\n"
658 << std::left << std::setw(10) <<
"N"
659 << std::right << std::setw(14) <<
"32b-collisions"
660 << std::setw(14) <<
"32b-expected"
661 << std::setw(14) <<
"64b-collisions"
662 << std::setw(14) <<
"64b-expected"
665 for (
const size_t n : sizes)
669 std::unordered_set<std::uint32_t>
h32;
670 std::unordered_set<std::uint64_t>
h64;
674 for (
const auto & s : samples)
676 const std::uint64_t v =
wyhash_hash(s.data(), s.size(), 42);
677 h32.insert(
static_cast<std::uint32_t
>(v));
686 std::cout << std::left << std::setw(10) << n
688 << std::setw(14) <<
coll32
689 << std::setw(14) << std::fixed << std::setprecision(2) <<
exp32
690 << std::setw(14) <<
coll64
691 << std::setw(14) << std::fixed << std::setprecision(6) <<
exp64
697 if constexpr (
sizeof(size_t) >= 8)
698 EXPECT_EQ(
coll64, 0u) <<
"64-bit hash unexpectedly collided at N=" << n;
702 const double N4G = 4
e9;
703 const double exp32_4G =
N4G * (
N4G - 1.0) / (2.0 * std::pow(2.0, 32));
704 const double exp64_4G =
N4G * (
N4G - 1.0) / (2.0 * std::pow(2.0, 64));
706 std::cout <<
"\nExtrapolation to N = 4 × 10^9:\n"
707 <<
" 32-bit expected collisions : " << std::scientific
708 << std::setprecision(3) <<
exp32_4G <<
" (~50 % of N)\n"
709 <<
" 64-bit expected collisions : " <<
exp64_4G
710 <<
" (negligible)\n";
714 <<
"32-bit hash should show >10 % collision rate at 4 G entries";
716 <<
"64-bit hash should show < 1 expected collision at 4 G entries";
TEST_F(StaticArenaFixture, simple_fail)
High-level sorting functions for Aleph containers.
Simple dynamic array with automatic resizing and functional operations.
constexpr size_t size() const noexcept
Return the number of elements stored in the stack.
T & append(const T &data)
Append a copy of data
void reserve(size_t cap)
Reserves cap cells into the array.
__gmp_expr< typename __gmp_resolve_expr< T, V >::value_type, __gmp_binary_expr< __gmp_expr< T, U >, __gmp_expr< V, W >, __gmp_pow_function > > pow(const __gmp_expr< T, U > &expr1, const __gmp_expr< V, W > &expr2)
size_t sax_hash(const void *key, const size_t len) noexcept
Shift-Add-XOR hash (SAX hash)
size_t SuperFastHash(const void *key, size_t len, std::uint32_t seed=0) noexcept
Paul Hsieh super fast hash function.
size_t add_hash(const void *key, const size_t len) noexcept
Additive hash.
size_t jsw_hash(const void *key, size_t len) noexcept
JSW hash (Julienne Walker)
size_t xxhash64_hash(const void *key, size_t len, std::uint64_t seed) noexcept
xxHash64 from the xxHash family.
size_t djb_hash(const void *key, const size_t len) noexcept
Bernstein's hash (DJB hash)
size_t jen_hash(const void *key, size_t length, unsigned initval) noexcept
Jenkins hash (lookup3)
size_t xor_hash(const void *key, const size_t len) noexcept
XOR hash.
size_t fnv_hash(const void *key, const size_t len) noexcept
FNV-1a hash.
size_t siphash24_hash(const void *key, size_t len, std::uint64_t key0, std::uint64_t key1) noexcept
SipHash-2-4 keyed hash.
void init_jsw() noexcept
Initializes the randomized lookup table used by jsw_hash().
size_t rot_hash(const void *key, const size_t len) noexcept
Rotating hash.
size_t wyhash_hash(const void *key, size_t len, std::uint64_t seed) noexcept
wyhash non-cryptographic hash.
size_t elf_hash(const void *key, const size_t len) noexcept
ELF hash.
size_t oat_hash(const void *key, const size_t len) noexcept
One-at-a-Time hash (OAT hash)
Main namespace for Aleph-w library functions.
size_t pair_snd_hash_fct(const std::pair< K1, K2 > &p) noexcept
Itor unique(Itor __first, Itor __last, BinaryPredicate __binary_pred=BinaryPredicate())
Remove consecutive duplicates in place.
size_t size(Node *root) noexcept
size_t snd_hash_fct(const Key &key) noexcept
Secondary default hash: different distribution from dft_hash_fct.
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
DynArray< T > & in_place_sort(DynArray< T > &c, Cmp cmp=Cmp())
Sorts a DynArray in place.
static void prefix(Node *root, DynList< Node * > &acc)
bool diff(const C1 &c1, const C2 &c2, Eq e=Eq())
Check if two containers differ.
size_t dft_hash_fct(const Key &key) noexcept
Primary default hash: best speed/quality trade-off.
size_t pair_dft_hash_fct(const std::pair< K1, K2 > &p) noexcept
const unsigned Default_Hash_Seed
size_t murmur3hash(const Key &key, std::uint32_t seed)
Itor::difference_type count(const Itor &beg, const Itor &end, const T &value)
Count elements equal to a value.
FooMap m(5, fst_unit_pair_hash, snd_unit_pair_hash)
Dynamic array container with automatic resizing.