Aleph-w 3.0
A C++ Library for Data Structures and Algorithms
Loading...
Searching...
No Matches
minhash.H
Go to the documentation of this file.
1/*
2 Aleph_w
3
4 Data structures & Algorithms
5 version 2.0.0b
6 https://github.com/lrleon/Aleph-w
7
8 This file is part of Aleph-w library
9
10 Copyright (c) 2002-2026 Leandro Rabindranath Leon
11
12 Permission is hereby granted, free of charge, to any person obtaining a copy
13 of this software and associated documentation files (the "Software"), to deal
14 in the Software without restriction, including without limitation the rights
15 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 copies of the Software, and to permit persons to whom the Software is
17 furnished to do so, subject to the following conditions:
18
19 The above copyright notice and this permission notice shall be included in all
20 copies or substantial portions of the Software.
21
22 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 SOFTWARE.
29*/
30
31# ifndef MINHASH_H
32# define MINHASH_H
33
44# include <algorithm>
45# include <iterator>
46# include <limits>
47# include <cstdint>
48
49# include <ah-errors.H>
50# include <tpl_array.H>
51# include <hash-fct.H>
52
53namespace Aleph
54{
63 template <typename T>
64 class MinHash
65 {
66 size_t k_;
70 public:
75 explicit MinHash(size_t k = 128)
76 : k_(k)
77 {
78 ah_domain_error_if(k == 0) << "MinHash: k must be > 0";
79
81 for (size_t i = 0; i < k; ++i)
82 signature_.append(std::numeric_limits<uint64_t>::max());
84
85 // Use dft_hash_fct to generate independent seeds
86 for (size_t i = 0; i < k_; ++i)
87 seeds_.append(static_cast<uint32_t>(dft_hash_fct(i, 0xdeadbeef)));
88 }
89
95 void update(const T & val)
96 {
97 for (size_t i = 0; i < k_; ++i)
98 if (const auto h = static_cast<uint64_t>(murmur3hash(val, seeds_[i])); h < signature_[i])
99 signature_(i) = h;
100 }
101
112 template <std::input_iterator Itor>
113 void update(Itor beg, const Itor & end)
114 {
115 while (beg != end)
116 {
117 update(*beg);
118 ++beg;
119 }
120 }
121
127 [[nodiscard]] double similarity(const MinHash & other) const
128 {
130 << "MinHash::similarity: signature size mismatch";
131
132 size_t matches = 0;
133 for (size_t i = 0; i < k_; ++i)
134 if (signature_[i] == other.signature_[i])
135 ++matches;
136
137 return static_cast<double>(matches) / static_cast<double>(k_);
138 }
139
146 {
147 return signature_;
148 }
149
154 void clear()
155 {
156 constexpr uint64_t inf = std::numeric_limits<uint64_t>::max();
157 for (size_t i = 0; i < k_; ++i)
158 signature_(i) = inf;
159 }
160
171 {
173 << "MinHash::merge: signature size mismatch ("
174 << k_ << " vs " << other.k_ << ")";
175 for (size_t i = 0; i < k_; ++i)
176 signature_(i) = std::min(signature_(i), other.signature_(i));
177 return *this;
178 }
179
185 [[nodiscard]] size_t size() const noexcept { return k_; }
186 };
187} // namespace Aleph
188
189# endif // MINHASH_H
Exception handling system with formatted messages for Aleph-w.
#define ah_domain_error_if(C)
Throws std::domain_error if condition holds.
Definition ah-errors.H:522
long double h
Definition btreepic.C:154
Simple dynamic array with automatic resizing and functional operations.
Definition tpl_array.H:139
T & append(const T &data)
Append a copy of data
Definition tpl_array.H:245
void reserve(size_t cap)
Reserves cap cells into the array.
Definition tpl_array.H:315
MinHash signature generator.
Definition minhash.H:65
MinHash & merge(const MinHash &other)
Merge another MinHash signature into this one.
Definition minhash.H:170
size_t size() const noexcept
Number of hash functions used in the signature.
Definition minhash.H:185
void clear()
Reset the signature to the initial all-maximum state.
Definition minhash.H:154
void update(const T &val)
Add an element to the set.
Definition minhash.H:95
double similarity(const MinHash &other) const
Estimate Jaccard similarity with another signature.
Definition minhash.H:127
MinHash(size_t k=128)
Construct with signature size.
Definition minhash.H:75
size_t k_
Signature size.
Definition minhash.H:66
const Array< uint64_t > & get_signature() const noexcept
Returns the current signature.
Definition minhash.H:145
Array< uint64_t > signature_
Minimized hash values.
Definition minhash.H:67
Array< uint32_t > seeds_
Random seeds for hash functions.
Definition minhash.H:68
void update(Itor beg, const Itor &end)
Add all elements in a range to the MinHash signature.
Definition minhash.H:113
Main namespace for Aleph-w library functions.
Definition ah-arena.H:89
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
std::decay_t< typename HeadC::Item_Type > T
Definition ah-zip.H:105
size_t dft_hash_fct(const Key &key) noexcept
Primary default hash: best speed/quality trade-off.
Definition hash-fct.H:1003
size_t murmur3hash(const Key &key, std::uint32_t seed)
Definition hash-fct.H:334
static int * k
Dynamic array container with automatic resizing.