Aleph-w 3.0
A C++ Library for Data Structures and Algorithms
Loading...
Searching...
No Matches
simhash.H
Go to the documentation of this file.
1/*
2 Aleph_w
3
4 Data structures & Algorithms
5 version 2.0.0b
6 https://github.com/lrleon/Aleph-w
7
8 This file is part of Aleph-w library
9
10 Copyright (c) 2002-2026 Leandro Rabindranath Leon
11
12 Permission is hereby granted, free of charge, to any person obtaining a copy
13 of this software and associated documentation files (the "Software"), to deal
14 in the Software without restriction, including without limitation the rights
15 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 copies of the Software, and to permit persons to whom the Software is
17 furnished to do so, subject to the following conditions:
18
19 The above copyright notice and this permission notice shall be included in all
20 copies or substantial portions of the Software.
21
22 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 SOFTWARE.
29*/
30
31# ifndef SIMHASH_H
32# define SIMHASH_H
33
45# include <algorithm>
46# include <cmath>
47# include <cstdint>
48# include <bit>
49
50# include <ah-errors.H>
51# include <tpl_array.H>
52# include <hash-fct.H>
53
54namespace Aleph
55{
64 template <typename T>
65 class SimHash
66 {
67 public:
74 static constexpr size_t FINGERPRINT_SIZE = 64;
75
76 private:
84 [[nodiscard]] static uint64_t hash64(const T & val) noexcept
85 {
86 if constexpr (sizeof(size_t) >= 8)
87 return static_cast<uint64_t>(murmur3hash(val, 0x9e3779b9UL));
88
89 const uint64_t lo =
90 static_cast<uint64_t>(murmur3hash(val, 0x9e3779b9UL)) & 0xffffffffULL;
91 const uint64_t hi =
92 static_cast<uint64_t>(murmur3hash(val, 0x85ebca6bUL)) & 0xffffffffULL;
93 return (hi << 32) | lo;
94 }
95
96 public:
99 : v_(FINGERPRINT_SIZE, 0.0)
100 {}
101
107 void update(const T & val, const double weight = 1.0)
108 {
109 const uint64_t hash = hash64(val);
110 for (size_t i = 0; i < FINGERPRINT_SIZE; ++i)
111 if ((hash >> i) & 1)
112 v_(i) += weight;
113 else
114 v_(i) -= weight;
115 }
116
/// Add all features in a range to the SimHash accumulator.
///
/// Every element in `[beg, end)` is read through its `first` and `second`
/// members, which are forwarded to the single-feature `update` as the
/// feature and its weight respectively.
///
/// @tparam Itor Iterator type; dereferencing it must yield something
///              exposing `first` and `second`.
/// @param[in] beg Iterator to the first element.
/// @param[in] end Iterator one past the last element.
template <typename Itor>
void update(Itor beg, const Itor & end)
{
  for (; beg != end; ++beg)
    {
      const auto & weighted_feature = *beg;
      update(weighted_feature.first, weighted_feature.second);
    }
}
137
140 {
141 uint64_t fingerprint = 0;
142 for (size_t i = 0; i < FINGERPRINT_SIZE; ++i)
143 if (v_(i) > 0)
144 fingerprint |= (static_cast<uint64_t>(1) << i);
145
146 return fingerprint;
147 }
148
152 [[nodiscard]] static double similarity(const uint64_t f1, const uint64_t f2) noexcept
153 {
154 const int distance = std::popcount(f1 ^ f2);
155 return 1.0 - (static_cast<double>(distance) / static_cast<double>(FINGERPRINT_SIZE));
156 }
157
159 void clear()
160 {
161 for (size_t i = 0; i < FINGERPRINT_SIZE; ++i)
162 v_(i) = 0.0;
163 }
164 };
165} // namespace Aleph
166
167# endif // SIMHASH_H
Exception handling system with formatted messages for Aleph-w.
Simple dynamic array with automatic resizing and functional operations.
Definition tpl_array.H:139
SimHash fingerprint generator.
Definition simhash.H:66
static constexpr size_t FINGERPRINT_SIZE
Number of bits in the SimHash fingerprint (always 64).
Definition simhash.H:74
static uint64_t hash64(const T &val) noexcept
Compute a 64-bit hash value from a feature.
Definition simhash.H:84
SimHash()
Construct SimHash accumulator.
Definition simhash.H:98
void update(const T &val, const double weight=1.0)
Add a feature to the set.
Definition simhash.H:107
void update(Itor beg, const Itor &end)
Add all features in a range to the SimHash accumulator.
Definition simhash.H:129
static double similarity(const uint64_t f1, const uint64_t f2) noexcept
Estimate similarity with another fingerprint.
Definition simhash.H:152
void clear()
Reset the accumulator.
Definition simhash.H:159
Array< double > v_
Accumulator vector for each bit.
Definition simhash.H:77
uint64_t get_fingerprint() const noexcept
Returns the fingerprint.
Definition simhash.H:139
Main namespace for Aleph-w library functions.
Definition ah-arena.H:89
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
std::decay_t< typename HeadC::Item_Type > T
Definition ah-zip.H:105
size_t murmur3hash(const Key &key, std::uint32_t seed)
Definition hash-fct.H:334
Dynamic array container with automatic resizing.