Aleph-w 3.0
A C++ Library for Data Structures and Algorithms
Loading...
Searching...
No Matches
string_search_test.cc
Go to the documentation of this file.
1/*
2 Aleph_w
3
4 Data structures & Algorithms
5 version 2.0.0b
6 https://github.com/lrleon/Aleph-w
7
8 This file is part of Aleph-w library
9
10 Copyright (c) 2002-2026 Leandro Rabindranath Leon
11
12 Permission is hereby granted, free of charge, to any person obtaining a copy
13 of this software and associated documentation files (the "Software"), to deal
14 in the Software without restriction, including without limitation the rights
15 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 copies of the Software, and to permit persons to whom the Software is
17 furnished to do so, subject to the following conditions:
18
19 The above copyright notice and this permission notice shall be included in all
20 copies or substantial portions of the Software.
21
22 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 SOFTWARE.
29*/
30
31
37# include <gtest/gtest.h>
38
39# include <random>
40# include <String_Search.H>
41
42using namespace Aleph;
43
44namespace
45{
46 template <typename T, typename U>
47 void expect_array_eq(const Array<T> & a,
48 std::initializer_list<U> expected)
49 {
50 ASSERT_EQ(a.size(), expected.size());
51 size_t i = 0;
52 for (const auto & v : expected)
53 EXPECT_EQ(a[i++], static_cast<T>(v));
54 }
55}
56
58{
59 const auto pi = kmp_prefix_function("aabaaab");
60 expect_array_eq(pi, {0, 1, 0, 1, 2, 2, 3});
61}
62
64{
65 const auto matches = kmp_search("ababa", "aba");
66 expect_array_eq(matches, {0, 2});
67}
68
70{
71 const auto matches = kmp_search("abc", "");
72 expect_array_eq(matches, {0, 1, 2, 3});
73}
74
76{
77 const auto z = z_algorithm("aaaaa");
78 expect_array_eq(z, {5, 4, 3, 2, 1});
79}
80
82{
83 const std::string text = "abcababcababcab";
84 const std::string pattern = "abcab";
85
86 const auto kmp = kmp_search(text, pattern);
87 const auto z = z_search(text, pattern);
88
89 ASSERT_EQ(z.size(), kmp.size());
90 for (size_t i = 0; i < z.size(); ++i)
91 EXPECT_EQ(z[i], kmp[i]);
92}
93
95{
96 const std::string text = "abracadabra abracadabra";
97 const std::string pattern = "abra";
98
99 const auto expected = kmp_search(text, pattern);
100 const auto got = boyer_moore_horspool_search(text, pattern);
101
102 ASSERT_EQ(got.size(), expected.size());
103 for (size_t i = 0; i < got.size(); ++i)
104 EXPECT_EQ(got[i], expected[i]);
105}
106
108{
109 const std::string text = "the quick brown fox jumps over the quick brown fox";
110 const std::string pattern = "quick";
111
112 const auto expected = kmp_search(text, pattern);
113 const auto got = rabin_karp_search(text, pattern);
114
115 ASSERT_EQ(got.size(), expected.size());
116 for (size_t i = 0; i < got.size(); ++i)
117 EXPECT_EQ(got[i], expected[i]);
118}
119
121{
122 const auto a = kmp_search("abcdefgh", "xyz");
123 const auto b = z_search("abcdefgh", "xyz");
124 const auto c = boyer_moore_horspool_search("abcdefgh", "xyz");
125 const auto d = rabin_karp_search("abcdefgh", "xyz");
126
127 EXPECT_TRUE(a.is_empty());
128 EXPECT_TRUE(b.is_empty());
129 EXPECT_TRUE(c.is_empty());
130 EXPECT_TRUE(d.is_empty());
131}
132
134{
135 const auto a = kmp_search("a", "a");
136 const auto b = z_search("a", "a");
137 const auto c = boyer_moore_horspool_search("a", "a");
138 const auto d = rabin_karp_search("a", "a");
139
140 expect_array_eq(a, {0});
141 expect_array_eq(b, {0});
142 expect_array_eq(c, {0});
143 expect_array_eq(d, {0});
144}
145
147{
148 EXPECT_TRUE(kmp_search("b", "a").is_empty());
149 EXPECT_TRUE(z_search("b", "a").is_empty());
150 EXPECT_TRUE(boyer_moore_horspool_search("b", "a").is_empty());
151 EXPECT_TRUE(rabin_karp_search("b", "a").is_empty());
152}
153
155{
156 EXPECT_TRUE(kmp_search("", "abc").is_empty());
157 EXPECT_TRUE(z_search("", "abc").is_empty());
158 EXPECT_TRUE(boyer_moore_horspool_search("", "abc").is_empty());
159 EXPECT_TRUE(rabin_karp_search("", "abc").is_empty());
160}
161
163{
164 EXPECT_TRUE(kmp_search("ab", "abcdef").is_empty());
165 EXPECT_TRUE(z_search("ab", "abcdef").is_empty());
166 EXPECT_TRUE(boyer_moore_horspool_search("ab", "abcdef").is_empty());
167 EXPECT_TRUE(rabin_karp_search("ab", "abcdef").is_empty());
168}
169
171{
172 // Build a long text with a known pattern planted at specific positions
173 const std::string pattern = "XYZW";
174 std::string text(10000, 'a');
175
176 // Plant pattern at known positions
177 const size_t positions[] = {0, 42, 500, 1234, 5000, 9996};
178 for (const size_t pos : positions)
179 text.replace(pos, pattern.size(), pattern);
180
181 const auto kmp = kmp_search(text, pattern);
182 const auto z = z_search(text, pattern);
183 const auto bmh = boyer_moore_horspool_search(text, pattern);
184 const auto rk = rabin_karp_search(text, pattern);
185
186 // All algorithms must agree
187 ASSERT_EQ(kmp.size(), z.size());
188 ASSERT_EQ(kmp.size(), bmh.size());
189 ASSERT_EQ(kmp.size(), rk.size());
190
191 for (size_t i = 0; i < kmp.size(); ++i)
192 {
193 EXPECT_EQ(kmp[i], z[i]);
194 EXPECT_EQ(kmp[i], bmh[i]);
195 EXPECT_EQ(kmp[i], rk[i]);
196 }
197
198 // Must find at least the planted positions
199 EXPECT_EQ(kmp.size(), 6u);
200}
201
203{
204 // Build text with non-ASCII / binary content
205 std::string text;
206 for (int i = 0; i < 256; ++i)
207 text.push_back(static_cast<char>(i));
208 text += text; // 512 bytes, all byte values
209
210 std::string pattern;
211 pattern.push_back(static_cast<char>(0xFE));
212 pattern.push_back(static_cast<char>(0xFF));
213 pattern.push_back(static_cast<char>(0x00));
214
215 const auto kmp = kmp_search(text, pattern);
216 const auto bmh = boyer_moore_horspool_search(text, pattern);
217 const auto rk = rabin_karp_search(text, pattern);
218
219 ASSERT_EQ(kmp.size(), bmh.size());
220 ASSERT_EQ(kmp.size(), rk.size());
221
222 for (size_t i = 0; i < kmp.size(); ++i)
223 {
224 EXPECT_EQ(kmp[i], bmh[i]);
225 EXPECT_EQ(kmp[i], rk[i]);
226 }
227
228 // Pattern {0xFE, 0xFF, 0x00} spans the boundary once (pos 254)
229 EXPECT_EQ(kmp.size(), 1u);
230}
231
233{
234 const std::string s = "hello world";
235 expect_array_eq(kmp_search(s, s), {0});
236 expect_array_eq(z_search(s, s), {0});
237 expect_array_eq(boyer_moore_horspool_search(s, s), {0});
238 expect_array_eq(rabin_karp_search(s, s), {0});
239}
240
242{
243 // "aaa...a" (1000 a's), pattern "aaaa" — many overlapping matches
244 const std::string text(1000, 'a');
245 const std::string pattern(4, 'a');
246
247 const auto kmp = kmp_search(text, pattern);
248 const auto z = z_search(text, pattern);
249 const auto bmh = boyer_moore_horspool_search(text, pattern);
250 const auto rk = rabin_karp_search(text, pattern);
251
252 EXPECT_EQ(kmp.size(), 997u); // 1000 - 4 + 1
253 EXPECT_EQ(z.size(), 997u);
254 EXPECT_EQ(bmh.size(), 997u);
255 EXPECT_EQ(rk.size(), 997u);
256}
257
258// --- 3.1 Z-algorithm with full 256-byte alphabet ---
259
261{
262 // Build a text that uses all 256 byte values so that
263 // find_unused_delimiter fails and z_search must handle it.
264 std::string text;
265 for (int i = 0; i < 256; ++i)
266 text.push_back(static_cast<char>(i));
267 text += text; // 512 bytes, every byte present
268
269 // Pattern: bytes {0x80, 0x81, 0x82}
270 std::string pattern;
271 pattern.push_back(static_cast<char>(0x80));
272 pattern.push_back(static_cast<char>(0x81));
273 pattern.push_back(static_cast<char>(0x82));
274
275 const auto kmp = kmp_search(text, pattern);
276 const auto z = z_search(text, pattern);
277
278 // Z-search must agree with KMP regardless of delimiter availability
279 ASSERT_EQ(z.size(), kmp.size());
280 for (size_t i = 0; i < z.size(); ++i)
281 EXPECT_EQ(z[i], kmp[i]);
282
283 // The pattern appears once per 256-byte block
284 EXPECT_EQ(kmp.size(), 2u);
285}
286
288{
289 // Construct text with all 256 bytes, then plant an overlapping pattern
290 std::string text;
291 for (int i = 0; i < 256; ++i)
292 text.push_back(static_cast<char>(i));
293
294 // Append a known overlapping section at the boundary
295 const std::string pattern = text.substr(250, 6); // last 6 bytes of first block
296 text += text; // 512 bytes
297
298 const auto kmp = kmp_search(text, pattern);
299 const auto z = z_search(text, pattern);
300
301 ASSERT_EQ(z.size(), kmp.size());
302 for (size_t i = 0; i < z.size(); ++i)
303 EXPECT_EQ(z[i], kmp[i]);
304
305 EXPECT_GE(kmp.size(), 2u);
306}
307
308// --- 3.2 Rabin-Karp with adversarial / collision-prone inputs ---
309
311{
312 // All same character: maximizes hash collisions for naive hashes,
313 // since every window has the same character distribution.
314 const std::string text(5000, 'A');
315 const std::string pattern(7, 'A');
316
317 const auto kmp = kmp_search(text, pattern);
318 const auto rk = rabin_karp_search(text, pattern);
319
320 ASSERT_EQ(rk.size(), kmp.size());
321 for (size_t i = 0; i < rk.size(); ++i)
322 EXPECT_EQ(rk[i], kmp[i]);
323
324 EXPECT_EQ(rk.size(), 5000u - 7u + 1u);
325}
326
328{
329 // Two-character alphabet with patterns that are permutations of each
330 // other. "aab" and "aba" have identical character frequencies, which
331 // can cause collisions in sum-based hashes.
332 std::string text;
333 for (int i = 0; i < 500; ++i)
334 text += "aab";
335 // Plant "aba" at a few positions
336 text.replace(99, 3, "aba");
337 text.replace(600, 3, "aba");
338
339 const auto kmp_aab = kmp_search(text, "aab");
340 const auto rk_aab = rabin_karp_search(text, "aab");
341
342 ASSERT_EQ(rk_aab.size(), kmp_aab.size());
343 for (size_t i = 0; i < rk_aab.size(); ++i)
344 EXPECT_EQ(rk_aab[i], kmp_aab[i]);
345
346 const auto kmp_aba = kmp_search(text, "aba");
347 const auto rk_aba = rabin_karp_search(text, "aba");
348
349 ASSERT_EQ(rk_aba.size(), kmp_aba.size());
350 for (size_t i = 0; i < rk_aba.size(); ++i)
351 EXPECT_EQ(rk_aba[i], kmp_aba[i]);
352}
353
355{
356 // Long pattern with very few matches — stresses the hash verification path.
357 std::string text(10000, 'x');
358 const std::string pattern(100, 'y');
359
360 // Plant the pattern at exactly two positions
361 text.replace(1234, 100, pattern);
362 text.replace(7777, 100, pattern);
363
364 const auto kmp = kmp_search(text, pattern);
365 const auto rk = rabin_karp_search(text, pattern);
366
367 ASSERT_EQ(rk.size(), kmp.size());
368 for (size_t i = 0; i < rk.size(); ++i)
369 EXPECT_EQ(rk[i], kmp[i]);
370
371 EXPECT_EQ(rk.size(), 2u);
372}
373
375{
376 // Verify correctness with different user-supplied bases
377 const std::string text = "the cat sat on the mat the cat";
378 const std::string pattern = "the cat";
379
380 const auto kmp = kmp_search(text, pattern);
381 const uint64_t bases[] = {2, 31, 256, 1000000007ull, 911382323ull};
382
383 for (const uint64_t base : bases)
384 {
385 const auto rk = rabin_karp_search(text, pattern, base);
386 ASSERT_EQ(rk.size(), kmp.size()) << "Failed with base=" << base;
387 for (size_t i = 0; i < rk.size(); ++i)
388 EXPECT_EQ(rk[i], kmp[i]) << "Failed with base=" << base;
389 }
390}
391
393{
394 std::mt19937 gen(42);
395 for (int iter = 0; iter < 1000; ++iter)
396 {
397 // Randomly choose alphabet size: 2 (binary-like), 4, 26, or 256 (full byte)
398 const int alphabet_choice = std::uniform_int_distribution<int>(0, 3)(gen);
399 const int alphabet_size = (alphabet_choice == 0) ? 2 :
400 (alphabet_choice == 1) ? 4 :
401 (alphabet_choice == 2) ? 26 : 256;
402
403 const int text_len = std::uniform_int_distribution<int>(0, 1000)(gen);
404 int pat_len = std::uniform_int_distribution<int>(0, 50)(gen);
405
406 // Occasional long pattern
407 if (iter % 100 == 0) pat_len = std::uniform_int_distribution<int>(100, 500)(gen);
408
409 std::string text(static_cast<size_t>(text_len), ' ');
410 for (int i = 0; i < text_len; ++i)
411 text[static_cast<size_t>(i)] = static_cast<char>(std::uniform_int_distribution<int>(0, alphabet_size - 1)(gen));
412
413 std::string pattern(static_cast<size_t>(pat_len), ' ');
414 for (int i = 0; i < pat_len; ++i)
415 pattern[static_cast<size_t>(i)] = static_cast<char>(std::uniform_int_distribution<int>(0, alphabet_size - 1)(gen));
416
417 // Occasional pattern == text
418 if (iter % 50 == 0 && text_len > 0) pattern = text;
419
420 const auto kmp = kmp_search(text, pattern);
421 const auto z = z_search(text, pattern);
422 const auto bmh = boyer_moore_horspool_search(text, pattern);
423 const auto rk = rabin_karp_search(text, pattern);
424
425 ASSERT_EQ(kmp.size(), z.size()) << "Mismatch at iteration " << iter << " alphabet_size " << alphabet_size;
426 ASSERT_EQ(kmp.size(), bmh.size()) << "Mismatch at iteration " << iter << " alphabet_size " << alphabet_size;
427 ASSERT_EQ(kmp.size(), rk.size()) << "Mismatch at iteration " << iter << " alphabet_size " << alphabet_size;
428
429 for (size_t i = 0; i < kmp.size(); ++i)
430 {
431 EXPECT_EQ(kmp[i], z[i]) << "Mismatch at iteration " << iter << ", index " << i;
432 EXPECT_EQ(kmp[i], bmh[i]) << "Mismatch at iteration " << iter << ", index " << i;
433 EXPECT_EQ(kmp[i], rk[i]) << "Mismatch at iteration " << iter << ", index " << i;
434 }
435 }
436}
Classical pattern searching algorithms over strings.
Simple dynamic array with automatic resizing and functional operations.
Definition tpl_array.H:139
constexpr size_t size() const noexcept
Return the number of elements stored in the stack.
Definition tpl_array.H:351
#define TEST(name)
void expect_array_eq(const Array< size_t > &a, std::initializer_list< size_t > expected)
Assert that an Aleph::Array contains exactly the elements in an initializer_list.
Main namespace for Aleph-w library functions.
Definition ah-arena.H:89
Array< size_t > z_search(const std::string_view text, const std::string_view pattern)
Find all occurrences of a pattern using the Z-algorithm.
Array< size_t > rabin_karp_search(const std::string_view text, const std::string_view pattern, const uint64_t base=911382323ull)
Find all occurrences using Rabin-Karp with rolling hash.
Array< size_t > z_algorithm(const std::string_view s)
Compute Z-array for a string.
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
std::decay_t< typename HeadC::Item_Type > T
Definition ah-zip.H:105
Array< size_t > kmp_search(const std::string_view text, const std::string_view pattern)
Find all occurrences of a pattern using KMP.
Array< size_t > kmp_prefix_function(const std::string_view pattern)
Prefix-function used by KMP.
Array< size_t > boyer_moore_horspool_search(const std::string_view text, const std::string_view pattern)
Find all occurrences using Boyer-Moore-Horspool.