Aleph-w 3.0
A C++ Library for Data Structures and Algorithms
Loading...
Searching...
No Matches
aho_corasick_test.cc
Go to the documentation of this file.
1/*
2 Aleph_w
3
4 Data structures & Algorithms
5 version 2.0.0b
6 https://github.com/lrleon/Aleph-w
7
8 This file is part of Aleph-w library
9
10 Copyright (c) 2002-2026 Leandro Rabindranath Leon
11
12 Permission is hereby granted, free of charge, to any person obtaining a copy
13 of this software and associated documentation files (the "Software"), to deal
14 in the Software without restriction, including without limitation the rights
15 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 copies of the Software, and to permit persons to whom the Software is
17 furnished to do so, subject to the following conditions:
18
19 The above copyright notice and this permission notice shall be included in all
20 copies or substantial portions of the Software.
21
22 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 SOFTWARE.
29*/
30
31
37# include <gtest/gtest.h>
38
39# include <stdexcept>
40
41# include <Aho_Corasick.H>
42
43using namespace Aleph;
44
46{
48 const size_t id_he = ac.add_pattern("he");
49 const size_t id_she = ac.add_pattern("she");
50 const size_t id_his = ac.add_pattern("his");
51 const size_t id_hers = ac.add_pattern("hers");
52
53 ac.build();
54
55 const auto matches = ac.search("ahishers");
56
57 ASSERT_EQ(matches.size(), 4u);
62}
63
65{
67 const size_t id_a = ac.add_pattern("a");
68 const size_t id_aa = ac.add_pattern("aa");
69 const size_t id_aaa = ac.add_pattern("aaa");
70 ac.build();
71
72 const auto matches = ac.search("aaaa");
73
74 size_t cnt_a = 0;
75 size_t cnt_aa = 0;
76 size_t cnt_aaa = 0;
77
78 for (const auto & m : matches)
79 {
80 if (m.pattern_id == id_a)
81 ++cnt_a;
82 else if (m.pattern_id == id_aa)
83 ++cnt_aa;
84 else if (m.pattern_id == id_aaa)
85 ++cnt_aaa;
86 }
87
88 EXPECT_EQ(cnt_a, 4u);
89 EXPECT_EQ(cnt_aa, 3u);
90 EXPECT_EQ(cnt_aaa, 2u);
91 EXPECT_TRUE(ac.contains_any("xyz aaaa xyz"));
92}
93
95{
97 ac.add_pattern("abc");
98 EXPECT_THROW((void) ac.search("abc"), std::runtime_error);
99}
100
102{
104 const size_t id_he = ac.add_pattern("he");
105 ac.build();
106
107 const auto he_only = ac.search("he");
108 ASSERT_EQ(he_only.size(), 1u);
110
111 const size_t id_she = ac.add_pattern("she");
112 ac.build();
113
114 const auto she_matches = ac.search("she");
115 ASSERT_EQ(she_matches.size(), 2u);
118
119 // Rebuilding incrementally must not create spurious matches.
120 const auto he_after_rebuild = ac.search("he");
121 ASSERT_EQ(he_after_rebuild.size(), 1u);
123}
124
126{
128 EXPECT_THROW((void) ac.add_pattern(""), std::invalid_argument);
129}
130
132{
134 ac.add_pattern("test");
135 ac.build();
136 EXPECT_TRUE(ac.contains_any("unit test"));
137
138 ac.clear();
139 EXPECT_EQ(ac.pattern_count(), 0u);
140 EXPECT_FALSE(ac.is_built());
141
142 const size_t id = ac.add_pattern("abc");
143 ac.build();
144
145 const auto matches = ac.search("xxabcxx");
146 ASSERT_EQ(matches.size(), 1u);
148 EXPECT_EQ(ac.pattern(id), "abc");
149}
150
152{
154
155 // Insert 100 patterns: "p000" through "p099"
157 for (int i = 0; i < 100; ++i)
158 {
159 std::string pat = "p";
160 if (i < 10)
161 pat += "00";
162 else
163 pat += "0";
164 pat += std::to_string(i);
165 ids.append(ac.add_pattern(pat));
166 }
167
168 ac.build();
169
170 // Build text containing some patterns
171 std::string text = "xxp000xxp050xxp099xx";
172 const auto matches = ac.search(text);
173
174 EXPECT_GE(matches.size(), 3u);
175
176 // Verify each match points to a valid pattern
177 for (size_t i = 0; i < matches.size(); ++i)
178 {
179 const auto & m = matches[i];
180 EXPECT_LT(m.pattern_id, 100u);
181 const auto & pat = ac.pattern(m.pattern_id);
182 EXPECT_EQ(text.substr(m.position, pat.size()), pat);
183 }
184}
185
187{
189 ac.add_pattern("needle");
190 ac.add_pattern("hay");
191 ac.build();
192
193 // Build a 10K text with "stack" repeated and a few "needle" insertions
194 std::string text;
195 text.reserve(10000);
196 for (int i = 0; i < 2000; ++i)
197 text += "stack";
198 // Insert needles at known positions
199 text.replace(100, 6, "needle");
200 text.replace(5000, 6, "needle");
201
202 const auto matches = ac.search(text);
203 EXPECT_GE(matches.size(), 2u);
204
205 size_t needle_count = 0;
206 for (size_t i = 0; i < matches.size(); ++i)
207 if (ac.pattern(matches[i].pattern_id) == "needle")
208 ++needle_count;
210}
211
213{
215 const size_t id_a = ac.add_pattern("a");
216 const size_t id_b = ac.add_pattern("b");
217 ac.build();
218
219 const auto matches = ac.search("abba");
220 ASSERT_EQ(matches.size(), 4u);
225}
226
228{
230 std::string pat;
231 pat.push_back(static_cast<char>(0x00));
232 pat.push_back(static_cast<char>(0xFF));
234 ac.build();
235
236 std::string text = "abc";
237 text.push_back(static_cast<char>(0x00));
238 text.push_back(static_cast<char>(0xFF));
239 text += "xyz";
240
241 const auto matches = ac.search(text);
242 ASSERT_EQ(matches.size(), 1u);
243 EXPECT_EQ(matches[0].position, 3u);
244}
245
247{
249 ac.add_pattern("he");
250 ac.add_pattern("she");
251 ac.build();
252
253 const auto matches = ac.search("");
254 EXPECT_TRUE(matches.is_empty());
255}
256
258{
260 ac.add_pattern("he");
261 ac.add_pattern("she");
262 ac.build();
263
264 const auto matches = ac.search("xyzxyz");
265 EXPECT_TRUE(matches.is_empty());
266}
267
268// --- 3.3 Aho-Corasick with patterns that are prefixes of each other ---
269
271{
272 // Patterns {"a", "ab", "abc"} where each is a prefix of the next.
273 // Exercises output link propagation through the trie.
275 const size_t id_a = ac.add_pattern("a");
276 const size_t id_ab = ac.add_pattern("ab");
277 const size_t id_abc = ac.add_pattern("abc");
278 ac.build();
279
280 const auto matches = ac.search("abcabc");
281
282 // Count matches per pattern
283 size_t cnt_a = 0;
284 size_t cnt_ab = 0;
285 size_t cnt_abc = 0;
286
287 for (size_t i = 0; i < matches.size(); ++i)
288 {
289 if (matches[i].pattern_id == id_a)
290 ++cnt_a;
291 else if (matches[i].pattern_id == id_ab)
292 ++cnt_ab;
293 else if (matches[i].pattern_id == id_abc)
294 ++cnt_abc;
295 }
296
297 // "a" at positions 0, 3
298 EXPECT_EQ(cnt_a, 2u);
299 // "ab" at positions 0, 3
300 EXPECT_EQ(cnt_ab, 2u);
301 // "abc" at positions 0, 3
302 EXPECT_EQ(cnt_abc, 2u);
303
304 EXPECT_EQ(matches.size(), 6u);
305}
306
308{
309 // Patterns {"a", "ab", "abc", "bc", "c"} — mixture of prefixes and
310 // suffix-overlapping patterns to exercise output links.
312 const size_t id_a = ac.add_pattern("a");
313 const size_t id_ab = ac.add_pattern("ab");
314 const size_t id_abc = ac.add_pattern("abc");
315 const size_t id_bc = ac.add_pattern("bc");
316 const size_t id_c = ac.add_pattern("c");
317 ac.build();
318
319 const auto matches = ac.search("abc");
320
321 size_t cnt_a = 0;
322 size_t cnt_ab = 0;
323 size_t cnt_abc = 0;
324 size_t cnt_bc = 0;
325 size_t cnt_c = 0;
326
327 for (size_t i = 0; i < matches.size(); ++i)
328 {
329 if (matches[i].pattern_id == id_a) ++cnt_a;
330 else if (matches[i].pattern_id == id_ab) ++cnt_ab;
331 else if (matches[i].pattern_id == id_abc) ++cnt_abc;
332 else if (matches[i].pattern_id == id_bc) ++cnt_bc;
333 else if (matches[i].pattern_id == id_c) ++cnt_c;
334 }
335
336 EXPECT_EQ(cnt_a, 1u);
337 EXPECT_EQ(cnt_ab, 1u);
338 EXPECT_EQ(cnt_abc, 1u);
339 EXPECT_EQ(cnt_bc, 1u);
340 EXPECT_EQ(cnt_c, 1u);
341 EXPECT_EQ(matches.size(), 5u);
342}
343
345{
346 // Patterns "x", "xx", "xxx", ..., "xxxxxxxxxx" (1..10 x's)
349 for (int len = 1; len <= 10; ++len)
350 ids.append(ac.add_pattern(std::string(static_cast<size_t>(len), 'x')));
351 ac.build();
352
353 // Text is 15 x's
354 const std::string text(15, 'x');
355 const auto matches = ac.search(text);
356
357 // Count occurrences of each pattern length
359 for (size_t i = 0; i < 10; ++i)
360 counts[i] = 0;
361
362 for (size_t i = 0; i < matches.size(); ++i)
363 ++counts[matches[i].pattern_id];
364
365 // Pattern of length k appears (15 - k + 1) times
366 for (int k = 1; k <= 10; ++k)
367 EXPECT_EQ(counts[static_cast<size_t>(k - 1)],
368 static_cast<size_t>(15 - k + 1))
369 << "pattern length " << k;
370}
371
373{
374 // Verify exact match positions for prefix patterns
376 const size_t id_a = ac.add_pattern("a");
377 const size_t id_ab = ac.add_pattern("ab");
378 const size_t id_abc = ac.add_pattern("abc");
379 ac.build();
380
381 const auto matches = ac.search("xabcx");
382
386
387 for (size_t i = 0; i < matches.size(); ++i)
388 {
389 if (matches[i].pattern_id == id_a)
390 pos_a.append(matches[i].position);
391 else if (matches[i].pattern_id == id_ab)
392 pos_ab.append(matches[i].position);
393 else if (matches[i].pattern_id == id_abc)
394 pos_abc.append(matches[i].position);
395 }
396
397 ASSERT_EQ(pos_a.size(), 1u);
398 EXPECT_EQ(pos_a[0], 1u);
399
400 ASSERT_EQ(pos_ab.size(), 1u);
401 EXPECT_EQ(pos_ab[0], 1u);
402
403 ASSERT_EQ(pos_abc.size(), 1u);
404 EXPECT_EQ(pos_abc[0], 1u);
405}
406
408{
410 const int num_patterns = 1000;
412
413 // Deterministic pattern generation
414 for (int i = 0; i < num_patterns; ++i)
415 {
416 std::string p = "pat" + std::to_string(i);
417 patterns.append(p);
418 ac.add_pattern(p);
419 }
420
421 ac.build();
422
423 // Deterministic text generation
424 std::string text;
425 text.reserve(200000);
426 for (int i = 0; i < 5000; ++i)
427 {
428 text += "some_random_filler_";
429 if (i % 10 == 0)
430 text += patterns[i % num_patterns];
431 }
432
433 const auto matches = ac.search(text);
434 EXPECT_GT(matches.size(), 0u);
435
436 for (const auto & m : matches)
437 {
438 const auto & pat = ac.pattern(m.pattern_id);
439 EXPECT_EQ(text.substr(m.position, pat.size()), pat);
440 }
441}
Multi-pattern string matching with the Aho-Corasick automaton.
Aho-Corasick multi-pattern automaton.
size_t add_pattern(std::string pattern)
Add one pattern to the automaton.
Simple dynamic array with automatic resizing and functional operations.
Definition tpl_array.H:139
static Array create(size_t n)
Create an array with n logical elements.
Definition tpl_array.H:194
T & append(const T &data)
Append a copy of data
Definition tpl_array.H:245
constexpr size_t size() const noexcept
Returns the number of entries in the table.
Definition hashDry.H:619
#define TEST(name)
Main namespace for Aleph-w library functions.
Definition ah-arena.H:89
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
One match occurrence in the searched text.
FooMap m(5, fst_unit_pair_hash, snd_unit_pair_hash)
static int * k