Aleph-w 3.0
A C++ Library for Data Structures and Algorithms
Loading...
Searching...
No Matches
Aho_Corasick.H
Go to the documentation of this file.
1/*
2 Aleph_w
3
4 Data structures & Algorithms
5 version 2.0.0b
6 https://github.com/lrleon/Aleph-w
7
8 This file is part of Aleph-w library
9
10 Copyright (c) 2002-2026 Leandro Rabindranath Leon
11
12 Permission is hereby granted, free of charge, to any person obtaining a copy
13 of this software and associated documentation files (the "Software"), to deal
14 in the Software without restriction, including without limitation the rights
15 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 copies of the Software, and to permit persons to whom the Software is
17 furnished to do so, subject to the following conditions:
18
19 The above copyright notice and this permission notice shall be included in all
20 copies or substantial portions of the Software.
21
22 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 SOFTWARE.
29*/
30
31
44# ifndef AHO_CORASICK_H
45# define AHO_CORASICK_H
46
47# include <string>
48# include <string_view>
49# include <utility>
50# include <array>
51
52# include <ah-errors.H>
53# include <tpl_array.H>
54
55namespace Aleph
56{
63 {
64 public:
70 struct Match
71 {
72 size_t position = 0;
73 size_t pattern_id = 0;
79 bool operator==(const Match & other) const noexcept
80 {
81 return position == other.position and pattern_id == other.pattern_id;
82 }
83 };
84
85 private:
86 struct Node
87 {
88 std::array<int, 256> next;
89 int fail = 0;
90 int out_link = -1;
92
94 {
95 next.fill(-1);
96 }
97 };
98
101 bool built_ = false;
102
103 size_t create_node()
104 {
105 nodes_.append(Node{});
106 return nodes_.size() - 1;
107 }
108
109 public:
112 {
113 nodes_.append(Node{});
114 }
115
117 void clear()
118 {
119 nodes_.empty();
121 nodes_.append(Node{});
122 built_ = false;
123 }
124
136 size_t add_pattern(std::string pattern)
137 {
139 << "Aho_Corasick::add_pattern(): pattern cannot be empty";
140
141 size_t state = 0;
142 for (const unsigned char c: pattern)
143 {
144 int next_state = nodes_[state].next[c];
145 if (next_state == -1)
146 {
147 const size_t new_state = create_node();
148 nodes_[state].next[c] = static_cast<int>(new_state);
149 next_state = static_cast<int>(new_state);
150 }
151
152 state = static_cast<size_t>(next_state);
153 }
154
155 const size_t id = patterns_.size();
156 patterns_.append(std::move(pattern));
157 nodes_[state].output.append(id);
158
159 built_ = false;
160 return id;
161 }
162
175 void build()
176 {
177 Array<size_t> queue;
178 size_t head = 0;
179
180 nodes_[0].fail = 0;
181 nodes_[0].out_link = -1;
182
183 for (size_t c = 0; c < 256; ++c)
184 if (const int next_state = nodes_[0].next[c]; next_state != -1)
185 {
186 const auto child = static_cast<size_t>(next_state);
187 nodes_[child].fail = 0;
188 nodes_[child].out_link = -1;
189 queue.append(child);
190 }
191
192 while (head < queue.size())
193 {
194 const size_t state = queue[head++];
195
196 for (size_t c = 0; c < 256; ++c)
197 {
198 const int next_state = nodes_[state].next[c];
199 if (next_state == -1)
200 continue;
201
202 const auto child = static_cast<size_t>(next_state);
203 auto fail_to = static_cast<size_t>(nodes_[state].fail);
204
205 while (fail_to != 0 and nodes_[fail_to].next[c] == -1)
206 fail_to = static_cast<size_t>(nodes_[fail_to].fail);
207
208 if (const int candidate = nodes_[fail_to].next[c]; candidate != -1)
209 fail_to = static_cast<size_t>(candidate);
210
211 nodes_[child].fail = static_cast<int>(fail_to);
212 nodes_[child].out_link =
213 nodes_[fail_to].output.is_empty() ?
214 nodes_[fail_to].out_link : static_cast<int>(fail_to);
215
216 queue.append(child);
217 }
218 }
219
220 built_ = true;
221 }
222
234 [[nodiscard]] Array<Match> search(const std::string_view text) const
235 {
237 << "Aho_Corasick::search(): call build() before search()";
238
240 size_t state = 0;
241
242 for (size_t i = 0; i < text.size(); ++i)
243 {
244 const unsigned char c = static_cast<unsigned char>(text[i]);
245
246 while (state != 0 and nodes_[state].next[c] == -1)
247 state = static_cast<size_t>(nodes_[state].fail);
248
249 if (const int next_state = nodes_[state].next[c]; next_state != -1)
250 state = static_cast<size_t>(next_state);
251 else
252 state = 0;
253
254 size_t out_state = state;
255 while (true)
256 {
257 const Array<size_t> & output = nodes_[out_state].output;
258 for (size_t k = 0; k < output.size(); ++k)
259 {
260 const size_t id = output[k];
261 matches.append(Match{i + 1 - patterns_[id].size(), id});
262 }
263
264 const int next_out = nodes_[out_state].out_link;
265 if (next_out == -1)
266 break;
267 out_state = static_cast<size_t>(next_out);
268 }
269 }
270
271 return matches;
272 }
273
282 [[nodiscard]] bool contains_any(const std::string_view text) const
283 {
285 << "Aho_Corasick::contains_any(): call build() before search()";
286
287 size_t state = 0;
288 for (size_t i = 0; i < text.size(); ++i)
289 {
290 const unsigned char c = static_cast<unsigned char>(text[i]);
291
292 while (state != 0 and nodes_[state].next[c] == -1)
293 state = static_cast<size_t>(nodes_[state].fail);
294
295 if (const int next_state = nodes_[state].next[c]; next_state != -1)
296 state = static_cast<size_t>(next_state);
297 else
298 state = 0;
299
300 if (not nodes_[state].output.is_empty())
301 return true;
302
303 if (nodes_[state].out_link != -1)
304 return true;
305 }
306
307 return false;
308 }
309
315 {
316 return patterns_.size();
317 }
318
324 {
325 return built_;
326 }
327
336 [[nodiscard]] const std::string &pattern(const size_t id) const
337 {
339 << "Aho_Corasick::pattern(): id out of range";
340 return patterns_[id];
341 }
342 };
343} // namespace Aleph
344
345# endif // AHO_CORASICK_H
Exception handling system with formatted messages for Aleph-w.
#define ah_runtime_error_unless(C)
Throws std::runtime_error if condition does NOT hold.
Definition ah-errors.H:250
#define ah_out_of_range_error_if(C)
Throws std::out_of_range if condition holds.
Definition ah-errors.H:579
#define ah_invalid_argument_if(C)
Throws std::invalid_argument if condition holds.
Definition ah-errors.H:639
Aho-Corasick multi-pattern automaton.
const std::string & pattern(const size_t id) const
Return the pattern text by id.
bool contains_any(const std::string_view text) const
Return true if at least one pattern appears in the text.
void clear()
Clear all patterns and reset to an empty automaton.
size_t pattern_count() const noexcept
Return the number of inserted patterns.
bool is_built() const noexcept
Return whether build() has been executed since last insertion.
Aho_Corasick()
Build an empty automaton (root only).
void build()
Build failure links and transition completion.
Array< Node > nodes_
size_t add_pattern(std::string pattern)
Add one pattern to the automaton.
Array< Match > search(const std::string_view text) const
Search all patterns in a text.
Array< std::string > patterns_
Simple dynamic array with automatic resizing and functional operations.
Definition tpl_array.H:139
constexpr size_t size() const noexcept
Return the number of elements stored in the stack.
Definition tpl_array.H:351
void empty() noexcept
Empties the container.
Definition tpl_array.H:336
constexpr bool is_empty() const noexcept
Checks if the container is empty.
Definition tpl_array.H:348
T & append(const T &data)
Append a copy of data
Definition tpl_array.H:245
Main namespace for Aleph-w library functions.
Definition ah-arena.H:89
and
Check uniqueness with explicit hash + equality functors.
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
void next()
Advance all underlying iterators (bounds-checked).
Definition ah-zip.H:171
One match occurrence in the searched text.
bool operator==(const Match &other) const noexcept
Equality operator for Match.
size_t pattern_id
Insertion-order ID of the pattern.
size_t position
0-based start index of the match.
std::array< int, 256 > next
static int * k
Dynamic array container with automatic resizing.
ofstream output
Definition writeHeap.C:215