Aleph-w 3.0
A C++ Library for Data Structures and Algorithms
Loading...
Searching...
No Matches
Suffix_Structures.H
Go to the documentation of this file.
1/*
2 Aleph_w
3
4 Data structures & Algorithms
5 version 2.0.0b
6 https://github.com/lrleon/Aleph-w
7
8 This file is part of Aleph-w library
9
10 Copyright (c) 2002-2026 Leandro Rabindranath Leon
11
12 Permission is hereby granted, free of charge, to any person obtaining a copy
13 of this software and associated documentation files (the "Software"), to deal
14 in the Software without restriction, including without limitation the rights
15 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 copies of the Software, and to permit persons to whom the Software is
17 furnished to do so, subject to the following conditions:
18
19 The above copyright notice and this permission notice shall be included in all
20 copies or substantial portions of the Software.
21
22 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 SOFTWARE.
29*/
30
31
49# ifndef SUFFIX_STRUCTURES_H
50# define SUFFIX_STRUCTURES_H
51
52# include <algorithm>
53# include <limits>
54# include <string>
55# include <string_view>
56# include <utility>
57# include <array>
58
59# include <ah-errors.H>
60# include <tpl_array.H>
61# include <tpl_sort_utils.H>
62
63namespace Aleph
64{
65 namespace suffix_structures_detail
66 {
71 inline Array<size_t> all_positions(const size_t n)
72 {
74 out.reserve(n + 1);
75 for (size_t i = 0; i <= n; ++i)
76 out.append(i);
77 return out;
78 }
79
80
84 inline void sort_matches(Array<size_t> & a)
85 {
86 introsort(a);
87 }
88 } // namespace suffix_structures_detail
89
90
103 inline Array<size_t> suffix_array(const std::string_view text)
104 {
105 const size_t n = text.size();
106 if (n == 0)
107 return {};
108
113
114 for (size_t i = 0; i < n; ++i)
115 {
116 sa[i] = i;
117 rank[i] = static_cast<unsigned char>(text[i]);
118 }
119
120 int max_rank = 255; // initial ranks are char codes in [0, 255]
121
122 for (size_t k = 1; ; )
123 {
124 auto key2 = [&](const size_t idx) -> int
125 {
126 return (idx + k < n) ? rank[idx + k] : -1;
127 };
128
129 auto key1 = [&](const size_t idx) -> int
130 {
131 return rank[idx];
132 };
133
136
137 auto less_idx = [&](const size_t a, const size_t b)
138 {
139 if (rank[a] != rank[b])
140 return rank[a] < rank[b];
141 const int ra = (a + k < n) ? rank[a + k] : -1;
142 const int rb = (b + k < n) ? rank[b + k] : -1;
143 return ra < rb;
144 };
145
146 tmp_rank[sa[0]] = 0;
147 for (size_t i = 1; i < n; ++i)
148 tmp_rank[sa[i]] =
149 tmp_rank[sa[i - 1]] + (less_idx(sa[i - 1], sa[i]) ? 1 : 0);
150
151 for (size_t i = 0; i < n; ++i)
152 rank[i] = tmp_rank[i];
153
154 max_rank = rank[sa[n - 1]];
155
156 if (max_rank == static_cast<int>(n - 1))
157 break;
158
159 if (k >= n)
160 break;
161
162 if (k > n / 2)
163 break;
164
165 k *= 2;
166 }
167
168 return sa;
169 }
170
171
190 inline Array<size_t> lcp_array_kasai(const std::string_view text,
191 const Array<size_t> & sa)
192 {
193 const size_t n = text.size();
194
195 ah_domain_error_if(sa.size() != n)
196 << "lcp_array_kasai(): suffix array size must match text size";
197
199 if (n == 0)
200 return lcp;
201
203 for (size_t i = 0; i < n; ++i)
204 lcp[i] = 0;
205
206 Array<size_t> rank(n, std::numeric_limits<size_t>::max());
207 for (size_t i = 0; i < n; ++i)
208 {
209 ah_out_of_range_error_if(sa[i] >= n)
210 << "lcp_array_kasai(): suffix array contains invalid index";
211 ah_out_of_range_error_if(rank[sa[i]] != std::numeric_limits<size_t>::max())
212 << "lcp_array_kasai(): suffix array contains duplicate index";
213 rank[sa[i]] = i;
214 }
215
216 size_t k = 0;
217 for (size_t i = 0; i < n; ++i)
218 {
219 const size_t r = rank[i];
220 if (r + 1 >= n)
221 {
222 k = 0;
223 continue;
224 }
225
226 const size_t j = sa[r + 1];
227 while (i + k < n and j + k < n and text[i + k] == text[j + k])
228 ++k;
229
230 lcp[r] = k;
231
232 if (k > 0)
233 --k;
234 }
235
236 return lcp;
237 }
238
239
253 {
254 public:
255 static constexpr size_t npos = std::numeric_limits<size_t>::max();
256
261 struct Node
262 {
263 size_t start = 0;
264 size_t end = 0;
265 size_t parent = npos;
268 };
269
270 private:
271 std::string text_;
272 size_t original_size_ = 0;
274
275 static char choose_terminal(const std::string_view s)
276 {
277 std::array<bool, 256> used;
278 used.fill(false);
279 for (const unsigned char c: s)
280 used[c] = true;
281
282 for (size_t i = 0; i < 256; ++i)
283 if (not used[i])
284 return static_cast<char>(i);
285
287 << "Naive_Suffix_Tree::choose_terminal(): all 256 byte values "
288 << "are present in the input; no terminal marker available";
289
290 return '\0'; // unreachable
291 }
292
293 size_t create_node(const size_t start,
294 const size_t end,
295 const size_t parent,
296 const size_t suffix_index)
297 {
298 Node n;
299 n.start = start;
300 n.end = end;
301 n.parent = parent;
302 n.suffix_index = suffix_index;
303 nodes_.append(n);
304 return nodes_.size() - 1;
305 }
306
307 size_t find_child_by_first_char(const size_t node,
308 const char c) const
309 {
310 const Array<size_t> & children = nodes_[node].children;
311 for (size_t i = 0; i < children.size(); ++i)
312 if (const size_t child = children[i]; text_[nodes_[child].start] == c)
313 return child;
314
315 return npos;
316 }
317
318 void replace_child(const size_t parent,
319 const size_t old_child,
320 const size_t new_child)
321 {
322 for (Array<size_t> & children = nodes_[parent].children; size_t & i : children)
323 if (i == old_child)
324 {
325 i = new_child;
326 return;
327 }
328 }
329
330 void insert_suffix(const size_t suffix_start)
331 {
332 size_t current = 0;
333 size_t pos = suffix_start;
334
335 while (pos < text_.size())
336 {
337 const size_t child = find_child_by_first_char(current, text_[pos]);
338 if (child == npos)
339 {
340 const size_t leaf = create_node(pos, text_.size(), current, suffix_start);
341 nodes_[current].children.append(leaf);
342 return;
343 }
344
345 const size_t edge_start = nodes_[child].start;
346 const size_t edge_end = nodes_[child].end;
347
348 size_t k = 0;
349 while (pos + k < text_.size()
351 and text_[pos + k] == text_[edge_start + k])
352 ++k;
353
354 if (edge_start + k == edge_end)
355 {
356 current = child;
357 pos += k;
358 continue;
359 }
360
361 const size_t split = create_node(edge_start,
362 edge_start + k,
363 current,
364 npos);
365
366 replace_child(current, child, split);
367
368 nodes_[child].start = edge_start + k;
369 nodes_[child].parent = split;
370 nodes_[split].children.append(child);
371
372 const size_t leaf = create_node(pos + k,
373 text_.size(),
374 split,
376 nodes_[split].children.append(leaf);
377
378 return;
379 }
380 }
381
382 void collect_leaf_suffixes(const size_t node,
383 Array<size_t> & out) const
384 {
385 Array<size_t> stack;
386 stack.append(node);
387
388 while (not stack.is_empty())
389 {
390 const size_t u = stack.remove_last();
391 if (nodes_[u].suffix_index != npos)
392 {
393 if (nodes_[u].suffix_index < original_size_)
394 out.append(nodes_[u].suffix_index);
395 continue;
396 }
397
398 for (const Array<size_t> & children = nodes_[u].children; size_t i: children)
399 stack.append(i);
400 }
401 }
402
403 public:
412 explicit Naive_Suffix_Tree(const std::string_view text = "")
413 {
414 build(text);
415 }
416
424 void build(const std::string_view text)
425 {
426 const char terminal = choose_terminal(text);
427 std::string new_text(text.begin(), text.end());
428 new_text.push_back(terminal);
429
430 original_size_ = text.size();
431 text_ = std::move(new_text);
432
433 nodes_.empty();
434 nodes_.append(Node{}); // root
435
436 for (size_t i = 0; i < text_.size(); ++i)
437 insert_suffix(i);
438 }
439
448 [[nodiscard]] bool contains(const std::string_view pattern) const
449 {
450 if (pattern.empty())
451 return true;
452
453 size_t current = 0;
454 size_t pos = 0;
455
456 while (pos < pattern.size())
457 {
458 const size_t child = find_child_by_first_char(current, pattern[pos]);
459 if (child == npos)
460 return false;
461
462 const size_t edge_start = nodes_[child].start;
463 const size_t edge_end = nodes_[child].end;
464
465 size_t k = 0;
466 while (pos + k < pattern.size()
468 and text_[edge_start + k] != text_.back() // Never match sentinel
469 and pattern[pos + k] == text_[edge_start + k])
470 ++k;
471
472 if (pos + k == pattern.size())
473 return true;
474
475 if (edge_start + k == edge_end)
476 {
477 current = child;
478 pos += k;
479 continue;
480 }
481
482 return false;
483 }
484
485 return true;
486 }
487
500 [[nodiscard]] Array<size_t> find_all(const std::string_view pattern) const
501 {
502 if (pattern.empty())
504
505 size_t current = 0;
506 size_t pos = 0;
507
508 while (pos < pattern.size())
509 {
510 const size_t child = find_child_by_first_char(current, pattern[pos]);
511 if (child == npos)
512 return {};
513
514 const size_t edge_start = nodes_[child].start;
515 const size_t edge_end = nodes_[child].end;
516
517 size_t k = 0;
518 while (pos + k < pattern.size()
520 and text_[edge_start + k] != text_.back() // Never match sentinel
521 and pattern[pos + k] == text_[edge_start + k])
522 ++k;
523
524 if (pos + k == pattern.size())
525 {
529 return matches;
530 }
531
532 if (edge_start + k == edge_end)
533 {
534 current = child;
535 pos += k;
536 continue;
537 }
538
539 return {};
540 }
541
542 return {};
543 }
544
552 {
553 return nodes_.size();
554 }
555
564 {
565 return original_size_;
566 }
567
575 {
576 return nodes_;
577 }
578 };
579
580
587 {
588 public:
590 struct State
591 {
592 std::array<int, 256> next;
593 int link = -1;
594 size_t len = 0;
595 size_t first_pos = 0;
596 bool terminal = false;
599 {
600 next.fill(-1);
601 }
602 };
603
604 private:
606 int last_ = 0;
607
612 {
613 int p = last_;
614 while (p != -1)
615 {
616 states_[static_cast<size_t>(p)].terminal = true;
617 p = states_[static_cast<size_t>(p)].link;
618 }
619 }
620
621 public:
628 {
629 clear();
630 }
631
637 void clear()
638 {
639 states_.empty();
640 states_.append(State{});
641 states_[0].link = -1;
642 states_[0].len = 0;
643 states_[0].first_pos = 0;
644 states_[0].terminal = false;
645 last_ = 0;
646 }
647
654 void extend(const char ch)
655 {
656 const auto c = static_cast<unsigned char>(ch);
657
658 const int cur = static_cast<int>(states_.size());
659 states_.append(State{});
660 states_[static_cast<size_t>(cur)].len = states_[static_cast<size_t>(last_)].len + 1;
661 states_[static_cast<size_t>(cur)].first_pos =
662 states_[static_cast<size_t>(cur)].len - 1;
663
664 int p = last_;
665 while (p != -1 and states_[static_cast<size_t>(p)].next[c] == -1)
666 {
667 states_[static_cast<size_t>(p)].next[c] = cur;
668 p = states_[static_cast<size_t>(p)].link;
669 }
670
671 if (p == -1)
672 states_[static_cast<size_t>(cur)].link = 0;
673 else
674 {
675 if (const int q = states_[static_cast<size_t>(p)].next[c]; states_[static_cast<size_t>(p)].len + 1 == states_[static_cast<size_t>(q)].len)
676 states_[static_cast<size_t>(cur)].link = q;
677 else
678 {
679 const int clone = static_cast<int>(states_.size());
680 // Copy q's state before append: append may reallocate states_,
681 // invalidating any reference into the old buffer.
682 const State clone_state = states_[static_cast<size_t>(q)];
683 states_.append(clone_state);
684 states_[static_cast<size_t>(clone)].len =
685 states_[static_cast<size_t>(p)].len + 1;
686 states_[static_cast<size_t>(clone)].terminal = false;
687 // Enforce invariant: only root (state 0) may have link == -1
688 if (states_[static_cast<size_t>(clone)].link == -1)
689 states_[static_cast<size_t>(clone)].link = 0;
690
691 while (p != -1 and states_[static_cast<size_t>(p)].next[c] == q)
692 {
693 states_[static_cast<size_t>(p)].next[c] = clone;
694 p = states_[static_cast<size_t>(p)].link;
695 }
696
697 states_[static_cast<size_t>(q)].link = clone;
698 states_[static_cast<size_t>(cur)].link = clone;
699 }
700 }
701
702 last_ = cur;
703 }
704
711 void build(const std::string_view text)
712 {
713 clear();
714 for (const char c: text)
715 extend(c);
716
718 }
719
727 [[nodiscard]] bool contains(const std::string_view pattern) const
728 {
729 int state = 0;
730 for (const unsigned char c: pattern)
731 {
732 state = states_[static_cast<size_t>(state)].next[c];
733 if (state == -1)
734 return false;
735 }
736
737 return true;
738 }
739
747 {
748 size_t total = 0;
749 for (size_t i = 1; i < states_.size(); ++i)
750 {
751 const State & s = states_[i];
752 const State & parent = states_[static_cast<size_t>(s.link)];
753 total += s.len - parent.len;
754 }
755
756 return total;
757 }
758
767 [[nodiscard]] std::string longest_common_substring(const std::string_view other) const
768 {
769 int state = 0;
770 size_t len = 0;
771
772 size_t best_len = 0;
773 size_t best_pos = 0;
774
775 for (size_t i = 0; i < other.size(); ++i)
776 {
777 const auto c = static_cast<unsigned char>(other[i]);
778
779 while (state != 0 and states_[static_cast<size_t>(state)].next[c] == -1)
780 {
781 state = states_[static_cast<size_t>(state)].link;
782 len = states_[static_cast<size_t>(state)].len;
783 }
784
785 if (states_[static_cast<size_t>(state)].next[c] != -1)
786 {
787 state = states_[static_cast<size_t>(state)].next[c];
788 ++len;
789 }
790 else
791 {
792 state = 0;
793 len = 0;
794 }
795
796 if (len > best_len)
797 {
798 best_len = len;
799 best_pos = i;
800 }
801 }
802
803 if (best_len == 0)
804 return "";
805
806 return std::string(other.substr(best_pos + 1 - best_len, best_len));
807 }
808
816 {
817 return states_.size();
818 }
819
827 {
828 return states_;
829 }
830 };
831
832
847 inline std::string longest_common_substring_sam(const std::string_view a,
848 const std::string_view b)
849 {
851 sam.build(a);
852 return sam.longest_common_substring(b);
853 }
854} // namespace Aleph
855
856# endif // SUFFIX_STRUCTURES_H
Exception handling system with formatted messages for Aleph-w.
#define ah_domain_error()
Throws std::domain_error unconditionally.
Definition ah-errors.H:554
#define ah_out_of_range_error_if(C)
Throws std::out_of_range if condition holds.
Definition ah-errors.H:579
#define ah_domain_error_if(C)
Throws std::domain_error if condition holds.
Definition ah-errors.H:522
Simple dynamic array with automatic resizing and functional operations.
Definition tpl_array.H:139
static Array create(size_t n)
Create an array with n logical elements.
Definition tpl_array.H:194
constexpr size_t size() const noexcept
Return the number of elements stored in the stack.
Definition tpl_array.H:351
constexpr bool is_empty() const noexcept
Checks if the container is empty.
Definition tpl_array.H:348
T & append(const T &data)
Append a copy of data
Definition tpl_array.H:245
void reserve(size_t cap)
Reserves cap cells into the array.
Definition tpl_array.H:315
Naive compressed suffix tree (didactic implementation).
Array< size_t > find_all(const std::string_view pattern) const
Return all occurrences of a pattern.
void replace_child(const size_t parent, const size_t old_child, const size_t new_child)
Naive_Suffix_Tree(const std::string_view text="")
Construct a naive suffix tree.
static constexpr size_t npos
size_t node_count() const noexcept
Return the number of nodes in the tree.
size_t create_node(const size_t start, const size_t end, const size_t parent, const size_t suffix_index)
size_t find_child_by_first_char(const size_t node, const char c) const
void build(const std::string_view text)
Build or rebuild the suffix tree for text.
size_t text_size() const noexcept
Return the text size (without terminal marker).
void insert_suffix(const size_t suffix_start)
void collect_leaf_suffixes(const size_t node, Array< size_t > &out) const
bool contains(const std::string_view pattern) const
Return true if the pattern appears in the text.
static char choose_terminal(const std::string_view s)
const Array< Node > & nodes() const noexcept
Return the internal node array for inspection/debugging.
Suffix automaton (SAM) over byte alphabet.
std::string longest_common_substring(const std::string_view other) const
Find the longest common substring between the built SAM text and other.
void build(const std::string_view text)
Build the SAM from an entire string.
bool contains(const std::string_view pattern) const
Return true if pattern is a substring of the built text.
const Array< State > & states() const noexcept
Access the internal states array for testing/inspection.
void mark_terminals()
Traverse suffix links from last and mark all states as terminal.
size_t distinct_substring_count() const
Count the number of distinct substrings in the built text.
Suffix_Automaton()
Construct an empty suffix automaton (SAM) with only the root state.
size_t state_count() const noexcept
Return the number of states in the SAM.
void clear()
Reset the automaton to its initial empty state (root only).
void extend(const char ch)
Extend the SAM with a single character.
Array< size_t > all_positions(const size_t n)
Generate an array of all positions from 0 to n.
void sort_matches(Array< size_t > &a)
Sort matches in place using introsort.
Main namespace for Aleph-w library functions.
Definition ah-arena.H:89
and
Check uniqueness with explicit hash + equality functors.
and std::convertible_to< std::invoke_result_t< const KeyFn &, size_t >, int > void counting_sort_indices(Array< size_t > &sa, Array< size_t > &tmp, const size_t n, const int min_key, const int max_key, const KeyFn &key_of)
Stable counting sort on an index array by integer keys.
Array< size_t > lcp_array_kasai(const std::string_view text, const Array< size_t > &sa)
Compute LCP array from text and suffix array using Kasai.
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
std::string longest_common_substring_sam(const std::string_view a, const std::string_view b)
Convenience function: LCS via suffix automaton.
std::vector< std::string > & split(const std::string &s, const char delim, std::vector< std::string > &elems)
Split a std::string by a single delimiter character.
void next()
Advance all underlying iterators (bounds-checked).
Definition ah-zip.H:171
void introsort(T *a, const long l, const long r, const Compare &cmp=Compare())
Sort an array using introsort (introspective sort).
Array< size_t > suffix_array(const std::string_view text)
Build suffix array with doubling algorithm.
size_t parent
Parent node index.
size_t start
Start index in text.
size_t end
End index in text.
Array< size_t > children
Indices of children nodes.
size_t suffix_index
Index of suffix starting at leaf.
bool terminal
true if state is terminal
std::array< int, 256 > next
Transitions for each byte.
size_t first_pos
First occurrence end position.
size_t len
Max length of substring in this state.
static int * k
gsl_rng * r
Dynamic array container with automatic resizing.
Comprehensive sorting algorithms and search utilities for Aleph-w.