49# ifndef SUFFIX_STRUCTURES_H
50# define SUFFIX_STRUCTURES_H
55# include <string_view>
65 namespace suffix_structures_detail
75 for (
size_t i = 0; i <= n; ++i)
114 for (
size_t i = 0; i < n; ++i)
117 rank[i] =
static_cast<unsigned char>(
text[i]);
122 for (
size_t k = 1; ; )
124 auto key2 = [&](
const size_t idx) ->
int
126 return (idx +
k < n) ? rank[idx +
k] : -1;
129 auto key1 = [&](
const size_t idx) ->
int
137 auto less_idx = [&](
const size_t a,
const size_t b)
139 if (rank[a] != rank[b])
140 return rank[a] < rank[b];
141 const int ra = (a +
k < n) ? rank[a +
k] : -1;
142 const int rb = (b +
k < n) ? rank[b +
k] : -1;
147 for (
size_t i = 1; i < n; ++i)
151 for (
size_t i = 0; i < n; ++i)
156 if (
max_rank ==
static_cast<int>(n - 1))
196 <<
"lcp_array_kasai(): suffix array size must match text size";
203 for (
size_t i = 0; i < n; ++i)
207 for (
size_t i = 0; i < n; ++i)
210 <<
"lcp_array_kasai(): suffix array contains invalid index";
212 <<
"lcp_array_kasai(): suffix array contains duplicate index";
217 for (
size_t i = 0; i < n; ++i)
219 const size_t r = rank[i];
226 const size_t j = sa[
r + 1];
255 static constexpr size_t npos = std::numeric_limits<size_t>::max();
277 std::array<bool, 256>
used;
279 for (
const unsigned char c: s)
282 for (
size_t i = 0; i < 256; ++i)
284 return static_cast<char>(i);
287 <<
"Naive_Suffix_Tree::choose_terminal(): all 256 byte values "
288 <<
"are present in the input; no terminal marker available";
296 const size_t suffix_index)
311 for (
size_t i = 0; i < children.
size(); ++i)
312 if (
const size_t child = children[i];
text_[
nodes_[child].start] == c)
335 while (pos <
text_.size())
341 nodes_[current].children.append(leaf);
349 while (pos +
k <
text_.size()
436 for (
size_t i = 0; i <
text_.size(); ++i)
456 while (pos < pattern.size())
466 while (pos +
k < pattern.size()
472 if (pos +
k == pattern.size())
508 while (pos < pattern.size())
518 while (pos +
k < pattern.size()
524 if (pos +
k == pattern.size())
616 states_[
static_cast<size_t>(p)].terminal =
true;
617 p =
states_[
static_cast<size_t>(p)].link;
656 const auto c =
static_cast<unsigned char>(
ch);
658 const int cur =
static_cast<int>(
states_.size());
665 while (p != -1
and states_[
static_cast<size_t>(p)].next[c] == -1)
668 p =
states_[
static_cast<size_t>(p)].link;
675 if (
const int q =
states_[
static_cast<size_t>(p)].next[c];
states_[
static_cast<size_t>(p)].len + 1 ==
states_[
static_cast<size_t>(q)].len)
679 const int clone =
static_cast<int>(
states_.size());
684 states_[
static_cast<size_t>(clone)].len =
686 states_[
static_cast<size_t>(clone)].terminal =
false;
688 if (
states_[
static_cast<size_t>(clone)].link == -1)
689 states_[
static_cast<size_t>(clone)].link = 0;
691 while (p != -1
and states_[
static_cast<size_t>(p)].next[c] == q)
694 p =
states_[
static_cast<size_t>(p)].link;
697 states_[
static_cast<size_t>(q)].link = clone;
698 states_[
static_cast<size_t>(
cur)].link = clone;
714 for (
const char c:
text)
730 for (
const unsigned char c: pattern)
732 state =
states_[
static_cast<size_t>(state)].
next[c];
749 for (
size_t i = 1; i <
states_.size(); ++i)
775 for (
size_t i = 0; i <
other.size(); ++i)
777 const auto c =
static_cast<unsigned char>(
other[i]);
779 while (state != 0
and states_[
static_cast<size_t>(state)].next[c] == -1)
781 state =
states_[
static_cast<size_t>(state)].link;
782 len =
states_[
static_cast<size_t>(state)].len;
785 if (
states_[
static_cast<size_t>(state)].next[c] != -1)
787 state =
states_[
static_cast<size_t>(state)].
next[c];
848 const std::string_view b)
852 return sam.longest_common_substring(b);
Exception handling system with formatted messages for Aleph-w.
#define ah_domain_error()
Throws std::domain_error unconditionally.
#define ah_out_of_range_error_if(C)
Throws std::out_of_range if condition holds.
#define ah_domain_error_if(C)
Throws std::domain_error if condition holds.
Simple dynamic array with automatic resizing and functional operations.
static Array create(size_t n)
Create an array with n logical elements.
constexpr size_t size() const noexcept
Return the number of elements stored in the stack.
constexpr bool is_empty() const noexcept
Checks if the container is empty.
T & append(const T &data)
Append a copy of data
void reserve(size_t cap)
Reserves cap cells into the array.
Naive compressed suffix tree (didactic implementation).
Array< size_t > find_all(const std::string_view pattern) const
Return all occurrences of a pattern.
void replace_child(const size_t parent, const size_t old_child, const size_t new_child)
Naive_Suffix_Tree(const std::string_view text="")
Construct a naive suffix tree.
static constexpr size_t npos
size_t node_count() const noexcept
Return the number of nodes in the tree.
size_t create_node(const size_t start, const size_t end, const size_t parent, const size_t suffix_index)
size_t find_child_by_first_char(const size_t node, const char c) const
void build(const std::string_view text)
Build or rebuild the suffix tree for text.
size_t text_size() const noexcept
Return the text size (without terminal marker).
void insert_suffix(const size_t suffix_start)
void collect_leaf_suffixes(const size_t node, Array< size_t > &out) const
bool contains(const std::string_view pattern) const
Return true if the pattern appears in the text.
static char choose_terminal(const std::string_view s)
const Array< Node > & nodes() const noexcept
Return the internal node array for inspection/debugging.
Suffix automaton (SAM) over byte alphabet.
std::string longest_common_substring(const std::string_view other) const
Find the longest common substring between the built SAM text and other.
void build(const std::string_view text)
Build the SAM from an entire string.
bool contains(const std::string_view pattern) const
Return true if pattern is a substring of the built text.
const Array< State > & states() const noexcept
Access the internal states array for testing/inspection.
void mark_terminals()
Traverse suffix links from last and mark all states as terminal.
size_t distinct_substring_count() const
Count the number of distinct substrings in the built text.
Suffix_Automaton()
Construct an empty suffix automaton (SAM) with only the root state.
size_t state_count() const noexcept
Return the number of states in the SAM.
void clear()
Reset the automaton to its initial empty state (root only).
void extend(const char ch)
Extend the SAM with a single character.
Array< size_t > all_positions(const size_t n)
Generate an array of all positions from 0 to n.
void sort_matches(Array< size_t > &a)
Sort matches in place using introsort.
Main namespace for Aleph-w library functions.
and
Check uniqueness with explicit hash + equality functors.
and std::convertible_to< std::invoke_result_t< const KeyFn &, size_t >, int > void counting_sort_indices(Array< size_t > &sa, Array< size_t > &tmp, const size_t n, const int min_key, const int max_key, const KeyFn &key_of)
Stable counting sort on an index array by integer keys.
Array< size_t > lcp_array_kasai(const std::string_view text, const Array< size_t > &sa)
Compute LCP array from text and suffix array using Kasai.
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
std::string longest_common_substring_sam(const std::string_view a, const std::string_view b)
Convenience function: LCS via suffix automaton.
std::vector< std::string > & split(const std::string &s, const char delim, std::vector< std::string > &elems)
Split a std::string by a single delimiter character.
void next()
Advance all underlying iterators (bounds-checked).
void introsort(T *a, const long l, const long r, const Compare &cmp=Compare())
Sort an array using introsort (introspective sort).
Array< size_t > suffix_array(const std::string_view text)
Build suffix array with doubling algorithm.
size_t parent
Parent node index.
size_t start
Start index in text.
size_t end
End index in text.
Array< size_t > children
Indices of children nodes.
size_t suffix_index
Index of suffix starting at leaf.
bool terminal
true if state is terminal
std::array< int, 256 > next
Transitions for each byte.
size_t first_pos
First occurrence end position.
size_t len
Max length of substring in this state.
Dynamic array container with automatic resizing.
Comprehensive sorting algorithms and search utilities for Aleph-w.