Aleph-w 3.0
A C++ Library for Data Structures and Algorithms
Loading...
Searching...
No Matches
suffix_structures_test.cc
Go to the documentation of this file.
1/*
2 Aleph_w
3
4 Data structures & Algorithms
5 version 2.0.0b
6 https://github.com/lrleon/Aleph-w
7
8 This file is part of Aleph-w library
9
10 Copyright (c) 2002-2026 Leandro Rabindranath Leon
11
12 Permission is hereby granted, free of charge, to any person obtaining a copy
13 of this software and associated documentation files (the "Software"), to deal
14 in the Software without restriction, including without limitation the rights
15 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 copies of the Software, and to permit persons to whom the Software is
17 furnished to do so, subject to the following conditions:
18
19 The above copyright notice and this permission notice shall be included in all
20 copies or substantial portions of the Software.
21
22 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 SOFTWARE.
29*/
30
31
37# include <gtest/gtest.h>
38
39# include <stdexcept>
40# include <chrono>
41
42# include <Suffix_Structures.H>
43# include "test_helpers.H"
44
45using namespace Aleph;
46using namespace Aleph::test_helpers;
47
49{
50 const auto sa = suffix_array("banana");
51 expect_array_eq(sa, {5, 3, 1, 0, 4, 2});
52}
53
55{
56 const auto sa = suffix_array("banana");
57 const auto lcp = lcp_array_kasai("banana", sa);
58 expect_array_eq(lcp, {1, 3, 0, 0, 2, 0});
59}
60
62{
64 bad.append(0);
65 EXPECT_THROW((void) lcp_array_kasai("banana", bad), std::domain_error);
66}
67
69{
70 // Suffix array for "abc" is {0, 1, 2}.
71 // We provide a malformed SA {0, 1, 1} which has a duplicate.
73 EXPECT_THROW((void) lcp_array_kasai("abc", sa), std::out_of_range);
74}
75
77{
78 // Suffix array for "abc" is {0, 1, 2}.
79 // We provide a malformed SA {0, 1, 3} where 3 is out of range.
81 EXPECT_THROW((void) lcp_array_kasai("abc", sa), std::out_of_range);
82}
83
85{
86 Naive_Suffix_Tree st("banana");
87
88 EXPECT_EQ(st.text_size(), 6u);
89 EXPECT_EQ(st.node_count(), 11u);
90 EXPECT_EQ(st.nodes().size(), st.node_count());
91
92 EXPECT_TRUE(st.contains("ana"));
93 EXPECT_TRUE(st.contains("nana"));
94 EXPECT_FALSE(st.contains("apple"));
95
96 expect_array_eq(st.find_all("ana"), {1, 3});
97 expect_array_eq(st.find_all("nana"), {2});
98 EXPECT_TRUE(st.find_all("apple").is_empty());
99 expect_array_eq(st.find_all(""), {0, 1, 2, 3, 4, 5, 6});
100}
101
103{
104 // Construct a string containing all 256 byte values
105 std::string full_alphabet;
106 for (int i = 0; i < 256; ++i)
107 full_alphabet.push_back(static_cast<char>(i));
108
109 // Should throw domain_error because no unique sentinel is available
110 EXPECT_THROW(Naive_Suffix_Tree st(full_alphabet), std::domain_error);
111}
112
114{
116 sam.build("ababa");
117
118 EXPECT_TRUE(sam.contains("aba"));
119 EXPECT_TRUE(sam.contains("bab"));
120 EXPECT_FALSE(sam.contains("abc"));
121
122 EXPECT_EQ(sam.distinct_substring_count(), 9u);
123 EXPECT_LE(sam.state_count(), 2u * 5u);
124}
125
127{
129 sam.build("xabxac");
130
131 EXPECT_EQ(sam.longest_common_substring("abcabxabcd"), "abxa");
132 EXPECT_EQ(longest_common_substring_sam("xabxac", "abcabxabcd"), "abxa");
133}
134
136{
137 const std::string text = "mississippi";
138 const auto sa = suffix_array(text);
139
140 ASSERT_EQ(sa.size(), text.size());
141
142 // Verify lexicographic ordering of suffixes
143 for (size_t i = 1; i < sa.size(); ++i)
144 {
145 const std::string_view prev_suffix(text.data() + sa[i - 1],
146 text.size() - sa[i - 1]);
147 const std::string_view curr_suffix(text.data() + sa[i],
148 text.size() - sa[i]);
150 }
151}
152
154{
155 const auto sa = suffix_array("a");
156 expect_array_eq(sa, {0});
157}
158
160{
161 const auto sa = suffix_array("");
162 EXPECT_TRUE(sa.is_empty());
163}
164
166{
167 Naive_Suffix_Tree st("abcabcabc");
168
169 EXPECT_TRUE(st.contains("abc"));
170 EXPECT_TRUE(st.contains("cab"));
171 EXPECT_TRUE(st.contains("bcab"));
172 EXPECT_FALSE(st.contains("abcd"));
173
174 const auto matches = st.find_all("abc");
175 ASSERT_EQ(matches.size(), 3u);
176 expect_array_eq(matches, {0, 3, 6});
177}
178
180{
181 Naive_Suffix_Tree st("x");
182 EXPECT_TRUE(st.contains("x"));
183 EXPECT_FALSE(st.contains("y"));
184 expect_array_eq(st.find_all("x"), {0});
185}
186
188{
190 sam.build("");
191 EXPECT_EQ(sam.distinct_substring_count(), 0u);
192 EXPECT_TRUE(sam.contains(""));
193 EXPECT_FALSE(sam.contains("a"));
194
195 sam.build("z");
196 EXPECT_EQ(sam.distinct_substring_count(), 1u);
197 EXPECT_TRUE(sam.contains("z"));
198 EXPECT_FALSE(sam.contains("a"));
199}
200
205
207{
208 // Build a 5000-char string and verify SA ordering
209 std::string text;
210 text.reserve(5000);
211 for (int i = 0; i < 5000; ++i)
212 text.push_back('a' + (i * 7 + 3) % 26);
213
214 const auto start = std::chrono::steady_clock::now();
215 const auto sa = suffix_array(text);
216 const auto end = std::chrono::steady_clock::now();
217 const auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
218
219 // Performance regression check: O(n log n) construction for n=5000 is fast.
220 // We only fail the test if ENABLE_PERF_TESTS is set, otherwise we just log.
221 if (std::getenv("ENABLE_PERF_TESTS"))
222 EXPECT_LT(elapsed, 500) << "Performance regression: suffix_array(5000) took " << elapsed << "ms";
223 else if (elapsed >= 500)
224 std::cout << "[ PERF ] Warning: suffix_array(5000) took " << elapsed << "ms\n";
225
226 ASSERT_EQ(sa.size(), text.size());
227
228 for (size_t i = 1; i < sa.size(); ++i)
229 {
230 const std::string_view prev_suffix(text.data() + sa[i - 1],
231 text.size() - sa[i - 1]);
232 const std::string_view curr_suffix(text.data() + sa[i],
233 text.size() - sa[i]);
235 }
236}
237
239{
240 std::string text;
241 for (int i = 0; i < 256; ++i)
242 text.push_back(static_cast<char>(i));
243
245 sam.build(text);
246
247 // Every single byte should be a substring
248 for (int i = 0; i < 256; ++i)
249 {
250 std::string pat(1, static_cast<char>(i));
251 EXPECT_TRUE(sam.contains(pat));
252 }
253
254 EXPECT_FALSE(sam.contains(text + "x"));
255}
256
258{
259 // Regression: cloning a state could produce a non-root state with
260 // link == -1, causing out-of-range access in distinct_substring_count()
261 // and longest_common_substring().
263 sam.build("bcaaaccaaacccaacca");
264
265 // Verify invariant: only state 0 may have link == -1
266 const auto & states = sam.states();
267 for (size_t i = 1; i < states.size(); ++i)
268 EXPECT_GE(states[i].link, 0) << "state " << i << " has link == -1";
269
270 // These must not crash
271 EXPECT_GT(sam.distinct_substring_count(), 0u);
272 EXPECT_EQ(sam.longest_common_substring("aacca"), "aacca");
273 EXPECT_TRUE(sam.contains("caaaccaaaccc"));
274}
Suffix structures: suffix array/LCP, suffix tree, suffix automaton.
Simple dynamic array with automatic resizing and functional operations.
Definition tpl_array.H:139
constexpr bool is_empty() const noexcept
Checks if the container is empty.
Definition tpl_array.H:348
T & append(const T &data)
Append a copy of data
Definition tpl_array.H:245
Naive compressed suffix tree (didactic implementation).
Array< size_t > find_all(const std::string_view pattern) const
Return all occurrences of a pattern.
size_t node_count() const noexcept
Return the number of nodes in the tree.
size_t text_size() const noexcept
Return the text size (without terminal marker).
bool contains(const std::string_view pattern) const
Return true if the pattern appears in the text.
const Array< Node > & nodes() const noexcept
Return the internal node array for inspection/debugging.
Suffix automaton (SAM) over byte alphabet.
void build(const std::string_view text)
Build the SAM from an entire string.
#define TEST(name)
void expect_array_eq(const Array< size_t > &a, std::initializer_list< size_t > expected)
Assert that an Aleph::Array contains exactly the elements in an initializer_list.
Main namespace for Aleph-w library functions.
Definition ah-arena.H:89
Array< size_t > lcp_array_kasai(const std::string_view text, const Array< size_t > &sa)
Compute LCP array from text and suffix array using Kasai.
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
std::string longest_common_substring_sam(const std::string_view a, const std::string_view b)
Convenience function: LCS via suffix automaton.
Array< size_t > suffix_array(const std::string_view text)
Build suffix array with doubling algorithm.
Shared helper functions for Aleph-w unit tests.