46# if !defined(__SIZEOF_INT128__)
47# error "ntt.H requires compiler support for __uint128_t"
57# include <string_view>
58# include <type_traits>
61# if (defined(__GNUC__) or defined(__clang__)) \
62 and (defined(__x86_64__) or defined(__i386__) \
63 or defined(_M_X64) or defined(_M_IX86))
64# include <immintrin.h>
65# define ALEPH_NTT_HAS_X86_AVX2_DISPATCH 1
66# define ALEPH_NTT_AVX2_TARGET __attribute__((target("avx2")))
68# define ALEPH_NTT_HAS_X86_AVX2_DISPATCH 0
71# if (defined(__GNUC__) or defined(__clang__)) \
72 and (defined(__aarch64__) or defined(_M_ARM64))
74# define ALEPH_NTT_HAS_ARM_NEON_DISPATCH 1
76# define ALEPH_NTT_HAS_ARM_NEON_DISPATCH 0
79# if ALEPH_NTT_HAS_ARM_NEON_DISPATCH and defined(__linux__)
81# include <asm/hwcap.h>
113 template <u
int64_t MOD = 998244353ULL, u
int64_t ROOT = 3ULL>
116 static_assert(
MOD > 1,
"NTT requires MOD > 1");
117 static_assert((
MOD & 1ULL) == 1ULL,
"NTT requires an odd modulus");
118 static_assert(
ROOT > 0
and ROOT <
MOD,
"NTT root must lie in (0, MOD)");
149 return n != 0
and (n & (n - 1)) == 0;
200 while ((value & 1ULL) == 0)
218 return order != 0
and (
MOD - 1) % order == 0;
227 if (n >
static_cast<size_t>((
MOD - 1) / 2))
234 if (n > std::numeric_limits<size_t>::max() / 2)
241 if (
conv_size > std::numeric_limits<size_t>::max() / 2)
251 const char *
const ctx)
254 << ctx <<
": root order must be positive";
256 << ctx <<
": order " << order
257 <<
" does not divide MOD - 1 (" << (
MOD - 1) <<
")";
268 const char *
const ctx)
271 << ctx <<
": size must be positive";
273 << ctx <<
": size " << n
274 <<
" is not supported by MOD " <<
MOD
275 <<
" (power-of-two sizes require n | 2^k <= "
277 <<
"; Bluestein sizes require 2*n | (MOD - 1) and an internal "
283 const char *
const ctx)
292 << ctx <<
": next power of two overflows size_t for requested size "
305 for (
size_t i = 0; i < n; ++i)
308 for (
size_t i = 0; i <
input.size(); ++i)
319 <<
"NTT::prefix_copy: length " << length
320 <<
" exceeds input size " <<
input.
size();
324 for (
size_t i = 0; i < length; ++i)
333 [[
nodiscard]]
static constexpr const char *
349 [[
nodiscard]]
static constexpr const char *
369 if (
const char *
mode = std::getenv(
"ALEPH_NTT_SIMD");
372 const std::string_view value(
mode);
373 if (value ==
"scalar")
399# if ALEPH_NTT_HAS_X86_AVX2_DISPATCH
400 static const bool available = []()
noexcept
415# if ALEPH_NTT_HAS_ARM_NEON_DISPATCH
416# if defined(__linux__) and defined(HWCAP_ASIMD)
417 static const bool available = []()
noexcept
488 template <
typename F>
493 const size_t chunk_size)
505 for (
size_t i = 0; i <
count; ++i)
516 const size_t end)
const
518 for (
size_t j = begin; j < end; ++j)
531# if !ALEPH_NTT_HAS_X86_AVX2_DISPATCH
538 if (pool !=
nullptr and pool->num_threads() > 1)
548# if !ALEPH_NTT_HAS_ARM_NEON_DISPATCH
555 if (pool !=
nullptr and pool->num_threads() > 1)
562# if ALEPH_NTT_HAS_X86_AVX2_DISPATCH
568 0x8000000000000000ULL,
569 0x8000000000000000ULL,
570 0x8000000000000000ULL,
571 0x8000000000000000ULL
621 const size_t half =
static_cast<size_t>(1) <<
stage;
622 const size_t len =
half << 1;
623 const size_t blocks =
n_ / len;
626 for (
size_t block = 0; block < blocks; ++block)
628 const size_t base = block * len;
630 for (; j + 4 <=
half; j += 4)
641# if ALEPH_NTT_HAS_ARM_NEON_DISPATCH
675 const size_t half =
static_cast<size_t>(1) <<
stage;
676 const size_t len =
half << 1;
677 const size_t blocks =
n_ / len;
680 for (
size_t block = 0; block < blocks; ++block)
682 const size_t base = block * len;
684 for (; j + 2 <=
half; j += 2)
700 for (
size_t i = 1, j = 0; i <
n_; ++i)
702 size_t bit =
n_ >> 1;
703 for (; j &
bit;
bit >>= 1)
722 const size_t half =
static_cast<size_t>(1) <<
stage;
723 const size_t len =
half << 1;
734 for (
size_t j = 0; j <
half; ++j)
749 for (
size_t value =
n_; value > 1; value >>= 1)
763 <<
"NTT::Plan: size " <<
n_
764 <<
" is too large for Bluestein convolution sizing";
770 <<
" exceeds the power-of-two capacity of MOD " <<
MOD;
781 for (
size_t i = 0; i <
n_; ++i)
799 for (
size_t i = 1; i <
n_; ++i)
817 for (
size_t i = 0; i <
n_; ++i)
825 const size_t chunk_size)
const
827 auto lift_one = [&a](
const size_t i)
837 const size_t chunk_size)
const
839 auto scale_one = [
this, &a](
const size_t i)
849 const size_t chunk_size)
const
862 const size_t chunk_size)
const
866 const size_t half =
static_cast<size_t>(1) <<
stage;
867 const size_t len =
half << 1;
868 const size_t blocks =
n_ / len;
874 const size_t base = block * len;
887 const size_t chunk_size)
const
892# if ALEPH_NTT_HAS_X86_AVX2_DISPATCH
900# if ALEPH_NTT_HAS_ARM_NEON_DISPATCH
915 const size_t chunk_size)
const
918 <<
"NTT::Plan::apply_bluestein_transform: missing internal plan";
943 auto pointwise = [&work, &kernel](
const size_t i)
970 const size_t chunk_size)
const
973 <<
"NTT::Plan::transform: input size " << a.
size()
974 <<
" does not match plan size " <<
n_;
997 <<
"NTT::Plan::apply_transform: Bluestein path expects standard "
998 <<
"input representation";
1000 <<
"NTT::Plan::apply_transform: Bluestein path returns standard "
1001 <<
"output representation";
1011 const size_t chunk_size)
const
1017 std::numeric_limits<size_t>::max()
1019 <<
"NTT::Plan::multiply: product size exceeds size_t capacity";
1023 <<
"NTT::Plan::multiply: product size " <<
required
1024 <<
" exceeds plan size " <<
n_;
1125 const bool invert =
false)
const
1159 const size_t chunk_size = 0)
const
1179 const bool invert =
false,
1180 const size_t chunk_size = 0)
const
1201 const size_t chunk_size = 0)
const
1215 for (
size_t i = 0; i <
batch.size(); ++i)
1218 <<
"NTT::Plan::transform_batch: batch item " << i
1219 <<
" has size " <<
batch[i].size()
1220 <<
" but plan size is " <<
n_;
1243 const size_t chunk_size = 0)
const
1245 for (
size_t i = 0; i <
batch.size(); ++i)
1247 <<
"NTT::Plan::ptransform_batch: batch item " << i
1248 <<
" has size " <<
batch[i].size()
1249 <<
" but plan size is " <<
n_;
1251 if (
batch.is_empty())
1256 for (
size_t i = 0; i <
batch.size(); ++i)
1282 const bool invert =
false)
const
1301 const bool invert =
false,
1302 const size_t chunk_size = 0)
const
1323 for (
size_t i = 0; i <
input.size(); ++i)
1333 for (
size_t i = 0; i < n; ++i)
1343 const size_t limit = std::min(
input.size(), n);
1344 for (
size_t i = 0; i <
limit; ++i)
1355 for (
size_t i = 0; i <
input.size()
and i < n; ++i)
1365 for (
size_t i =
input.size(); i > 0; --i)
1376 for (
size_t i = 0; i < n; ++i)
1391 for (
size_t i = 0; i < n; ++i)
1404 const size_t n = std::max(
lhs.size(),
rhs.size());
1414 const size_t n = std::max(
lhs.size(),
rhs.size());
1427 for (
size_t i = 0; i <
input.size()
and i < n; ++i)
1451 if (coeffs.
size() <= 1)
1455 for (
size_t i = 1; i < coeffs.
size(); ++i)
1467 for (
size_t i = 0; i < coeffs.
size(); ++i)
1477 const char *
const ctx)
1485 << ctx <<
": constant term " << a
1486 <<
" is not a quadratic residue modulo " <<
MOD;
1496 while ((q & 1ULL) == 0)
1522 << ctx <<
": Tonelli-Shanks failed to converge";
1525 for (
size_t j = 0; j + i + 1 <
m; ++j)
1535 return std::min(
r,
r == 0 ? 0 :
MOD -
r);
1542 const size_t capacity =
count == 0 ? 0 :
count * 4 + 4;
1544 for (
size_t i = 0; i < capacity; ++i)
1556 if (left + 1 == right)
1565 const size_t mid = left + (right - left) / 2;
1568 tree(node) =
multiply(tree[node << 1], tree[(node << 1) | 1]);
1573 const char *
const ctx)
1575 for (
size_t i = 0; i < points.
size(); ++i)
1576 for (
size_t j = i + 1; j < points.
size(); ++j)
1578 << ctx <<
": points[" << i <<
"] and points[" << j
1579 <<
"] collide modulo " <<
MOD;
1606 if (left + 1 == right)
1612 const size_t mid = left + (right - left) / 2;
1614 poly.
size() < tree[node << 1].size() ?
1618 poly.
size() < tree[(node << 1) | 1].
size() ?
1620 poly_mod(poly, tree[(node << 1) | 1]);
1623 node << 1, left,
mid);
1625 (node << 1) | 1,
mid, right);
1635 if (left + 1 == right)
1638 const size_t mid = left + (right - left) / 2;
1684 if (
not std::is_constant_evaluated())
1713 const bool invert =
false)
1736 std::numeric_limits<size_t>::max()
1738 <<
"NTT::multiply: product size exceeds size_t capacity";
1781 <<
"NTT::negacyclic_multiply: inputs must have positive size";
1783 <<
"NTT::negacyclic_multiply: lhs size " << a.
size()
1784 <<
" does not match rhs size " << b.
size();
1786 const size_t n = a.
size();
1788 <<
"NTT::negacyclic_multiply: size " << n
1789 <<
" is not a power of two";
1791 <<
"NTT::negacyclic_multiply: size " << n
1792 <<
" requires a primitive root of order " << (n << 1)
1793 <<
", but the largest supported power-of-two order is "
1803 for (
size_t i = 0; i < n; ++i)
1812 for (
size_t i = 0; i < n; ++i)
1817 for (
size_t i = 0; i < n; ++i)
1836 if (
batch.is_empty())
1851 const bool invert =
false)
1853 if (
batch.is_empty())
1871 const size_t chunk_size = 0)
1888 const bool invert =
false,
1889 const size_t chunk_size = 0)
1910 const size_t chunk_size = 0)
1916 std::numeric_limits<size_t>::max()
1918 <<
"NTT::pmultiply: product size exceeds size_t capacity";
1943 const size_t chunk_size = 0)
1945 if (
batch.is_empty())
1963 for (
size_t i = coeffs.
size(); i > 0; --i)
1986 <<
"NTT::poly_inverse: input polynomial must be non-empty";
1990 <<
"NTT::poly_inverse: constant term must be invertible modulo " <<
MOD;
1998 const size_t m2 = std::min(n,
m << 1);
2003 for (
size_t i = 1; i <
m2; ++i)
2027 <<
"NTT::poly_divmod: divisor cannot be the zero polynomial";
2064 <<
"NTT::poly_log: constant term must be 1 modulo " <<
MOD;
2092 <<
"NTT::poly_exp: constant term must be 0 modulo " <<
MOD;
2100 const size_t m2 = std::min(n,
m << 1);
2104 delta(0) =
add_mod(delta[0], 1);
2136 <<
"NTT::poly_sqrt: first non-zero term appears at odd degree "
2141 const size_t shift =
lead / 2;
2144 for (
size_t i =
lead; i <
input.size(); ++i)
2149 for (
size_t i = 0; i <
rooted.size()
and i + shift < n; ++i)
2161 const size_t m2 = std::min(n,
m << 1);
2208 const size_t shift =
static_cast<size_t>(
shift128);
2209 const size_t target = n - shift;
2215 for (
size_t i =
lead; i <
input.size(); ++i)
2219 for (
size_t i = 0; i <
scaled_log.size(); ++i)
2228 for (
size_t i = 0; i <
powered.size()
and i + shift < n; ++i)
2247 for (
size_t i = 0; i < points.
size(); ++i)
2254 for (
size_t i = 0; i <
output.size(); ++i)
2276 <<
"NTT::interpolate: points size " << points.
size()
2277 <<
" does not match values size " << values.
size();
2284 for (
size_t i = 0; i < points.
size(); ++i)
2300 for (
size_t i = 0; i < values.
size(); ++i)
2303 <<
"NTT::interpolate: derivative vanished at point index " << i;
2333 template <u
int64_t Base = (1ULL << 15)>
2334 [[nodiscard]] static Array<u
int64_t>
2335 big
int_multiply(const Array<u
int64_t> & a,
2336 const Array<u
int64_t> & b);
2354 template <u
int64_t Base = (1ULL << 15)>
2355 [[nodiscard]] static Array<u
int64_t>
2356 pbig
int_multiply(ThreadPool & pool,
2357 const Array<u
int64_t> & a,
2358 const Array<u
int64_t> & b,
2359 const
size_t chunk_size = 0);
2371 u
int64_t max_power_of_two = 0;
2392 using coeff_type = __u
int128_t;
2395 using Prime0NTT = NTT<998244353ULL, 3ULL>;
2396 using Prime1NTT = NTT<469762049ULL, 3ULL>;
2397 using Prime2NTT = NTT<1004535809ULL, 3ULL>;
2399 struct CoefficientStats
2401 u
int64_t max_value = 0;
2402 size_t non_zero = 0;
2406 static constexpr NTTPrime primes_[] = {
2407 {998244353ULL, 3ULL, 23ULL},
2408 {469762049ULL, 3ULL, 26ULL},
2409 {1004535809ULL, 3ULL, 21ULL}
2412 [[nodiscard]] static constexpr coeff_type
2413 exact_modulus_product_impl() noexcept
2415 return static_cast<coeff_type>(primes_[0].mod)
2416 * static_cast<coeff_type>(primes_[1].mod)
2417 * static_cast<coeff_type>(primes_[2].mod);
2420 [[nodiscard]] static constexpr u
int64_t
2421 sub_mod(const u
int64_t lhs,
2423 const u
int64_t mod) noexcept
2425 return lhs >= rhs ? lhs - rhs : mod - (rhs - lhs);
2428 [[nodiscard]] static constexpr coeff_type
2429 add_capped(const coeff_type lhs,
2430 const coeff_type rhs,
2431 const coeff_type cap) noexcept
2433 return lhs >= cap -
rhs ? cap :
lhs +
rhs;
2436 [[
nodiscard]]
static constexpr coeff_type
2455 if (value > std::numeric_limits<size_t>::max() / 2)
2463 template <
typename PrimeNTT>
2470 if (PrimeNTT::supports_size(
required))
2473 const size_t n = next_power_of_two(
required);
2474 return n != 0
and PrimeNTT::supports_size(n);
2486 const auto digit =
static_cast<unsigned>(value % 10);
2499 const coeff_type cap = exact_modulus_product();
2500 for (
size_t i = 0; i <
input.size(); ++i)
2522 const coeff_type cap = exact_modulus_product();
2526 if (
lhs.non_zero == 0
or rhs.non_zero == 0)
2557 const char *
const ctx)
2560 std::numeric_limits<size_t>::max()
2562 << ctx <<
": product size exceeds size_t capacity";
2566 << ctx <<
": required product size " <<
required
2567 <<
" is not supported by the three-prime CRT pack";
2569 const coeff_type bound = conservative_bound(a, b);
2571 << ctx <<
": cannot guarantee exact reconstruction inside CRT range "
2572 << coeff_to_string(exact_modulus_product())
2573 <<
" with conservative coefficient bound "
2574 << coeff_to_string(bound);
2609 const size_t chunk_size)
2612 <<
"NTTExact::reconstruct_product: inconsistent residue sizes";
2618 output(i) = reconstruct_coefficient(
c0[i],
c1[i],
c2[i]);
2624 for (
size_t i = 0; i <
c0.size(); ++i)
2635 return sizeof(primes_) /
sizeof(primes_[0]);
2643 [[
nodiscard]]
static constexpr coeff_type
2646 return exact_modulus_product_impl();
2679 validate_inputs(a, b,
"NTTExact::multiply");
2684 return reconstruct_product(
c0,
c1,
c2,
nullptr, 0);
2707 const size_t chunk_size = 0)
2712 validate_inputs(a, b,
"NTTExact::pmultiply");
2715 return multiply(a, b);
2719 return Prime0NTT::multiply(a, b);
2721 auto f1 = pool.
enqueue([&a, &b]()
2723 return Prime1NTT::multiply(a, b);
2725 auto f2 = pool.
enqueue([&a, &b]()
2727 return Prime2NTT::multiply(a, b);
2733 return reconstruct_product(
c0,
c1,
c2, &pool, chunk_size);
2737 template <u
int64_t MOD, u
int64_t ROOT>
2738 template <u
int64_t Base>
2743 static_assert(Base > 1,
"NTT::bigint_multiply requires Base >= 2");
2754 const char *
const name,
2755 const char *
const ctx)
2757 for (
size_t i = 0; i <
digits.size(); ++i)
2759 << ctx <<
": " << name <<
"[" << i <<
"] = " <<
digits[i]
2760 <<
" is not in [0, " << Base <<
")";
2767 for (
size_t i = 0; i <
input.size(); ++i)
2771 static_cast<void>(
output.remove_last());
2777 if (coeffs.is_empty())
2784 for (
size_t i = 0; i < coeffs.size(); ++i)
2798 static_cast<void>(
output.remove_last());
2808 if (
lhs.is_empty()
or rhs.is_empty())
2812 std::numeric_limits<size_t>::max()
2814 <<
"NTT::bigint_multiply: product size exceeds size_t capacity";
2820 next_power_of_two(
required,
"NTT::bigint_multiply");
2832 for (
size_t i = 0; i < coeffs.
size(); ++i)
2840 template <u
int64_t MOD, u
int64_t ROOT>
2841 template <u
int64_t Base>
2846 const size_t chunk_size)
2848 static_assert(Base > 1,
"NTT::pbigint_multiply requires Base >= 2");
2859 const char *
const name,
2860 const char *
const ctx)
2862 for (
size_t i = 0; i <
digits.size(); ++i)
2864 << ctx <<
": " << name <<
"[" << i <<
"] = " <<
digits[i]
2865 <<
" is not in [0, " << Base <<
")";
2872 for (
size_t i = 0; i <
input.size(); ++i)
2876 static_cast<void>(
output.remove_last());
2882 if (coeffs.is_empty())
2889 for (
size_t i = 0; i < coeffs.size(); ++i)
2903 static_cast<void>(
output.remove_last());
2913 if (
lhs.is_empty()
or rhs.is_empty())
2917 std::numeric_limits<size_t>::max()
2919 <<
"NTT::pbigint_multiply: product size exceeds size_t capacity";
2925 next_power_of_two(
required,
"NTT::pbigint_multiply");
2937 for (
size_t i = 0; i < coeffs.
size(); ++i)
Exception handling system with formatted messages for Aleph-w.
#define ah_runtime_error_unless(C)
Throws std::runtime_error if condition does NOT hold.
#define ah_overflow_error_if(C)
Throws std::overflow_error if condition holds.
#define ah_runtime_error_if(C)
Throws std::runtime_error if condition holds.
#define ah_invalid_argument_if(C)
Throws std::invalid_argument if condition holds.
Simple dynamic array with automatic resizing and functional operations.
static Array create(size_t n)
Create an array with n logical elements.
constexpr size_t size() const noexcept
Return the number of elements stored in the stack.
constexpr bool is_empty() const noexcept
Checks if the container is empty.
T & append(const T &data)
Append a copy of data
T & get_last() noexcept
return a modifiable reference to the last element.
void reserve(size_t cap)
Reserves cap cells into the array.
static coeff_type conservative_bound(const Array< uint64_t > &a, const Array< uint64_t > &b)
static constexpr coeff_type mul_capped(const coeff_type lhs, const coeff_type rhs, const coeff_type cap) noexcept
static Array< coeff_type > reconstruct_product(const Array< uint64_t > &c0, const Array< uint64_t > &c1, const Array< uint64_t > &c2, ThreadPool *const pool, const size_t chunk_size)
static constexpr coeff_type exact_modulus_product() noexcept
Product of the three CRT moduli.
static CoefficientStats analyze_coefficients(const Array< uint64_t > &input)
static coeff_type reconstruct_coefficient(const uint64_t r0, const uint64_t r1, const uint64_t r2)
static Array< coeff_type > pmultiply(ThreadPool &pool, const Array< uint64_t > &a, const Array< uint64_t > &b, const size_t chunk_size=0)
Exact parallel polynomial multiplication.
static constexpr bool prime_supports_product_size(const size_t required) noexcept
static constexpr bool supports_product_size(const size_t required) noexcept
Check whether a target product length is supported.
static constexpr size_t next_power_of_two(const size_t n) noexcept
static std::string coeff_to_string(coeff_type value)
static void validate_inputs(const Array< uint64_t > &a, const Array< uint64_t > &b, const char *const ctx)
static Array< coeff_type > multiply(const Array< uint64_t > &a, const Array< uint64_t > &b)
Exact sequential polynomial multiplication.
static constexpr size_t prime_count() noexcept
Number of CRT primes in the exact multiplier.
Precomputed plans for NTT transforms.
Array< Array< uint64_t > > transformed_batch(const Array< Array< uint64_t > > &input, const bool invert=false) const
Return a transformed copy of an entire batch.
Array< uint64_t > multiply(const Array< uint64_t > &a, const Array< uint64_t > &b) const
Multiply two polynomials using this plan size.
void ptransform(ThreadPool &pool, Array< uint64_t > &a, const bool invert, const size_t chunk_size=0) const
Parallel in-place transform using a ThreadPool.
Array< uint64_t > multiply_impl(const Array< uint64_t > &a, const Array< uint64_t > &b, ThreadPool *const pool, const size_t chunk_size) const
size_t size() const noexcept
Return the transform size bound to the plan.
void initialize_twiddles()
Array< uint64_t > bluestein_kernel_forward_
void apply_bit_reversal(Array< uint64_t > &a) const noexcept
Array< uint64_t > transformed(const Array< uint64_t > &input, const bool invert=false) const
Transform an input array and return the result.
std::shared_ptr< const Plan > bluestein_plan_
Array< uint64_t > pmultiply(ThreadPool &pool, const Array< uint64_t > &a, const Array< uint64_t > &b, const size_t chunk_size=0) const
Parallel polynomial multiplication using this plan size.
Plan(const size_t n)
Construct a reusable plan for a fixed transform size.
Array< uint64_t > bluestein_chirp_inv_
void transform(Array< uint64_t > &a, const bool invert) const
In-place forward or inverse transform.
void ptransform_batch(ThreadPool &pool, Array< Array< uint64_t > > &batch, const bool invert, const size_t chunk_size=0) const
Parallel batch transform for equal-sized inputs.
void lift_input(Array< uint64_t > &a, ThreadPool *const pool, const size_t chunk_size) const
void initialize_power_of_two_plan()
Array< uint64_t > twiddles_inv_
static void for_each_index(ThreadPool *const pool, const size_t count, F &&fn, const size_t chunk_size)
Array< Array< uint64_t > > ptransformed_batch(ThreadPool &pool, const Array< Array< uint64_t > > &input, const bool invert=false, const size_t chunk_size=0) const
Return a parallel-transformed copy of an entire batch.
Array< uint64_t > bluestein_chirp_fwd_
void apply_transform(Array< uint64_t > &a, const bool invert, const Representation input_repr, const Representation output_repr, ThreadPool *const pool, const size_t chunk_size) const
bool should_use_avx2(ThreadPool *const pool) const noexcept
void apply_butterflies_scalar(Array< uint64_t > &a, const Array< uint64_t > &twiddles, ThreadPool *const pool, const size_t chunk_size) const
void apply_butterflies(Array< uint64_t > &a, const bool invert, ThreadPool *const pool, const size_t chunk_size) const
void apply_bluestein_transform(Array< uint64_t > &a, const bool invert, ThreadPool *const pool, const size_t chunk_size) const
bool should_use_neon(ThreadPool *const pool) const noexcept
void initialize_bit_reversal()
void scale_inverse(Array< uint64_t > &a, ThreadPool *const pool, const size_t chunk_size) const
void apply_scalar_butterfly_range(Array< uint64_t > &a, const Array< uint64_t > &twiddles, const size_t base, const size_t half, const size_t offset, const size_t begin, const size_t end) const
void transform_batch(Array< Array< uint64_t > > &batch, const bool invert) const
Sequential batch transform for equal-sized inputs.
Array< uint64_t > twiddles_fwd_
void lower_output(Array< uint64_t > &a, ThreadPool *const pool, const size_t chunk_size) const
void initialize_bluestein_plan()
Array< uint64_t > bluestein_kernel_inverse_
Array< uint64_t > ptransformed(ThreadPool &pool, const Array< uint64_t > &input, const bool invert=false, const size_t chunk_size=0) const
Parallel transform returning a new array.
Number Theoretic Transform over Z / MOD Z.
static Array< uint64_t > interpolate_recursive(const Array< Array< uint64_t > > &tree, const Array< uint64_t > &scaled_values, const size_t node, const size_t left, const size_t right)
static Array< uint64_t > poly_sqrt(const Array< uint64_t > &coeffs, const size_t n)
Formal polynomial square root modulo x^n.
static uint64_t tonelli_shanks(const uint64_t value, const char *const ctx)
static bool avx2_dispatch_available() noexcept
Returns whether AVX2 dispatch is available at runtime.
static constexpr const char * simd_preference_name(const SimdPreference preference) noexcept
static void validate_distinct_points(const Array< uint64_t > &points, const char *const ctx)
static constexpr bool supports_bluestein_size(const size_t n) noexcept
static Array< uint64_t > bigint_multiply(const Array< uint64_t > &a, const Array< uint64_t > &b)
Multiply two non-negative integers represented as base-Base digits.
static Array< uint64_t > poly_exp(const Array< uint64_t > &coeffs, const size_t n)
Formal polynomial exponential modulo x^n.
static Array< Array< uint64_t > > transformed_batch(const Array< Array< uint64_t > > &batch, const bool invert=false)
Return a transformed copy of an entire batch.
static constexpr uint64_t add_mod(const uint64_t lhs, const uint64_t rhs) noexcept
static void transform_batch(Array< Array< uint64_t > > &batch, const bool invert)
Sequential batch transform for equal-sized inputs.
static Array< uint64_t > poly_sub_series(const Array< uint64_t > &lhs, const Array< uint64_t > &rhs, const size_t n)
static Array< uint64_t > negacyclic_multiply(const Array< uint64_t > &a, const Array< uint64_t > &b)
Negacyclic convolution modulo x^N + 1.
static Array< uint64_t > poly_integral(const Array< uint64_t > &coeffs)
static constexpr bool supports_size(const size_t n) noexcept
Check whether a transform size is supported.
static constexpr uint64_t sub_mod(const uint64_t lhs, const uint64_t rhs) noexcept
static void multipoint_eval_recursive(const Array< Array< uint64_t > > &tree, const Array< uint64_t > &poly, Array< uint64_t > &output, const size_t node, const size_t left, const size_t right)
static Array< Array< uint64_t > > make_product_tree_storage(const size_t count)
NTTSimdBackend
SIMD backends available to the NTT butterfly core.
@ avx2
x86-64 AVX2 grouped butterfly path.
@ scalar
Portable scalar implementation.
@ neon
AArch64 NEON grouped butterfly path.
static Array< uint64_t > poly_power(const Array< uint64_t > &coeffs, const uint64_t k, const size_t n)
Formal polynomial power modulo x^n.
static constexpr bool supports_root_order(const uint64_t order) noexcept
static size_t next_power_of_two(size_t n, const char *const ctx)
static Array< uint64_t > multipoint_eval(const Array< uint64_t > &coeffs, const Array< uint64_t > &points)
Evaluate a polynomial on multiple points modulo MOD.
static Array< uint64_t > poly_inverse(const Array< uint64_t > &coeffs, const size_t n)
Formal polynomial inverse modulo x^n.
static Array< uint64_t > poly_sub_normalized(const Array< uint64_t > &lhs, const Array< uint64_t > &rhs)
static Array< uint64_t > multiply(const Array< uint64_t > &a, const Array< uint64_t > &b)
Multiply two polynomials modulo MOD.
static Array< uint64_t > poly_add_series(const Array< uint64_t > &lhs, const Array< uint64_t > &rhs, const size_t n)
static Array< uint64_t > reverse_poly(const Array< uint64_t > &input)
static Array< uint64_t > poly_derivative(const Array< uint64_t > &coeffs)
static void validate_root_order(const uint64_t order, const char *const ctx)
static Array< uint64_t > pmultiply(ThreadPool &pool, const Array< uint64_t > &a, const Array< uint64_t > &b, const size_t chunk_size=0)
Parallel polynomial multiplication modulo MOD.
static Array< uint64_t > truncate_poly(const Array< uint64_t > &input, const size_t n)
static void ptransform(ThreadPool &pool, Array< uint64_t > &a, const bool invert, const size_t chunk_size=0)
Parallel in-place transform using a ThreadPool.
static Array< uint64_t > prefix_copy(const Array< uint64_t > &input, const size_t length)
static constexpr uint64_t max_transform_size() noexcept
Maximum supported power-of-two transform size.
static constexpr const char * simd_backend_name(const NTTSimdBackend backend) noexcept
static Array< uint64_t > interpolate(const Array< uint64_t > &points, const Array< uint64_t > &values)
Interpolate a polynomial from point-value samples modulo MOD.
static void multiply_inplace(Array< uint64_t > &a, const Array< uint64_t > &b)
Replace a by the product a * b.
static constexpr uint64_t mod
static Array< uint64_t > ptransformed(ThreadPool &pool, const Array< uint64_t > &input, const bool invert=false, const size_t chunk_size=0)
Parallel transform returning a new array.
static SimdPreference simd_preference() noexcept
static uint64_t poly_eval(const Array< uint64_t > &coeffs, const uint64_t x)
Evaluate a polynomial at a single point modulo MOD.
static Array< uint64_t > transformed(const Array< uint64_t > &input, const bool invert=false)
Transform an input array and return the result.
static Array< uint64_t > series_prefix(const Array< uint64_t > &input, const size_t n)
static void ptransform_batch(ThreadPool &pool, Array< Array< uint64_t > > &batch, const bool invert, const size_t chunk_size=0)
Parallel batch transform for equal-sized inputs.
static NTTSimdBackend detected_simd_backend() noexcept
static constexpr bool supports_power_of_two_size(const size_t n) noexcept
static constexpr MontgomeryCtx mctx_
static constexpr bool simd_mod_supported() noexcept
static constexpr uint64_t pow_mod_constexpr(uint64_t base, uint64_t exp) noexcept
static Array< uint64_t > poly_mul_trunc(const Array< uint64_t > &lhs, const Array< uint64_t > &rhs, const size_t n)
static constexpr uint64_t root
static bool neon_dispatch_available() noexcept
Returns whether AArch64 NEON dispatch is available at runtime.
static Array< uint64_t > poly_add_normalized(const Array< uint64_t > &lhs, const Array< uint64_t > &rhs)
static std::pair< Array< uint64_t >, Array< uint64_t > > poly_divmod(const Array< uint64_t > ÷nd, const Array< uint64_t > &divisor)
Polynomial division with remainder modulo MOD.
static void transform(Array< uint64_t > &a, const bool invert)
In-place forward or inverse transform.
static Array< uint64_t > poly_mod(const Array< uint64_t > ÷nd, const Array< uint64_t > &divisor)
static constexpr uint64_t primitive_root_of_unity(const size_t n)
Return an n-th primitive root of unity modulo MOD.
static Array< uint64_t > normalize_poly(const Array< uint64_t > &input)
static NTTSimdBackend simd_backend() noexcept
Returns the SIMD backend selected under ALEPH_NTT_SIMD.
static void validate_supported_size(const size_t n, const char *const ctx)
static Array< uint64_t > poly_scalar_mul_series(const Array< uint64_t > &input, const uint64_t scalar, const size_t n)
static Array< uint64_t > pbigint_multiply(ThreadPool &pool, const Array< uint64_t > &a, const Array< uint64_t > &b, const size_t chunk_size=0)
Parallel big-integer multiplication in base Base.
static void trim_trailing_zeros(Array< uint64_t > &poly)
static constexpr bool is_power_of_two(const size_t n) noexcept
static Array< uint64_t > padded_copy(const Array< uint64_t > &input, const size_t n)
static Array< uint64_t > zero_series(const size_t n)
static constexpr uint64_t primitive_root_of_order(const uint64_t order)
static const char * simd_backend_name() noexcept
Returns the active SIMD backend name.
static void build_product_tree(Array< Array< uint64_t > > &tree, const Array< uint64_t > &points, const size_t node, const size_t left, const size_t right)
static Array< uint64_t > poly_log(const Array< uint64_t > &coeffs, const size_t n)
Formal polynomial logarithm modulo x^n.
static constexpr uint64_t max_transform_size_impl() noexcept
A reusable thread pool for efficient parallel task execution.
size_t num_threads() const noexcept
Get the number of worker threads.
auto enqueue(F &&f, Args &&... args) -> std::future< std::invoke_result_t< F, Args... > >
Submit a task for execution and get a future for the result.
__gmp_expr< T, __gmp_unary_expr< __gmp_expr< T, U >, __gmp_exp_function > > exp(const __gmp_expr< T, U > &expr)
__gmp_expr< typename __gmp_resolve_expr< T, V >::value_type, __gmp_binary_expr< __gmp_expr< T, U >, __gmp_expr< V, W >, __gmp_remainder_function > > remainder(const __gmp_expr< T, U > &expr1, const __gmp_expr< V, W > &expr2)
const long double offset[]
Offset values indexed by symbol string length (bounded by MAX_OFFSET_INDEX)
Safe modular arithmetic, extended Euclidean algorithm, and Chinese Remainder Theorem.
Main namespace for Aleph-w library functions.
and
Check uniqueness with explicit hash + equality functors.
uint64_t mod_inv(const uint64_t a, const uint64_t m)
Modular Inverse.
bool eq(const C1 &c1, const C2 &c2, Eq e=Eq())
Check equality of two containers using a predicate.
size_t size(Node *root) noexcept
static long & low(typename GT::Node *p)
Internal helper: low-link value stored directly in NODE_COOKIE(p).
void parallel_for_index(ThreadPool &pool, size_t start, size_t end, F &&f, size_t chunk_size=0)
Apply a function to each element in parallel (index-based).
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
uint64_t mod_exp(uint64_t base, uint64_t exp, const uint64_t m)
Modular exponentiation.
bool diff(const C1 &c1, const C2 &c2, Eq e=Eq())
Check if two containers differ.
uint64_t mod_mul(uint64_t a, uint64_t b, uint64_t m)
Safe 64-bit modular multiplication.
auto mode(const Container &data) -> std::decay_t< decltype(*std::begin(data))>
Compute the mode (most frequent value).
Itor::difference_type count(const Itor &beg, const Itor &end, const T &value)
Count elements equal to a value.
T sum(const Container &container, const T &init=T{})
Compute sum of all elements.
FooMap m(5, fst_unit_pair_hash, snd_unit_pair_hash)
A modern, efficient thread pool for parallel task execution.
Dynamic array container with automatic resizing.