55using Clock = std::chrono::steady_clock;
66 bool validate =
false;
68 size_t repetitions = 64;
72 size_t batch_count = 16;
73 unsigned seed = 20260306u;
74 Array<size_t> sizes = {64, 128, 256, 300, 512, 1024, 1536, 2048};
80 double median_us = 0.0;
83 double stddev_us = 0.0;
90 size_t items_per_call = 1;
91 size_t repetitions = 0;
92 TimingStats sequential;
96 struct ValidationMetrics
98 long double max_forward_error = 0;
99 long double max_roundtrip_error = 0;
100 long double max_real_roundtrip_error = 0;
101 long double max_compact_roundtrip_error = 0;
104 template <
typename Real>
105 [[
nodiscard]]
constexpr ValidationMetrics
108 if constexpr (std::is_same_v<Real, float>)
109 return {1.0e-4L, 1.0e-4L, 1.0e-4L, 1.0e-4L};
110 else if constexpr (std::is_same_v<Real, double>)
111 return {1.0e-12L, 1.0e-12L, 1.0e-12L, 1.0e-12L};
113 return {1.0e-15L, 1.0e-15L, 1.0e-15L, 1.0e-15L};
118 const ValidationMetrics &
envelope)
noexcept
130 std::stringstream
ss(
csv);
132 while (std::getline(
ss, item,
','))
138 const unsigned long long parsed = std::stoull(item);
141 catch (
const std::invalid_argument &)
143 throw std::runtime_error(
"Invalid token '" + item
144 +
"' in --sizes argument");
146 catch (
const std::out_of_range &)
148 throw std::runtime_error(
"Out-of-range token '" + item
149 +
"' in --sizes argument");
159 for (
int i = 1; i <
argc; ++i)
161 const std::string
arg =
argv[i];
162 if (
arg ==
"--validate")
164 else if (
arg ==
"--json")
166 else if (
arg ==
"--repetitions" and i + 1 <
argc)
167 options.repetitions =
static_cast<size_t>(std::stoull(
argv[++i]));
169 options.warmup =
static_cast<size_t>(std::stoull(
argv[++i]));
171 options.samples =
static_cast<size_t>(std::stoull(
argv[++i]));
173 options.threads =
static_cast<size_t>(std::stoull(
argv[++i]));
174 else if (
arg ==
"--batch-count" and i + 1 <
argc)
175 options.batch_count =
static_cast<size_t>(std::stoull(
argv[++i]));
177 options.seed =
static_cast<unsigned>(std::stoul(
argv[++i]));
180 else if (
arg ==
"--help")
183 <<
"Usage: fft_benchmark [--validate] [--json] [--repetitions N]"
184 <<
" [--warmup N] [--samples N] [--threads N] [--batch-count N]"
185 <<
" [--seed N] [--sizes a,b,c]\n";
190 std::cerr <<
"Unknown option: " <<
arg <<
"\n";
197 std::cerr <<
"At least one benchmark size is required\n";
201 for (
size_t i = 0; i <
options.sizes.size(); ++i)
204 std::cerr <<
"Benchmark sizes must be positive\n";
210 std::cerr <<
"Repetitions must be positive\n";
216 std::cerr <<
"Samples must be positive\n";
222 std::cerr <<
"Batch count must be positive\n";
235 const size_t detected =
static_cast<size_t>(std::thread::hardware_concurrency());
236 return std::max<size_t>(1,
detected);
239 template <
typename Real>
243 std::uniform_real_distribution<double> dist(-1.0, 1.0);
246 for (
size_t i = 0; i < n; ++i)
247 signal.
append(std::complex<Real>(
static_cast<Real>(dist(
rng)),
248 static_cast<Real>(dist(
rng))));
252 template <
typename Real>
256 std::mt19937_64 &
rng)
260 for (
size_t item = 0; item < batch_count; ++item)
265 template <
typename Real>
269 std::uniform_real_distribution<double> dist(-1.0, 1.0);
272 for (
size_t i = 0; i < n; ++i)
277 template <
typename Real>
281 std::mt19937_64 &
rng)
285 for (
size_t item = 0; item < batch_count; ++item)
290 template <
typename Real>
294 using Complex = std::complex<Real>;
299 for (
size_t k = 0;
k <
input.size(); ++
k)
302 for (
size_t t = 0; t <
input.size(); ++t)
305 *
static_cast<Real>(
k)
306 *
static_cast<Real>(t)
318 template <
typename Real>
321 const Array<std::complex<Real>> &
rhs)
323 long double error = 0;
324 for (
size_t i = 0; i <
lhs.size(); ++i)
326 static_cast<long double>(std::abs(
lhs[i] -
rhs[i])));
330 template <
typename Real>
334 long double error = 0;
335 for (
size_t i = 0; i <
lhs.size(); ++i)
337 static_cast<long double>(std::abs(
lhs[i] -
rhs[i])));
341 template <
typename Real>
347 for (
size_t i = 0; i <
input.size(); ++i)
352 template <
typename Fn>
355 const size_t samples,
356 const size_t repetitions,
359 for (
size_t sample = 0; sample < warmup; ++sample)
360 for (
size_t i = 0; i < repetitions; ++i)
365 for (
size_t sample = 0; sample < samples; ++sample)
367 const auto t0 = Clock::now();
368 for (
size_t i = 0; i < repetitions; ++i)
371 std::chrono::duration_cast<std::chrono::duration<double, std::micro>>(
373 values.
append(elapsed /
static_cast<double>(repetitions));
382 <<
"summarize_timings: at least one sample is required";
389 for (
size_t i = 0; i < values.
size(); ++i)
392 stats.mean_us =
sum /
static_cast<double>(values.
size());
395 if (
sorted.size() % 2 == 1)
402 for (
size_t i = 0; i < values.
size(); ++i)
404 const double delta = values[i] - stats.mean_us;
407 stats.stddev_us = std::sqrt(
squared /
static_cast<double>(values.
size()));
411 template <
typename Fn>
414 const size_t samples,
415 const size_t repetitions,
421 std::forward<Fn>(
fn)));
429 const double denom =
static_cast<double>(std::max<size_t>(1,
taps));
430 for (
size_t i = 0; i <
taps; ++i)
432 const double phase = 2.0 * std::numbers::pi *
static_cast<double>(i) /
denom;
433 kernel.
append(0.42 - 0.5 * std::cos(phase) + 0.08 * std::cos(2.0 * phase));
447 for (
size_t i = 0; i <
options.sizes.size(); ++i)
449 const size_t n =
options.sizes[i];
465 const size_t window_size = std::min<size_t>(std::max<size_t>(16, n / 2), n);
485 const auto output = FFTD::transformed(complex_signal, false);
486 benchmark_sink += output[0].real();
494 const auto output = FFTD::ptransformed(pool, complex_signal, false);
495 benchmark_sink += output[0].real();
511 const auto output = batch_plan.transformed_batch(complex_batch, false);
512 benchmark_sink += output[0][0].real();
520 const auto output = batch_plan.ptransformed_batch(pool,
523 benchmark_sink += output[0][0].real();
539 const auto output = batch_plan.rfft_batch(real_batch);
540 benchmark_sink += output[0][0].real();
548 const auto output = batch_plan.prfft_batch(pool,
550 benchmark_sink += output[0][0].real();
564 const auto output = FFTD::transform(real_signal);
565 benchmark_sink += output[0].real();
573 const auto output = FFTD::ptransform(pool, real_signal);
574 benchmark_sink += output[0].real();
588 const auto output = FFTD::rfft(real_signal);
589 benchmark_sink += output[0].real();
597 const auto output = FFTD::prfft(pool, real_signal);
598 benchmark_sink += output[0].real();
612 const auto output = FFTD::multiply(long_signal, fir);
613 benchmark_sink += output[0];
621 const auto output = FFTD::pmultiply(pool, long_signal, fir);
622 benchmark_sink += output[0];
631 std::max<size_t>(
size_t(1),
options.repetitions / 4);
639 const auto output = overlap_bank.convolve(long_real_batch);
640 benchmark_sink += output[0][0];
648 const auto output = overlap_bank.pconvolve(pool,
650 benchmark_sink += output[0][0];
657 stft_row.repetitions = std::max<size_t>(
size_t(1),
options.repetitions / 4);
664 const auto output = FFTD::stft(long_signal, window, stft_options);
665 benchmark_sink += static_cast<long double>(output.size());
673 const auto output = FFTD::pstft(pool, long_signal, window, stft_options);
674 benchmark_sink += static_cast<long double>(output.size());
688 const auto output = FFTD::filtfilt(long_signal, fir, fir_block_size);
689 benchmark_sink += output[0];
697 const auto output = FFTD::pfiltfilt(pool,
701 benchmark_sink += output[0];
710 std::max<size_t>(
size_t(1),
options.repetitions / 4);
723 sequential_bank.reset();
724 const auto output = sequential_bank.filter(real_batch);
725 benchmark_sink += output[0][0];
733 parallel_bank.reset();
734 const auto output = parallel_bank.pfilter(pool, real_batch);
735 benchmark_sink += output[0][0];
743 template <
typename Real>
748 using Complex =
typename FFTT::Complex;
753 for (
const size_t n : {size_t(3), size_t(5), size_t(8), size_t(10),
754 size_t(17), size_t(32)})
767 metrics.max_roundtrip_error = std::max(
metrics.max_roundtrip_error,
773 metrics.max_real_roundtrip_error = std::max(
metrics.max_real_roundtrip_error,
779 metrics.max_compact_roundtrip_error =
780 std::max(
metrics.max_compact_roundtrip_error,
791 <<
"\"mean_us\": " << std::fixed << std::setprecision(6) << stats.mean_us <<
", "
792 <<
"\"median_us\": " << stats.median_us <<
", "
793 <<
"\"min_us\": " << stats.min_us <<
", "
794 <<
"\"max_us\": " << stats.max_us <<
", "
795 <<
"\"stddev_us\": " << stats.stddev_us
805 <<
" \"schema_version\": 1,\n"
806 <<
" \"mode\": \"benchmark\",\n"
807 <<
" \"metadata\": {\n"
808 <<
" \"seed\": " <<
options.seed <<
",\n"
810 <<
" \"simd_backend\": \"" << FFTD::simd_backend_name() <<
"\",\n"
811 <<
" \"batched_plan_simd_backend\": \""
812 << FFTD::batched_plan_simd_backend_name() <<
"\",\n"
813 <<
" \"detected_simd_backend\": \""
814 << FFTD::detected_simd_backend_name() <<
"\",\n"
815 <<
" \"avx2_kernel_compiled\": "
816 << (FFTD::avx2_kernel_compiled() ?
"true" :
"false") <<
",\n"
817 <<
" \"neon_kernel_compiled\": "
818 << (FFTD::neon_kernel_compiled() ?
"true" :
"false") <<
",\n"
819 <<
" \"avx2_runtime_available\": "
820 << (FFTD::avx2_runtime_available() ?
"true" :
"false") <<
",\n"
821 <<
" \"neon_runtime_available\": "
822 << (FFTD::neon_runtime_available() ?
"true" :
"false") <<
",\n"
823 <<
" \"simd_preference\": \"" << FFTD::simd_preference_name() <<
"\",\n"
824 <<
" \"batch_count\": " <<
options.batch_count <<
",\n"
825 <<
" \"warmup_samples\": " <<
options.warmup <<
",\n"
826 <<
" \"samples\": " <<
options.samples <<
"\n"
829 for (
size_t i = 0; i < rows.
size(); ++i)
831 const auto & row = rows[i];
833 ? row.sequential.mean_us / row.parallel.mean_us
836 ? row.sequential.median_us / row.parallel.median_us
840 <<
" \"case\": \"" << row.name <<
"\",\n"
841 <<
" \"size\": " << row.
size <<
",\n"
842 <<
" \"items_per_call\": " << row.items_per_call <<
",\n"
843 <<
" \"repetitions_per_sample\": " << row.repetitions <<
",\n"
844 <<
" \"metric_unit\": \"microseconds_per_call\",\n"
845 <<
" \"sequential\": ";
848 <<
" \"parallel\": ";
856 if (i + 1 != rows.
size())
860 std::cout <<
" ]\n}" << std::endl;
864 std::cout <<
"FFT baseline benchmark (microseconds per call)\n";
865 std::cout <<
"Configuration: seed=" <<
options.seed
867 <<
", simd=" << FFTD::simd_backend_name()
868 <<
", batch_simd=" << FFTD::batched_plan_simd_backend_name()
869 <<
", detected_simd=" << FFTD::detected_simd_backend_name()
870 <<
", simd_pref=" << FFTD::simd_preference_name()
871 <<
", batch_count=" <<
options.batch_count
872 <<
", warmup=" <<
options.warmup
873 <<
", samples=" <<
options.samples
875 std::cout << std::left << std::setw(14) <<
"Case"
876 << std::right << std::setw(10) <<
"Size"
877 << std::setw(10) <<
"Batch"
878 << std::setw(12) <<
"Rep/S"
879 << std::setw(16) <<
"SeqMean"
880 << std::setw(16) <<
"ParMean"
881 << std::setw(16) <<
"SeqMedian"
882 << std::setw(16) <<
"ParMedian"
883 << std::setw(12) <<
"Speedup"
885 std::cout << std::string(108,
'-') <<
"\n";
886 for (
size_t i = 0; i < rows.
size(); ++i)
888 const auto & row = rows[i];
889 const double speedup = row.parallel.mean_us > 0.0
890 ? row.sequential.mean_us / row.parallel.mean_us
892 std::cout << std::left << std::setw(14) << row.name
893 << std::right << std::setw(10) << row.
size
894 << std::setw(10) << row.items_per_call
895 << std::setw(12) << row.repetitions
896 << std::setw(16) << std::fixed << std::setprecision(3) << row.sequential.mean_us
897 << std::setw(16) << row.parallel.mean_us
898 << std::setw(16) << row.sequential.median_us
899 << std::setw(16) << row.parallel.median_us
903 std::cout <<
"Benchmark sink: " <<
static_cast<long double>(
benchmark_sink) <<
"\n";
922 std::cout << std::left << std::setw(12) << label
923 << std::right << std::setw(18) << std::setprecision(6)
924 << std::scientific <<
metrics.max_forward_error
925 << std::setw(18) <<
metrics.max_roundtrip_error
926 << std::setw(18) <<
metrics.max_real_roundtrip_error
927 << std::setw(18) <<
metrics.max_compact_roundtrip_error
935 std::cout <<
" \"" << label <<
"\": {"
936 <<
"\"max_forward_error\": " << std::scientific <<
metrics.max_forward_error <<
", "
937 <<
"\"max_roundtrip_error\": " <<
metrics.max_roundtrip_error <<
", "
938 <<
"\"max_real_roundtrip_error\": " <<
metrics.max_real_roundtrip_error <<
", "
939 <<
"\"max_compact_roundtrip_error\": " <<
metrics.max_compact_roundtrip_error
947 <<
" \"schema_version\": 1,\n"
948 <<
" \"mode\": \"validation\",\n"
949 <<
" \"valid\": " << (valid ?
"true" :
"false") <<
",\n"
950 <<
" \"metadata\": {\n"
951 <<
" \"seed\": " <<
seed <<
"\n"
953 <<
" \"validation\": {\n";
957 std::cout <<
" }\n}" << std::endl;
961 std::cout <<
"FFT validation envelope (max absolute error, seed=" <<
seed <<
")\n";
962 std::cout << std::left << std::setw(12) <<
"Precision"
963 << std::right << std::setw(18) <<
"Forward"
964 << std::setw(18) <<
"RoundTrip"
965 << std::setw(18) <<
"RealIFFT"
966 << std::setw(18) <<
"CompactIRFFT"
968 std::cout << std::string(84,
'-') <<
"\n";
#define ah_invalid_argument_if(C)
Throws std::invalid_argument if condition holds.
Simple dynamic array with automatic resizing and functional operations.
constexpr size_t size() const noexcept
Return the number of elements stored in the stack.
constexpr bool is_empty() const noexcept
Checks if the container is empty.
T & append(const T &data)
Append a copy of data
void reserve(size_t cap)
Reserves cap cells into the array.
Fast Fourier Transform (FFT) and DSP Toolkit.
A reusable thread pool for efficient parallel task execution.
Fast Fourier Transform (FFT) and Digital Signal Processing (DSP) toolkit.
std::chrono::steady_clock Clock
Main namespace for Aleph-w library functions.
and
Check uniqueness with explicit hash + equality functors.
size_t size(Node *root) noexcept
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
void error(const char *file, int line, const char *format,...)
Print an error message with file and line info.
Itor::difference_type count(const Itor &beg, const Itor &end, const T &value)
Count elements equal to a value.
T sum(const Container &container, const T &init=T{})
Compute sum of all elements.
static struct argp_option options[]
void run_benchmarks(const BenchmarkConfig &config)
Dynamic array container with automatic resizing.