Aleph-w 3.0
A C++ Library for Data Structures and Algorithms
Loading...
Searching...
No Matches
fft_benchmark.cc
Go to the documentation of this file.
1/*
2 Aleph_w
3
4 Data structures & Algorithms
5 version 2.0.0b
6 https://github.com/lrleon/Aleph-w
7
8 This file is part of Aleph-w library
9
10 Copyright (c) 2002-2026 Leandro Rabindranath Leon
11
12 Permission is hereby granted, free of charge, to any person obtaining a copy
13 of this software and associated documentation files (the "Software"), to deal
14 in the Software without restriction, including without limitation the rights
15 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 copies of the Software, and to permit persons to whom the Software is
17 furnished to do so, subject to the following conditions:
18
19 The above copyright notice and this permission notice shall be included in all
20 copies or substantial portions of the Software.
21
22 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 SOFTWARE.
29*/
30
36# include <algorithm>
37# include <chrono>
38# include <cmath>
39# include <complex>
40# include <cstdlib>
41# include <iomanip>
42# include <iostream>
43# include <limits>
44# include <numbers>
45# include <random>
46# include <sstream>
47# include <stdexcept>
48# include <string>
49# include <thread>
50
51# include <fft.H>
52# include <tpl_array.H>
53
54using namespace Aleph;
55using Clock = std::chrono::steady_clock;
56
57namespace
58{
59 using FFTD = FFT<double>;
60 using ComplexD = FFTD::Complex;
61
62 volatile long double benchmark_sink = 0;
63
64 struct Options
65 {
66 bool validate = false;
67 bool json = false;
68 size_t repetitions = 64;
69 size_t warmup = 2;
70 size_t samples = 7;
71 size_t threads = 0;
72 size_t batch_count = 16;
73 unsigned seed = 20260306u;
74 Array<size_t> sizes = {64, 128, 256, 300, 512, 1024, 1536, 2048};
75 };
76
77 struct TimingStats
78 {
79 double mean_us = 0.0;
80 double median_us = 0.0;
81 double min_us = 0.0;
82 double max_us = 0.0;
83 double stddev_us = 0.0;
84 };
85
86 struct BenchmarkRow
87 {
88 std::string name;
89 size_t size = 0;
90 size_t items_per_call = 1;
91 size_t repetitions = 0;
92 TimingStats sequential;
93 TimingStats parallel;
94 };
95
96 struct ValidationMetrics
97 {
98 long double max_forward_error = 0;
99 long double max_roundtrip_error = 0;
100 long double max_real_roundtrip_error = 0;
101 long double max_compact_roundtrip_error = 0;
102 };
103
104 template <typename Real>
105 [[nodiscard]] constexpr ValidationMetrics
106 validation_envelope() noexcept
107 {
108 if constexpr (std::is_same_v<Real, float>)
109 return {1.0e-4L, 1.0e-4L, 1.0e-4L, 1.0e-4L};
110 else if constexpr (std::is_same_v<Real, double>)
111 return {1.0e-12L, 1.0e-12L, 1.0e-12L, 1.0e-12L};
112 else
113 return {1.0e-15L, 1.0e-15L, 1.0e-15L, 1.0e-15L};
114 }
115
116 [[nodiscard]] constexpr bool
117 is_within_envelope(const ValidationMetrics & metrics,
118 const ValidationMetrics & envelope) noexcept
119 {
120 return metrics.max_forward_error <= envelope.max_forward_error
121 and metrics.max_roundtrip_error <= envelope.max_roundtrip_error
122 and metrics.max_real_roundtrip_error <= envelope.max_real_roundtrip_error
123 and metrics.max_compact_roundtrip_error <= envelope.max_compact_roundtrip_error;
124 }
125
127 parse_sizes(const std::string & csv)
128 {
129 Array<size_t> sizes;
130 std::stringstream ss(csv);
131 std::string item;
132 while (std::getline(ss, item, ','))
133 {
134 if (item.empty())
135 continue;
136 try
137 {
138 const unsigned long long parsed = std::stoull(item);
139 sizes.append(static_cast<size_t>(parsed));
140 }
141 catch (const std::invalid_argument &)
142 {
143 throw std::runtime_error("Invalid token '" + item
144 + "' in --sizes argument");
145 }
146 catch (const std::out_of_range &)
147 {
148 throw std::runtime_error("Out-of-range token '" + item
149 + "' in --sizes argument");
150 }
151 }
152 return sizes;
153 }
154
155 [[nodiscard]] Options
156 parse_options(const int argc, char ** argv)
157 {
158 Options options;
159 for (int i = 1; i < argc; ++i)
160 {
161 const std::string arg = argv[i];
162 if (arg == "--validate")
163 options.validate = true;
164 else if (arg == "--json")
165 options.json = true;
166 else if (arg == "--repetitions" and i + 1 < argc)
167 options.repetitions = static_cast<size_t>(std::stoull(argv[++i]));
168 else if (arg == "--warmup" and i + 1 < argc)
169 options.warmup = static_cast<size_t>(std::stoull(argv[++i]));
170 else if (arg == "--samples" and i + 1 < argc)
171 options.samples = static_cast<size_t>(std::stoull(argv[++i]));
172 else if (arg == "--threads" and i + 1 < argc)
173 options.threads = static_cast<size_t>(std::stoull(argv[++i]));
174 else if (arg == "--batch-count" and i + 1 < argc)
175 options.batch_count = static_cast<size_t>(std::stoull(argv[++i]));
176 else if (arg == "--seed" and i + 1 < argc)
177 options.seed = static_cast<unsigned>(std::stoul(argv[++i]));
178 else if (arg == "--sizes" and i + 1 < argc)
179 options.sizes = parse_sizes(argv[++i]);
180 else if (arg == "--help")
181 {
182 std::cout
183 << "Usage: fft_benchmark [--validate] [--json] [--repetitions N]"
184 << " [--warmup N] [--samples N] [--threads N] [--batch-count N]"
185 << " [--seed N] [--sizes a,b,c]\n";
186 std::exit(0);
187 }
188 else
189 {
190 std::cerr << "Unknown option: " << arg << "\n";
191 std::exit(1);
192 }
193 }
194
195 if (options.sizes.is_empty())
196 {
197 std::cerr << "At least one benchmark size is required\n";
198 std::exit(1);
199 }
200
201 for (size_t i = 0; i < options.sizes.size(); ++i)
202 if (options.sizes[i] == 0)
203 {
204 std::cerr << "Benchmark sizes must be positive\n";
205 std::exit(1);
206 }
207
208 if (options.repetitions == 0)
209 {
210 std::cerr << "Repetitions must be positive\n";
211 std::exit(1);
212 }
213
214 if (options.samples == 0)
215 {
216 std::cerr << "Samples must be positive\n";
217 std::exit(1);
218 }
219
220 if (options.batch_count == 0)
221 {
222 std::cerr << "Batch count must be positive\n";
223 std::exit(1);
224 }
225
226 return options;
227 }
228
229 [[nodiscard]] size_t
230 resolve_thread_count(const size_t requested)
231 {
232 if (requested != 0)
233 return requested;
234
235 const size_t detected = static_cast<size_t>(std::thread::hardware_concurrency());
236 return std::max<size_t>(1, detected);
237 }
238
239 template <typename Real>
241 make_complex_signal(const size_t n, std::mt19937_64 & rng)
242 {
243 std::uniform_real_distribution<double> dist(-1.0, 1.0);
245 signal.reserve(n);
246 for (size_t i = 0; i < n; ++i)
247 signal.append(std::complex<Real>(static_cast<Real>(dist(rng)),
248 static_cast<Real>(dist(rng))));
249 return signal;
250 }
251
252 template <typename Real>
254 make_complex_batch(const size_t batch_count,
255 const size_t n,
256 std::mt19937_64 & rng)
257 {
259 batch.reserve(batch_count);
260 for (size_t item = 0; item < batch_count; ++item)
262 return batch;
263 }
264
265 template <typename Real>
267 make_real_signal(const size_t n, std::mt19937_64 & rng)
268 {
269 std::uniform_real_distribution<double> dist(-1.0, 1.0);
270 Array<Real> signal;
271 signal.reserve(n);
272 for (size_t i = 0; i < n; ++i)
273 signal.append(static_cast<Real>(dist(rng)));
274 return signal;
275 }
276
277 template <typename Real>
279 make_real_batch(const size_t batch_count,
280 const size_t n,
281 std::mt19937_64 & rng)
282 {
284 batch.reserve(batch_count);
285 for (size_t item = 0; item < batch_count; ++item)
286 batch.append(make_real_signal<Real>(n, rng));
287 return batch;
288 }
289
290 template <typename Real>
292 naive_dft(const Array<std::complex<Real>> & input, const bool invert)
293 {
294 using Complex = std::complex<Real>;
295
297 output.reserve(input.size());
298 const Real sign = invert ? Real(1) : Real(-1);
299 for (size_t k = 0; k < input.size(); ++k)
300 {
301 Complex sum(Real(0), Real(0));
302 for (size_t t = 0; t < input.size(); ++t)
303 {
304 const Real angle = sign * Real(2) * std::numbers::pi_v<Real>
305 * static_cast<Real>(k)
306 * static_cast<Real>(t)
307 / static_cast<Real>(input.size());
308 sum += input[t] * std::polar(Real(1), angle);
309 }
310
311 if (invert)
312 sum /= static_cast<Real>(input.size());
313 output.append(sum);
314 }
315 return output;
316 }
317
318 template <typename Real>
319 [[nodiscard]] long double
320 max_complex_error(const Array<std::complex<Real>> & lhs,
321 const Array<std::complex<Real>> & rhs)
322 {
323 long double error = 0;
324 for (size_t i = 0; i < lhs.size(); ++i)
325 error = std::max(error,
326 static_cast<long double>(std::abs(lhs[i] - rhs[i])));
327 return error;
328 }
329
330 template <typename Real>
331 [[nodiscard]] long double
333 {
334 long double error = 0;
335 for (size_t i = 0; i < lhs.size(); ++i)
336 error = std::max(error,
337 static_cast<long double>(std::abs(lhs[i] - rhs[i])));
338 return error;
339 }
340
341 template <typename Real>
344 {
346 output.reserve(input.size());
347 for (size_t i = 0; i < input.size(); ++i)
348 output.append(std::complex<Real>(input[i], Real(0)));
349 return output;
350 }
351
352 template <typename Fn>
354 collect_us_per_call_samples(const size_t warmup,
355 const size_t samples,
356 const size_t repetitions,
357 Fn && fn)
358 {
359 for (size_t sample = 0; sample < warmup; ++sample)
360 for (size_t i = 0; i < repetitions; ++i)
361 fn();
362
363 Array<double> values;
364 values.reserve(samples);
365 for (size_t sample = 0; sample < samples; ++sample)
366 {
367 const auto t0 = Clock::now();
368 for (size_t i = 0; i < repetitions; ++i)
369 fn();
370 const auto elapsed =
371 std::chrono::duration_cast<std::chrono::duration<double, std::micro>>(
372 Clock::now() - t0).count();
373 values.append(elapsed / static_cast<double>(repetitions));
374 }
375 return values;
376 }
377
378 [[nodiscard]] TimingStats
379 summarize_timings(const Array<double> & values)
380 {
382 << "summarize_timings: at least one sample is required";
383
384 TimingStats stats;
385 Array<double> sorted = values;
386 std::sort(&sorted[0], &sorted[0] + sorted.size());
387
388 double sum = 0.0;
389 for (size_t i = 0; i < values.size(); ++i)
390 sum += values[i];
391
392 stats.mean_us = sum / static_cast<double>(values.size());
393 stats.min_us = sorted[0];
394 stats.max_us = sorted[sorted.size() - 1];
395 if (sorted.size() % 2 == 1)
396 stats.median_us = sorted[sorted.size() / 2];
397 else
398 stats.median_us = 0.5 * (sorted[sorted.size() / 2 - 1]
399 + sorted[sorted.size() / 2]);
400
401 double squared = 0.0;
402 for (size_t i = 0; i < values.size(); ++i)
403 {
404 const double delta = values[i] - stats.mean_us;
405 squared += delta * delta;
406 }
407 stats.stddev_us = std::sqrt(squared / static_cast<double>(values.size()));
408 return stats;
409 }
410
411 template <typename Fn>
412 [[nodiscard]] TimingStats
413 measure_timing_stats(const size_t warmup,
414 const size_t samples,
415 const size_t repetitions,
416 Fn && fn)
417 {
419 samples,
420 repetitions,
421 std::forward<Fn>(fn)));
422 }
423
425 make_fir_kernel(const size_t taps)
426 {
427 Array<double> kernel;
428 kernel.reserve(taps);
429 const double denom = static_cast<double>(std::max<size_t>(1, taps));
430 for (size_t i = 0; i < taps; ++i)
431 {
432 const double phase = 2.0 * std::numbers::pi * static_cast<double>(i) / denom;
433 kernel.append(0.42 - 0.5 * std::cos(phase) + 0.08 * std::cos(2.0 * phase));
434 }
435 return kernel;
436 }
437
439 run_benchmarks(const Options & options)
440 {
441 std::mt19937_64 rng(options.seed);
442 const size_t num_threads = resolve_thread_count(options.threads);
443 ThreadPool pool(num_threads);
445 rows.reserve(options.sizes.size() * 10);
446
447 for (size_t i = 0; i < options.sizes.size(); ++i)
448 {
449 const size_t n = options.sizes[i];
452 make_complex_batch<double>(options.batch_count, n, rng);
455 make_real_batch<double>(options.batch_count, n, rng);
457 make_real_batch<double>(options.batch_count, n * 4, rng);
459 const Array<double> fir = make_fir_kernel(std::min<size_t>(129, std::max<size_t>(17, n / 8 + 1)));
460 const FFTD::Plan batch_plan(n);
461 const Array<double> lfilter_numerator = {0.067455, 0.134911, 0.067455};
462 const Array<double> lfilter_denominator = {1.0, -1.14298, 0.412802};
463
464 const size_t stft_fft_size = n;
465 const size_t window_size = std::min<size_t>(std::max<size_t>(16, n / 2), n);
466 const Array<double> window = FFTD::hann_window(window_size);
467 FFTD::STFTOptions stft_options;
468 stft_options.hop_size = std::max<size_t>(1, window_size / 4);
469 stft_options.fft_size = stft_fft_size;
470 stft_options.pad_end = true;
471 stft_options.validate_nola = true;
472
473 const size_t fir_block_size = std::max<size_t>(64, n / 2);
474
475 BenchmarkRow complex_row;
476 complex_row.name = "complex_fft";
477 complex_row.size = n;
478 complex_row.repetitions = options.repetitions;
479 complex_row.sequential =
481 options.samples,
482 options.repetitions,
483 [&complex_signal]()
484 {
485 const auto output = FFTD::transformed(complex_signal, false);
486 benchmark_sink += output[0].real();
487 });
488 complex_row.parallel =
490 options.samples,
491 options.repetitions,
492 [&complex_signal, &pool]()
493 {
494 const auto output = FFTD::ptransformed(pool, complex_signal, false);
495 benchmark_sink += output[0].real();
496 });
497 rows.append(complex_row);
498
499 BenchmarkRow batched_complex_row;
500 batched_complex_row.name = "batched_complex_fft";
501 batched_complex_row.size = n;
502 batched_complex_row.items_per_call = options.batch_count;
503 batched_complex_row.repetitions = std::max<size_t>(size_t(1),
504 options.repetitions / 2);
505 batched_complex_row.sequential =
507 options.samples,
508 batched_complex_row.repetitions,
510 {
511 const auto output = batch_plan.transformed_batch(complex_batch, false);
512 benchmark_sink += output[0][0].real();
513 });
514 batched_complex_row.parallel =
516 options.samples,
517 batched_complex_row.repetitions,
518 [&complex_batch, &batch_plan, &pool]()
519 {
520 const auto output = batch_plan.ptransformed_batch(pool,
521 complex_batch,
522 false);
523 benchmark_sink += output[0][0].real();
524 });
526
527 BenchmarkRow batched_real_row;
528 batched_real_row.name = "batched_real_rfft";
529 batched_real_row.size = n;
530 batched_real_row.items_per_call = options.batch_count;
531 batched_real_row.repetitions = std::max<size_t>(size_t(1),
532 options.repetitions / 2);
533 batched_real_row.sequential =
535 options.samples,
536 batched_real_row.repetitions,
538 {
539 const auto output = batch_plan.rfft_batch(real_batch);
540 benchmark_sink += output[0][0].real();
541 });
542 batched_real_row.parallel =
544 options.samples,
545 batched_real_row.repetitions,
546 [&real_batch, &batch_plan, &pool]()
547 {
548 const auto output = batch_plan.prfft_batch(pool,
549 real_batch);
550 benchmark_sink += output[0][0].real();
551 });
553
554 BenchmarkRow real_row;
555 real_row.name = "real_fft";
556 real_row.size = n;
557 real_row.repetitions = options.repetitions;
558 real_row.sequential =
560 options.samples,
561 options.repetitions,
562 [&real_signal]()
563 {
564 const auto output = FFTD::transform(real_signal);
565 benchmark_sink += output[0].real();
566 });
567 real_row.parallel =
569 options.samples,
570 options.repetitions,
571 [&real_signal, &pool]()
572 {
573 const auto output = FFTD::ptransform(pool, real_signal);
574 benchmark_sink += output[0].real();
575 });
576 rows.append(real_row);
577
578 BenchmarkRow compact_row;
579 compact_row.name = "compact_rfft";
580 compact_row.size = n;
581 compact_row.repetitions = options.repetitions;
582 compact_row.sequential =
584 options.samples,
585 options.repetitions,
586 [&real_signal]()
587 {
588 const auto output = FFTD::rfft(real_signal);
589 benchmark_sink += output[0].real();
590 });
591 compact_row.parallel =
593 options.samples,
594 options.repetitions,
595 [&real_signal, &pool]()
596 {
597 const auto output = FFTD::prfft(pool, real_signal);
598 benchmark_sink += output[0].real();
599 });
600 rows.append(compact_row);
601
602 BenchmarkRow convolution_row;
603 convolution_row.name = "convolution";
604 convolution_row.size = n;
605 convolution_row.repetitions = options.repetitions;
606 convolution_row.sequential =
608 options.samples,
609 options.repetitions,
610 [&long_signal, &fir]()
611 {
612 const auto output = FFTD::multiply(long_signal, fir);
613 benchmark_sink += output[0];
614 });
615 convolution_row.parallel =
617 options.samples,
618 options.repetitions,
619 [&long_signal, &fir, &pool]()
620 {
621 const auto output = FFTD::pmultiply(pool, long_signal, fir);
622 benchmark_sink += output[0];
623 });
625
626 BenchmarkRow overlap_add_bank_row;
627 overlap_add_bank_row.name = "overlap_add_bank";
628 overlap_add_bank_row.size = n;
629 overlap_add_bank_row.items_per_call = options.batch_count;
630 overlap_add_bank_row.repetitions =
631 std::max<size_t>(size_t(1), options.repetitions / 4);
632 FFTD::OverlapAddBank overlap_bank(options.batch_count, fir, fir_block_size);
633 overlap_add_bank_row.sequential =
635 options.samples,
636 overlap_add_bank_row.repetitions,
638 {
639 const auto output = overlap_bank.convolve(long_real_batch);
640 benchmark_sink += output[0][0];
641 });
642 overlap_add_bank_row.parallel =
644 options.samples,
645 overlap_add_bank_row.repetitions,
646 [&long_real_batch, &overlap_bank, &pool]()
647 {
648 const auto output = overlap_bank.pconvolve(pool,
649 long_real_batch);
650 benchmark_sink += output[0][0];
651 });
653
654 BenchmarkRow stft_row;
655 stft_row.name = "stft";
656 stft_row.size = stft_fft_size;
657 stft_row.repetitions = std::max<size_t>(size_t(1), options.repetitions / 4);
658 stft_row.sequential =
660 options.samples,
661 stft_row.repetitions,
662 [&long_signal, &window, &stft_options]()
663 {
664 const auto output = FFTD::stft(long_signal, window, stft_options);
665 benchmark_sink += static_cast<long double>(output.size());
666 });
667 stft_row.parallel =
669 options.samples,
670 stft_row.repetitions,
671 [&long_signal, &window, &stft_options, &pool]()
672 {
673 const auto output = FFTD::pstft(pool, long_signal, window, stft_options);
674 benchmark_sink += static_cast<long double>(output.size());
675 });
676 rows.append(stft_row);
677
678 BenchmarkRow filter_row;
679 filter_row.name = "filtfilt";
680 filter_row.size = n;
681 filter_row.repetitions = std::max<size_t>(size_t(1), options.repetitions / 4);
682 filter_row.sequential =
684 options.samples,
685 filter_row.repetitions,
687 {
688 const auto output = FFTD::filtfilt(long_signal, fir, fir_block_size);
689 benchmark_sink += output[0];
690 });
691 filter_row.parallel =
693 options.samples,
694 filter_row.repetitions,
695 [&long_signal, &fir, fir_block_size, &pool]()
696 {
697 const auto output = FFTD::pfiltfilt(pool,
698 long_signal,
699 fir,
700 fir_block_size);
701 benchmark_sink += output[0];
702 });
703 rows.append(filter_row);
704
705 BenchmarkRow lfilter_bank_row;
706 lfilter_bank_row.name = "lfilter_bank";
707 lfilter_bank_row.size = n;
708 lfilter_bank_row.items_per_call = options.batch_count;
709 lfilter_bank_row.repetitions =
710 std::max<size_t>(size_t(1), options.repetitions / 4);
711 FFTD::LFilterBank sequential_bank(options.batch_count,
714 FFTD::LFilterBank parallel_bank(options.batch_count,
717 lfilter_bank_row.sequential =
719 options.samples,
720 lfilter_bank_row.repetitions,
722 {
723 sequential_bank.reset();
724 const auto output = sequential_bank.filter(real_batch);
725 benchmark_sink += output[0][0];
726 });
727 lfilter_bank_row.parallel =
729 options.samples,
730 lfilter_bank_row.repetitions,
731 [&real_batch, &parallel_bank, &pool]()
732 {
733 parallel_bank.reset();
734 const auto output = parallel_bank.pfilter(pool, real_batch);
735 benchmark_sink += output[0][0];
736 });
738 }
739
740 return rows;
741 }
742
743 template <typename Real>
744 [[nodiscard]] ValidationMetrics
745 compute_validation_metrics(const unsigned seed)
746 {
747 using FFTT = FFT<Real>;
748 using Complex = typename FFTT::Complex;
749
750 std::mt19937_64 rng(seed);
751 ValidationMetrics metrics;
752
753 for (const size_t n : {size_t(3), size_t(5), size_t(8), size_t(10),
754 size_t(17), size_t(32)})
755 {
758
759 const auto expected = naive_dft(complex_signal, false);
760 const auto obtained = FFTT::transformed(complex_signal, false);
761 metrics.max_forward_error = std::max(metrics.max_forward_error,
763
765 FFTT::transform(roundtrip, false);
766 FFTT::transform(roundtrip, true);
767 metrics.max_roundtrip_error = std::max(metrics.max_roundtrip_error,
770
771 const auto full_spectrum = FFTT::transform(real_signal);
772 const auto restored_real = FFTT::inverse_transform_real(full_spectrum);
773 metrics.max_real_roundtrip_error = std::max(metrics.max_real_roundtrip_error,
775 real_signal));
776
777 const auto compact_spectrum = FFTT::rfft(real_signal);
778 const auto restored_compact = FFTT::irfft(compact_spectrum, n);
779 metrics.max_compact_roundtrip_error =
780 std::max(metrics.max_compact_roundtrip_error,
782 }
783
784 return metrics;
785 }
786
787 void
788 print_timing_stats_json(const TimingStats & stats)
789 {
790 std::cout << "{"
791 << "\"mean_us\": " << std::fixed << std::setprecision(6) << stats.mean_us << ", "
792 << "\"median_us\": " << stats.median_us << ", "
793 << "\"min_us\": " << stats.min_us << ", "
794 << "\"max_us\": " << stats.max_us << ", "
795 << "\"stddev_us\": " << stats.stddev_us
796 << "}";
797 }
798
799 void
800 print_benchmarks(const Array<BenchmarkRow> & rows, const Options & options)
801 {
802 if (options.json)
803 {
804 std::cout << "{\n"
805 << " \"schema_version\": 1,\n"
806 << " \"mode\": \"benchmark\",\n"
807 << " \"metadata\": {\n"
808 << " \"seed\": " << options.seed << ",\n"
809 << " \"threads\": " << resolve_thread_count(options.threads) << ",\n"
810 << " \"simd_backend\": \"" << FFTD::simd_backend_name() << "\",\n"
811 << " \"batched_plan_simd_backend\": \""
812 << FFTD::batched_plan_simd_backend_name() << "\",\n"
813 << " \"detected_simd_backend\": \""
814 << FFTD::detected_simd_backend_name() << "\",\n"
815 << " \"avx2_kernel_compiled\": "
816 << (FFTD::avx2_kernel_compiled() ? "true" : "false") << ",\n"
817 << " \"neon_kernel_compiled\": "
818 << (FFTD::neon_kernel_compiled() ? "true" : "false") << ",\n"
819 << " \"avx2_runtime_available\": "
820 << (FFTD::avx2_runtime_available() ? "true" : "false") << ",\n"
821 << " \"neon_runtime_available\": "
822 << (FFTD::neon_runtime_available() ? "true" : "false") << ",\n"
823 << " \"simd_preference\": \"" << FFTD::simd_preference_name() << "\",\n"
824 << " \"batch_count\": " << options.batch_count << ",\n"
825 << " \"warmup_samples\": " << options.warmup << ",\n"
826 << " \"samples\": " << options.samples << "\n"
827 << " },\n"
828 << " \"rows\": [\n";
829 for (size_t i = 0; i < rows.size(); ++i)
830 {
831 const auto & row = rows[i];
832 const double speedup_mean = row.parallel.mean_us > 0.0
833 ? row.sequential.mean_us / row.parallel.mean_us
834 : 0.0;
835 const double speedup_median = row.parallel.median_us > 0.0
836 ? row.sequential.median_us / row.parallel.median_us
837 : 0.0;
838
839 std::cout << " {\n"
840 << " \"case\": \"" << row.name << "\",\n"
841 << " \"size\": " << row.size << ",\n"
842 << " \"items_per_call\": " << row.items_per_call << ",\n"
843 << " \"repetitions_per_sample\": " << row.repetitions << ",\n"
844 << " \"metric_unit\": \"microseconds_per_call\",\n"
845 << " \"sequential\": ";
846 print_timing_stats_json(row.sequential);
847 std::cout << ",\n"
848 << " \"parallel\": ";
849 print_timing_stats_json(row.parallel);
850 std::cout << ",\n"
851 << " \"speedup\": {"
852 << "\"mean\": " << speedup_mean << ", "
853 << "\"median\": " << speedup_median
854 << "}\n"
855 << " }";
856 if (i + 1 != rows.size())
857 std::cout << ",";
858 std::cout << "\n";
859 }
860 std::cout << " ]\n}" << std::endl;
861 return;
862 }
863
864 std::cout << "FFT baseline benchmark (microseconds per call)\n";
865 std::cout << "Configuration: seed=" << options.seed
866 << ", threads=" << resolve_thread_count(options.threads)
867 << ", simd=" << FFTD::simd_backend_name()
868 << ", batch_simd=" << FFTD::batched_plan_simd_backend_name()
869 << ", detected_simd=" << FFTD::detected_simd_backend_name()
870 << ", simd_pref=" << FFTD::simd_preference_name()
871 << ", batch_count=" << options.batch_count
872 << ", warmup=" << options.warmup
873 << ", samples=" << options.samples
874 << "\n";
875 std::cout << std::left << std::setw(14) << "Case"
876 << std::right << std::setw(10) << "Size"
877 << std::setw(10) << "Batch"
878 << std::setw(12) << "Rep/S"
879 << std::setw(16) << "SeqMean"
880 << std::setw(16) << "ParMean"
881 << std::setw(16) << "SeqMedian"
882 << std::setw(16) << "ParMedian"
883 << std::setw(12) << "Speedup"
884 << "\n";
885 std::cout << std::string(108, '-') << "\n";
886 for (size_t i = 0; i < rows.size(); ++i)
887 {
888 const auto & row = rows[i];
889 const double speedup = row.parallel.mean_us > 0.0
890 ? row.sequential.mean_us / row.parallel.mean_us
891 : 0.0;
892 std::cout << std::left << std::setw(14) << row.name
893 << std::right << std::setw(10) << row.size
894 << std::setw(10) << row.items_per_call
895 << std::setw(12) << row.repetitions
896 << std::setw(16) << std::fixed << std::setprecision(3) << row.sequential.mean_us
897 << std::setw(16) << row.parallel.mean_us
898 << std::setw(16) << row.sequential.median_us
899 << std::setw(16) << row.parallel.median_us
900 << std::setw(12) << speedup
901 << "\n";
902 }
903 std::cout << "Benchmark sink: " << static_cast<long double>(benchmark_sink) << "\n";
904 }
905
906 [[nodiscard]] bool
907 print_validation(const bool json, const unsigned seed)
908 {
919
920 auto print_metrics = [](const char * label, const ValidationMetrics & metrics)
921 {
922 std::cout << std::left << std::setw(12) << label
923 << std::right << std::setw(18) << std::setprecision(6)
924 << std::scientific << metrics.max_forward_error
925 << std::setw(18) << metrics.max_roundtrip_error
926 << std::setw(18) << metrics.max_real_roundtrip_error
927 << std::setw(18) << metrics.max_compact_roundtrip_error
928 << "\n";
929 };
930
931 if (json)
932 {
933 auto emit = [](const char * label, const ValidationMetrics & metrics, const bool trailing)
934 {
935 std::cout << " \"" << label << "\": {"
936 << "\"max_forward_error\": " << std::scientific << metrics.max_forward_error << ", "
937 << "\"max_roundtrip_error\": " << metrics.max_roundtrip_error << ", "
938 << "\"max_real_roundtrip_error\": " << metrics.max_real_roundtrip_error << ", "
939 << "\"max_compact_roundtrip_error\": " << metrics.max_compact_roundtrip_error
940 << "}";
941 if (trailing)
942 std::cout << ",";
943 std::cout << "\n";
944 };
945
946 std::cout << "{\n"
947 << " \"schema_version\": 1,\n"
948 << " \"mode\": \"validation\",\n"
949 << " \"valid\": " << (valid ? "true" : "false") << ",\n"
950 << " \"metadata\": {\n"
951 << " \"seed\": " << seed << "\n"
952 << " },\n"
953 << " \"validation\": {\n";
954 emit("float", float_metrics, true);
955 emit("double", double_metrics, true);
956 emit("long_double", long_double_metrics, false);
957 std::cout << " }\n}" << std::endl;
958 return valid;
959 }
960
961 std::cout << "FFT validation envelope (max absolute error, seed=" << seed << ")\n";
962 std::cout << std::left << std::setw(12) << "Precision"
963 << std::right << std::setw(18) << "Forward"
964 << std::setw(18) << "RoundTrip"
965 << std::setw(18) << "RealIFFT"
966 << std::setw(18) << "CompactIRFFT"
967 << "\n";
968 std::cout << std::string(84, '-') << "\n";
970 print_metrics("double", double_metrics);
971 print_metrics("long double", long_double_metrics);
972 return valid;
973 }
974}
975
976int
977main(int argc, char ** argv)
978{
979 const Options options = parse_options(argc, argv);
980
981 if (options.validate)
982 {
983 if (not print_validation(options.json, options.seed))
984 return 1;
985 }
986 else
988
989 return 0;
990}
#define ah_invalid_argument_if(C)
Throws std::invalid_argument if condition holds.
Definition ah-errors.H:639
Simple dynamic array with automatic resizing and functional operations.
Definition tpl_array.H:139
constexpr size_t size() const noexcept
Return the number of elements stored in the stack.
Definition tpl_array.H:351
constexpr bool is_empty() const noexcept
Checks if the container is empty.
Definition tpl_array.H:348
T & append(const T &data)
Append a copy of data
Definition tpl_array.H:245
void reserve(size_t cap)
Reserves cap cells into the array.
Definition tpl_array.H:315
Fast Fourier Transform (FFT) and DSP Toolkit.
Definition fft.H:154
A reusable thread pool for efficient parallel task execution.
static mt19937 rng
Fast Fourier Transform (FFT) and Digital Signal Processing (DSP) toolkit.
std::chrono::steady_clock Clock
Main namespace for Aleph-w library functions.
Definition ah-arena.H:89
and
Check uniqueness with explicit hash + equality functors.
size_t size(Node *root) noexcept
Divide_Conquer_DP_Result< Cost > divide_and_conquer_partition_dp(const size_t groups, const size_t n, Transition_Cost_Fn transition_cost, const Cost inf=dp_optimization_detail::default_inf< Cost >())
Optimize partition DP using divide-and-conquer optimization.
void error(const char *file, int line, const char *format,...)
Print an error message with file and line info.
Definition ahDefs.C:105
Itor::difference_type count(const Itor &beg, const Itor &end, const T &value)
Count elements equal to a value.
Definition ahAlgo.H:127
T sum(const Container &container, const T &init=T{})
Compute sum of all elements.
static struct argp_option options[]
Definition ntreepic.C:1886
void run_benchmarks(const BenchmarkConfig &config)
ValueArg< size_t > seed
Definition testHash.C:48
static int * k
Dynamic array container with automatic resizing.
ofstream output
Definition writeHeap.C:215