Aleph-w 3.0
A C++ Library for Data Structures and Algorithms
ah-parallel.H
1/*
2 Aleph_w
3
4 Data structures & Algorithms
5 version 2.0.0b
6 https://github.com/lrleon/Aleph-w
7
8 This file is part of Aleph-w library
9
10 Copyright (c) 2002-2026 Leandro Rabindranath Leon
11
12 Permission is hereby granted, free of charge, to any person obtaining a copy
13 of this software and associated documentation files (the "Software"), to deal
14 in the Software without restriction, including without limitation the rights
15 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 copies of the Software, and to permit persons to whom the Software is
17 furnished to do so, subject to the following conditions:
18
19 The above copyright notice and this permission notice shall be included in all
20 copies or substantial portions of the Software.
21
22 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 SOFTWARE.
29*/
30
31
32#ifndef AH_PARALLEL_H
33#define AH_PARALLEL_H
34
111#include <vector>
112#include <atomic>
113#include <optional>
114#include <algorithm>
115#include <numeric>
116#include <type_traits>
117#include <iterator>
118#include <functional>
119#include <thread_pool.H>
120
121namespace Aleph
122{
123 // =============================================================================
124 // Implementation Details
125 // =============================================================================
126
127 namespace parallel_detail
128 {
130 inline size_t chunk_size(const size_t n, const size_t num_threads, const size_t min_chunk = 64)
131 {
132 if (n == 0) return 1;
133 // Use more chunks than threads for better load balancing
134 const size_t chunks = num_threads * 4;
135 const size_t size = (n + chunks - 1) / chunks;
136 return std::max(size, min_chunk);
137 }
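  // Worked example (editorial note): with n = 10000 and num_threads = 8,
  // chunks = 8 * 4 = 32 and size = (10000 + 31) / 32 = 313; since 313 > 64,
  // each task processes roughly 313 elements.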
138
140 template <typename Container>
141 constexpr bool has_random_access()
142 {
143 using It = decltype(std::begin(std::declval<Container &>()));
144 return std::is_base_of_v<std::random_access_iterator_tag,
145 typename std::iterator_traits<It>::iterator_category>;
146 }
147
 150 template <typename Container>
 151 auto ensure_random_access(const Container & c)
 152 {
153 if constexpr (has_random_access<Container>())
154 return &c; // Return pointer to original
155 else
156 return std::make_unique<std::vector<typename Container::value_type>>(std::begin(c), std::end(c));
157 }
158
160 template <typename T>
161 decltype(auto) deref(T && ptr)
162 {
163 if constexpr (std::is_pointer_v<std::decay_t<T>>)
164 return *ptr;
165 else
166 return *ptr; // unique_ptr also supports *
167 }
168 } // namespace parallel_detail
169
170 // =============================================================================
171 // Parallel Map
172 // =============================================================================
173
204 template <typename ResultT = void, typename Container, typename Op>
205 [[nodiscard]] auto pmaps(ThreadPool & pool, const Container & c, Op op,
206 size_t chunk_size = 0)
207 {
208 using InputT = std::decay_t<decltype(*std::begin(c))>;
209 using ActualResultT = std::conditional_t<
210 std::is_void_v<ResultT>,
211 std::invoke_result_t<Op, const InputT &>,
212 ResultT>;
213
214 const size_t n = std::distance(std::begin(c), std::end(c));
215 if (n == 0)
216 return std::vector<ActualResultT>{};
217
218 if (chunk_size == 0)
219 chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
220
 221 // Ensure random access for parallel processing
 222 auto data_holder = parallel_detail::ensure_random_access(c);
 223 const auto & data = parallel_detail::deref(data_holder);
224
225 std::vector<ActualResultT> result(n);
226 std::vector<std::future<void>> futures;
227
228 size_t offset = 0;
229
230 while (offset < n)
231 {
232 size_t chunk_end = std::min(offset + chunk_size, n);
233
234 futures.push_back(pool.enqueue([&result, &data, op, offset, chunk_end]()
235 {
236 auto in_it = std::begin(data);
237 std::advance(in_it, offset);
238 for (size_t i = offset; i < chunk_end; ++i, ++in_it)
239 result[i] = op(*in_it);
240 }));
 241
 242 offset = chunk_end;
 243 }
244
245 for (auto & f: futures)
246 f.get();
247
248 return result;
249 }
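  // Editorial usage sketch (not part of the original header), assuming an
  // already-constructed ThreadPool `pool`:
  //
  //   std::vector<int> v = {1, 2, 3, 4};
  //   auto squares = pmaps(pool, v, [] (int x) { return x * x; });
  //   // squares == std::vector<int>{1, 4, 9, 16}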
250
251 // =============================================================================
252 // Parallel Filter
253 // =============================================================================
254
282 template <typename Container, typename Pred>
283 [[nodiscard]] auto pfilter(ThreadPool & pool, const Container & c, Pred pred,
284 size_t chunk_size = 0)
285 {
286 using T = std::decay_t<decltype(*std::begin(c))>;
287
288 const size_t n = std::distance(std::begin(c), std::end(c));
289 if (n == 0)
290 return std::vector<T>{};
291
292 if (chunk_size == 0)
293 chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
 294
 295 auto data_holder = parallel_detail::ensure_random_access(c);
 296 const auto & data = parallel_detail::deref(data_holder);
297
298 // Each chunk produces its own filtered result
299 std::vector<std::future<std::vector<T>>> futures;
300
301 size_t offset = 0;
302 while (offset < n)
303 {
304 size_t chunk_end = std::min(offset + chunk_size, n);
305
306 futures.push_back(pool.enqueue([&data, pred, offset, chunk_end]()
307 {
308 std::vector<T> chunk_result;
309 auto it = std::begin(data);
310 std::advance(it, offset);
311 for (size_t i = offset; i < chunk_end; ++i, ++it)
312 if (pred(*it))
313 chunk_result.push_back(*it);
314 return chunk_result;
315 }));
 316
 317 offset = chunk_end;
 318 }
319
320 // Merge results in order
321 std::vector<T> result;
322 for (auto & f: futures)
323 {
324 auto chunk_result = f.get();
325 result.insert(result.end(),
326 std::make_move_iterator(chunk_result.begin()),
327 std::make_move_iterator(chunk_result.end()));
328 }
329
330 return result;
331 }
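  // Editorial usage sketch: pfilter preserves the relative order of matching
  // elements, assuming `pool` and `v` as above.
  //
  //   auto evens = pfilter(pool, v, [] (int x) { return x % 2 == 0; });
  //   // evens == std::vector<int>{2, 4}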
332
333 // =============================================================================
334 // Parallel Fold/Reduce
335 // =============================================================================
336
374 template <typename T, typename Container, typename BinaryOp>
375 [[nodiscard]] T pfoldl(ThreadPool & pool, const Container & c, T init, BinaryOp op,
376 size_t chunk_size = 0)
377 {
378 const size_t n = std::distance(std::begin(c), std::end(c));
379 if (n == 0)
380 return init;
381
382 if (chunk_size == 0)
383 chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
 384
 385 auto data_holder = parallel_detail::ensure_random_access(c);
 386 const auto & data = parallel_detail::deref(data_holder);
387
388 std::vector<std::future<T>> futures;
389
390 size_t offset = 0;
391 while (offset < n)
392 {
393 size_t chunk_end = std::min(offset + chunk_size, n);
394
395 futures.push_back(pool.enqueue([&data, op, offset, chunk_end]()
396 {
397 auto it = std::begin(data);
398 std::advance(it, offset);
399 T local = *it++;
400 for (size_t i = offset + 1; i < chunk_end; ++i, ++it)
401 local = op(local, *it);
402 return local;
403 }));
 404
 405 offset = chunk_end;
 406 }
407
408 // Combine partial results
409 T result = init;
410 for (auto & f: futures)
411 result = op(result, f.get());
412
413 return result;
414 }
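  // Editorial usage sketch: partial chunk results are combined with the same
  // operation, so `op` should be associative and `init` a neutral element for
  // a deterministic result.
  //
  //   int total = pfoldl(pool, v, 0, [] (int a, int b) { return a + b; });
  //   // total == 10 for v == {1, 2, 3, 4}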
415
416 // =============================================================================
417 // Parallel For Each
418 // =============================================================================
419
450 template <typename Container, typename Op>
451 void pfor_each(ThreadPool & pool, Container & c, Op op, size_t chunk_size = 0)
452 {
453 const size_t n = std::distance(std::begin(c), std::end(c));
454 if (n == 0)
455 return;
456
457 if (chunk_size == 0)
458 chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
459
460 std::vector<std::future<void>> futures;
461 size_t offset = 0;
462
463 while (offset < n)
464 {
465 size_t chunk_end = std::min(offset + chunk_size, n);
466
467 futures.push_back(pool.enqueue([&c, op, offset, chunk_end]()
468 {
469 auto it = std::begin(c);
470 std::advance(it, offset);
471 for (size_t i = offset; i < chunk_end; ++i, ++it)
472 op(*it);
473 }));
 474
 475 offset = chunk_end;
 476 }
477
478 for (auto & f: futures)
479 f.get();
480 }
481
494 template <typename Container, typename Op>
495 void pfor_each(ThreadPool & pool, const Container & c, Op op, size_t chunk_size = 0)
496 {
497 const size_t n = std::distance(std::begin(c), std::end(c));
498 if (n == 0)
499 return;
500
501 if (chunk_size == 0)
502 chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
 503
 504 auto data_holder = parallel_detail::ensure_random_access(c);
 505 const auto & data = parallel_detail::deref(data_holder);
506
507 std::vector<std::future<void>> futures;
508
509 size_t offset = 0;
510 while (offset < n)
511 {
512 size_t chunk_end = std::min(offset + chunk_size, n);
513
514 futures.push_back(pool.enqueue([&data, op, offset, chunk_end]()
515 {
516 auto it = std::begin(data);
517 std::advance(it, offset);
518 for (size_t i = offset; i < chunk_end; ++i, ++it)
519 op(*it);
520 }));
 521
 522 offset = chunk_end;
 523 }
524
525 for (auto & f: futures)
526 f.get();
527 }
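  // Editorial usage sketch: the non-const overload mutates elements in place;
  // the const overload may iterate over an internal copy for non-random-access
  // containers, so its op should not rely on element identity.
  //
  //   pfor_each(pool, v, [] (int & x) { x *= 2; });   // v becomes {2, 4, 6, 8}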
528
529 // =============================================================================
530 // Parallel Predicates (all, exists, none)
531 // =============================================================================
532
555 template <typename Container, typename Pred>
556 [[nodiscard]] bool pall(ThreadPool & pool, const Container & c, Pred pred,
557 size_t chunk_size = 0)
558 {
559 const size_t n = std::distance(std::begin(c), std::end(c));
560 if (n == 0)
561 return true;
562
563 if (chunk_size == 0)
564 chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
 565
 566 auto data_holder = parallel_detail::ensure_random_access(c);
 567 const auto & data = parallel_detail::deref(data_holder);
568
569 std::atomic<bool> found_false{false};
570 std::vector<std::future<void>> futures;
571
572 size_t offset = 0;
573 while (offset < n)
574 {
575 size_t chunk_end = std::min(offset + chunk_size, n);
576
577 futures.push_back(pool.enqueue([&data, pred, &found_false, offset, chunk_end]()
578 {
579 if (found_false.load(std::memory_order_relaxed))
580 return; // Short-circuit
581
582 auto it = std::begin(data);
583 std::advance(it, offset);
584 for (size_t i = offset; i < chunk_end; ++i, ++it)
585 {
586 if (not pred(*it))
587 {
588 found_false.store(true, std::memory_order_relaxed);
589 return;
590 }
591 if (found_false.load(std::memory_order_relaxed))
592 return;
593 }
594 }));
 595
 596 offset = chunk_end;
 597 }
598
599 for (auto & f: futures)
600 f.get();
601
602 return not found_false.load();
603 }
604
626 template <typename Container, typename Pred>
627 [[nodiscard]] bool pexists(ThreadPool & pool, const Container & c, Pred pred,
628 size_t chunk_size = 0)
629 {
630 const size_t n = std::distance(std::begin(c), std::end(c));
631 if (n == 0)
632 return false;
633
634 if (chunk_size == 0)
635 chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
636
637 auto data_holder = parallel_detail::ensure_random_access(c);
638 const auto & data = parallel_detail::deref(data_holder);
639
640 std::atomic<bool> found{false};
641 std::vector<std::future<void>> futures;
642
643 size_t offset = 0;
644 while (offset < n)
645 {
646 size_t chunk_end = std::min(offset + chunk_size, n);
647
648 futures.push_back(pool.enqueue([&data, pred, &found, offset, chunk_end]()
649 {
650 if (found.load(std::memory_order_relaxed))
651 return; // Short-circuit
652
653 auto it = std::begin(data);
654 std::advance(it, offset);
655 for (size_t i = offset; i < chunk_end; ++i, ++it)
656 {
657 if (pred(*it))
658 {
659 found.store(true, std::memory_order_relaxed);
660 return;
661 }
662 if (found.load(std::memory_order_relaxed))
663 return;
664 }
665 }));
666
667 offset = chunk_end;
668 }
669
670 for (auto & f: futures)
671 f.get();
672
673 return found.load();
674 }
675
689 template <typename Container, typename Pred>
690 [[nodiscard]] bool pnone(ThreadPool & pool, const Container & c, Pred pred,
691 size_t chunk_size = 0)
692 {
693 return not pexists(pool, c, pred, chunk_size);
694 }
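  // Editorial usage sketch for the predicate family, assuming `pool` and `v`:
  //
  //   bool all_pos = pall(pool, v, [] (int x) { return x > 0; });
  //   bool any_neg = pexists(pool, v, [] (int x) { return x < 0; });
  //   bool no_zero = pnone(pool, v, [] (int x) { return x == 0; });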
695
696 // =============================================================================
697 // Parallel Count
698 // =============================================================================
699
721 template <typename Container, typename Pred>
722 [[nodiscard]] size_t pcount_if(ThreadPool & pool, const Container & c, Pred pred,
723 size_t chunk_size = 0)
724 {
725 const size_t n = std::distance(std::begin(c), std::end(c));
726 if (n == 0)
727 return 0;
728
729 if (chunk_size == 0)
730 chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
731
732 auto data_holder = parallel_detail::ensure_random_access(c);
733 const auto & data = parallel_detail::deref(data_holder);
734
735 std::vector<std::future<size_t>> futures;
736
737 size_t offset = 0;
738 while (offset < n)
739 {
740 size_t chunk_end = std::min(offset + chunk_size, n);
741
742 futures.push_back(pool.enqueue([&data, pred, offset, chunk_end]()
743 {
744 size_t count = 0;
745 auto it = std::begin(data);
746 std::advance(it, offset);
747 for (size_t i = offset; i < chunk_end; ++i, ++it)
748 if (pred(*it))
749 ++count;
750 return count;
751 }));
752
753 offset = chunk_end;
754 }
755
756 size_t total = 0;
757 for (auto & f: futures)
758 total += f.get();
759
760 return total;
761 }
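  // Editorial usage sketch:
  //
  //   size_t n_even = pcount_if(pool, v, [] (int x) { return x % 2 == 0; });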
762
763 // =============================================================================
764 // Parallel Find
765 // =============================================================================
766
792 template <typename Container, typename Pred>
793 [[nodiscard]] std::optional<size_t> pfind(ThreadPool & pool, const Container & c,
794 Pred pred, size_t chunk_size = 0)
795 {
796 const size_t n = std::distance(std::begin(c), std::end(c));
797 if (n == 0)
798 return std::nullopt;
799
800 if (chunk_size == 0)
801 chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
802
803 auto data_holder = parallel_detail::ensure_random_access(c);
804 const auto & data = parallel_detail::deref(data_holder);
805
806 // Track minimum found index
807 std::atomic<size_t> min_index{n}; // n means not found
808 std::vector<std::future<void>> futures;
809
810 size_t offset = 0;
811 while (offset < n)
812 {
813 size_t chunk_end = std::min(offset + chunk_size, n);
814
815 futures.push_back(pool.enqueue([&data, pred, &min_index, offset, chunk_end]()
816 {
817 // Skip if we already found something earlier
818 if (min_index.load(std::memory_order_relaxed) <= offset)
819 return;
820
821 auto it = std::begin(data);
822 std::advance(it, offset);
823 for (size_t i = offset; i < chunk_end; ++i, ++it)
824 {
825 // Stop if earlier match found
826 if (min_index.load(std::memory_order_relaxed) <= i)
827 return;
828
829 if (pred(*it))
830 {
831 // Atomically update minimum
832 size_t expected = min_index.load(std::memory_order_relaxed);
833 while (i < expected and
834 not min_index.compare_exchange_weak(expected, i,
835 std::memory_order_relaxed));
836 return;
837 }
838 }
839 }));
840
841 offset = chunk_end;
842 }
843
844 for (auto & f: futures)
845 f.get();
846
847 if (size_t result = min_index.load(); result < n)
848 return result;
849 return std::nullopt;
850 }
851
875 template <typename Container, typename Pred>
876 [[nodiscard]] auto pfind_value(ThreadPool & pool, const Container & c,
877 Pred pred, size_t chunk_size = 0)
878 {
879 using T = std::decay_t<decltype(*std::begin(c))>;
880
881 auto idx = pfind(pool, c, pred, chunk_size);
882 if (not idx)
883 return std::optional<T>{std::nullopt};
884
885 auto it = std::begin(c);
886 std::advance(it, *idx);
887 return std::optional<T>{*it};
888 }
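  // Editorial usage sketch: pfind returns the smallest matching index (or
  // std::nullopt), pfind_value the corresponding element.
  //
  //   std::optional<size_t> idx = pfind(pool, v, [] (int x) { return x > 2; });
  //   auto val = pfind_value(pool, v, [] (int x) { return x > 2; });
  //   // for v == {1, 2, 3, 4}: idx == 2, val == std::optional<int>{3}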
889
890 // =============================================================================
891 // Parallel Numeric Operations
892 // =============================================================================
893
913 template <typename Container,
914 typename T = std::decay_t<decltype(*std::begin(std::declval<Container>()))>>
915 [[nodiscard]] T psum(ThreadPool & pool, const Container & c, T init = T{},
916 size_t chunk_size = 0)
917 {
918 return pfoldl(pool, c, init, std::plus<T>{}, chunk_size);
919 }
920
932 template <typename Container,
933 typename T = std::decay_t<decltype(*std::begin(std::declval<Container>()))>>
934 [[nodiscard]] T pproduct(ThreadPool & pool, const Container & c, T init = T{1},
935 size_t chunk_size = 0)
936 {
937 return pfoldl(pool, c, init, std::multiplies<T>{}, chunk_size);
938 }
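  // Editorial usage sketch:
  //
  //   int s = psum(pool, v);        // 1 + 2 + 3 + 4 == 10
  //   int p = pproduct(pool, v);    // 1 * 2 * 3 * 4 == 24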
939
950 template <typename Container>
951 [[nodiscard]] auto pmin(ThreadPool & pool, const Container & c, size_t chunk_size = 0)
952 {
953 using T = std::decay_t<decltype(*std::begin(c))>;
954
955 const size_t n = std::distance(std::begin(c), std::end(c));
956 if (n == 0)
957 return std::optional<T>{std::nullopt};
958
959 if (chunk_size == 0)
960 chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
961
962 auto data_holder = parallel_detail::ensure_random_access(c);
963 const auto & data = parallel_detail::deref(data_holder);
964
965 std::vector<std::future<T>> futures;
966
967 size_t offset = 0;
968 while (offset < n)
969 {
970 size_t chunk_end = std::min(offset + chunk_size, n);
971
972 futures.push_back(pool.enqueue([&data, offset, chunk_end]()
973 {
974 auto it = std::begin(data);
975 std::advance(it, offset);
976 T local_min = *it++;
977 for (size_t i = offset + 1; i < chunk_end; ++i, ++it)
978 if (*it < local_min)
979 local_min = *it;
980 return local_min;
981 }));
982
983 offset = chunk_end;
984 }
985
986 T result = futures[0].get();
987 for (size_t i = 1; i < futures.size(); ++i)
988 {
989 T val = futures[i].get();
990 if (val < result)
991 result = val;
992 }
993
994 return std::optional<T>{result};
995 }
996
1007 template <typename Container>
1008 [[nodiscard]] auto pmax(ThreadPool & pool, const Container & c, size_t chunk_size = 0)
1009 {
1010 using T = std::decay_t<decltype(*std::begin(c))>;
1011
1012 const size_t n = std::distance(std::begin(c), std::end(c));
1013 if (n == 0)
1014 return std::optional<T>{std::nullopt};
1015
1016 if (chunk_size == 0)
1017 chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
1018
1019 auto data_holder = parallel_detail::ensure_random_access(c);
1020 const auto & data = parallel_detail::deref(data_holder);
1021
1022 std::vector<std::future<T>> futures;
1023
1024 size_t offset = 0;
1025 while (offset < n)
1026 {
1027 size_t chunk_end = std::min(offset + chunk_size, n);
1028
1029 futures.push_back(pool.enqueue([&data, offset, chunk_end]()
1030 {
1031 auto it = std::begin(data);
1032 std::advance(it, offset);
1033 T local_max = *it++;
1034 for (size_t i = offset + 1; i < chunk_end; ++i, ++it)
1035 if (*it > local_max)
1036 local_max = *it;
1037 return local_max;
1038 }));
1039
1040 offset = chunk_end;
1041 }
1042
1043 T result = futures[0].get();
1044 for (size_t i = 1; i < futures.size(); ++i)
1045 {
1046 T val = futures[i].get();
1047 if (val > result)
1048 result = val;
1049 }
1050
1051 return std::optional<T>{result};
1052 }
1053
1064 template <typename Container>
1065 [[nodiscard]] auto pminmax(ThreadPool & pool, const Container & c, size_t chunk_size = 0)
1066 {
1067 using T = std::decay_t<decltype(*std::begin(c))>;
1068
1069 const size_t n = std::distance(std::begin(c), std::end(c));
1070 if (n == 0)
1071 return std::optional<std::pair<T, T>>{std::nullopt};
1072
1073 if (chunk_size == 0)
1074 chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
1075
1076 auto data_holder = parallel_detail::ensure_random_access(c);
1077 const auto & data = parallel_detail::deref(data_holder);
1078
1079 std::vector<std::future<std::pair<T, T>>> futures;
1080
1081 size_t offset = 0;
1082 while (offset < n)
1083 {
1084 size_t chunk_end = std::min(offset + chunk_size, n);
1085
1086 futures.push_back(pool.enqueue([&data, offset, chunk_end]()
1087 {
1088 auto it = std::begin(data);
1089 std::advance(it, offset);
1090 T local_min = *it;
1091 T local_max = *it++;
1092 for (size_t i = offset + 1; i < chunk_end; ++i, ++it)
1093 {
1094 if (*it < local_min) local_min = *it;
1095 if (*it > local_max) local_max = *it;
1096 }
1097 return std::make_pair(local_min, local_max);
1098 }));
1099
1100 offset = chunk_end;
1101 }
1102
1103 auto result = futures[0].get();
1104 for (size_t i = 1; i < futures.size(); ++i)
1105 {
1106 auto [mi, ma] = futures[i].get();
1107 if (mi < result.first) result.first = mi;
1108 if (ma > result.second) result.second = ma;
1109 }
1110
1111 return std::optional<std::pair<T, T>>{result};
1112 }
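  // Editorial usage sketch: all three return std::optional values, empty for
  // an empty container.
  //
  //   auto mn = pmin(pool, v);            // std::optional<int>{1}
  //   auto mx = pmax(pool, v);            // std::optional<int>{4}
  //   if (auto mm = pminmax(pool, v))
  //     { /* mm->first == 1, mm->second == 4 */ }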
1113
1114 // =============================================================================
1115 // Parallel Sort
1116 // =============================================================================
1117
1145 template <typename Container, typename Compare = std::less<>>
1146 void psort(ThreadPool & pool, Container & c, Compare cmp = Compare{},
1147 const size_t min_parallel_size = 1024)
1148 {
1149 const size_t n = std::distance(std::begin(c), std::end(c));
1150 if (n <= 1)
1151 return;
1152
1153 // For small sizes, use regular sort
1154 if (n <= min_parallel_size or pool.num_threads() <= 1)
1155 {
1156 std::sort(std::begin(c), std::end(c), cmp);
1157 return;
1158 }
1159
1160 // Split into chunks, sort each in parallel, then merge
1161 const size_t num_chunks = std::min(pool.num_threads() * 2, n / min_parallel_size);
1162 const size_t chunk_size = (n + num_chunks - 1) / num_chunks;
1163
1164 // Sort chunks in parallel
1165 std::vector<std::future<void>> futures;
1166 for (size_t i = 0; i < n; i += chunk_size)
1167 {
1168 size_t end = std::min(i + chunk_size, n);
1169 auto begin_it = std::begin(c);
1170 std::advance(begin_it, i);
1171 auto end_it = std::begin(c);
1172 std::advance(end_it, end);
1173
1174 futures.push_back(pool.enqueue([begin_it, end_it, cmp]()
1175 {
1176 std::sort(begin_it, end_it, cmp);
1177 }));
1178 }
1179
1180 for (auto & f: futures)
1181 f.get();
1182
1183 // Merge sorted chunks
1184 using T = std::decay_t<decltype(*std::begin(c))>;
1185 std::vector<T> buffer(n);
1186
1187 for (size_t width = chunk_size; width < n; width *= 2)
1188 {
1189 std::vector<std::future<void>> merge_futures;
1190
1191 for (size_t i = 0; i < n; i += 2 * width)
1192 {
1193 size_t mid = std::min(i + width, n);
1194 size_t end = std::min(i + 2 * width, n);
1195
1196 if (mid < end)
1197 {
1198 auto begin_it = std::begin(c);
1199 std::advance(begin_it, i);
1200 auto mid_it = std::begin(c);
1201 std::advance(mid_it, mid);
1202 auto end_it = std::begin(c);
1203 std::advance(end_it, end);
1204
1205 merge_futures.push_back(pool.enqueue([begin_it, mid_it, end_it, &buffer, i, cmp]()
1206 {
1207 std::merge(begin_it, mid_it, mid_it, end_it,
1208 buffer.begin() + i, cmp);
1209 }));
1210 }
1211 else
1212 {
1213 // Copy remaining elements
1214 auto begin_it = std::begin(c);
1215 std::advance(begin_it, i);
1216 auto end_it = std::begin(c);
1217 std::advance(end_it, mid);
1218 std::copy(begin_it, end_it, buffer.begin() + i);
1219 }
1220 }
1221
1222 for (auto & f: merge_futures)
1223 f.get();
1224
1225 // Swap buffer back to container
1226 auto it = std::begin(c);
1227 for (size_t i = 0; i < n; ++i, ++it)
1228 *it = std::move(buffer[i]);
1229 }
1230 }
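  // Editorial usage sketch: psort sorts in place and falls back to std::sort
  // below min_parallel_size.
  //
  //   psort(pool, v);                     // ascending
  //   psort(pool, v, std::greater<>{});   // descending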
1231
1232 // =============================================================================
1233 // Parallel Zip Operations
1234 // =============================================================================
1235
1263 template <typename Container1, typename Container2, typename Op>
1264 void pzip_for_each(ThreadPool & pool, const Container1 & c1, const Container2 & c2,
1265 Op op, size_t chunk_size = 0)
1266 {
1267 const size_t n1 = std::distance(std::begin(c1), std::end(c1));
1268 const size_t n2 = std::distance(std::begin(c2), std::end(c2));
1269 const size_t n = std::min(n1, n2);
1270
1271 if (n == 0)
1272 return;
1273
1274 if (chunk_size == 0)
1275 chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
1276
1277 auto h1 = parallel_detail::ensure_random_access(c1);
1278 auto h2 = parallel_detail::ensure_random_access(c2);
1279 const auto & d1 = parallel_detail::deref(h1);
1280 const auto & d2 = parallel_detail::deref(h2);
1281
1282 std::vector<std::future<void>> futures;
1283
1284 size_t offset = 0;
1285 while (offset < n)
1286 {
1287 size_t chunk_end = std::min(offset + chunk_size, n);
1288
1289 futures.push_back(pool.enqueue([&d1, &d2, op, offset, chunk_end]()
1290 {
1291 auto it1 = std::begin(d1);
1292 auto it2 = std::begin(d2);
1293 std::advance(it1, offset);
1294 std::advance(it2, offset);
1295 for (size_t i = offset; i < chunk_end; ++i, ++it1, ++it2)
1296 op(*it1, *it2);
1297 }));
1298
1299 offset = chunk_end;
1300 }
1301
1302 for (auto & f: futures)
1303 f.get();
1304 }
1305
1330 template <typename Container1, typename Container2, typename Op>
1331 [[nodiscard]] auto pzip_maps(ThreadPool & pool, const Container1 & c1,
1332 const Container2 & c2, Op op, size_t chunk_size = 0)
1333 {
1334 using T1 = std::decay_t<decltype(*std::begin(c1))>;
1335 using T2 = std::decay_t<decltype(*std::begin(c2))>;
1336 using ResultT = std::invoke_result_t<Op, const T1 &, const T2 &>;
1337
1338 const size_t n1 = std::distance(std::begin(c1), std::end(c1));
1339 const size_t n2 = std::distance(std::begin(c2), std::end(c2));
1340 const size_t n = std::min(n1, n2);
1341
1342 if (n == 0)
1343 return std::vector<ResultT>{};
1344
1345 if (chunk_size == 0)
1346 chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
1347
1348 auto h1 = parallel_detail::ensure_random_access(c1);
1349 auto h2 = parallel_detail::ensure_random_access(c2);
1350 const auto & d1 = parallel_detail::deref(h1);
1351 const auto & d2 = parallel_detail::deref(h2);
1352
1353 std::vector<ResultT> result(n);
1354 std::vector<std::future<void>> futures;
1355
1356 size_t offset = 0;
1357 while (offset < n)
1358 {
1359 size_t chunk_end = std::min(offset + chunk_size, n);
1360
1361 futures.push_back(pool.enqueue([&result, &d1, &d2, op, offset, chunk_end]()
1362 {
1363 auto it1 = std::begin(d1);
1364 auto it2 = std::begin(d2);
1365 std::advance(it1, offset);
1366 std::advance(it2, offset);
1367 for (size_t i = offset; i < chunk_end; ++i, ++it1, ++it2)
1368 result[i] = op(*it1, *it2);
1369 }));
1370
1371 offset = chunk_end;
1372 }
1373
1374 for (auto & f: futures)
1375 f.get();
1376
1377 return result;
1378 }
1379
1408 template <typename Container1, typename Container2, typename T, typename Op>
1409 [[nodiscard]] T pzip_foldl(ThreadPool & pool, const Container1 & c1,
1410 const Container2 & c2, T init, Op op,
1411 size_t chunk_size = 0)
1412 {
1413 const size_t n1 = std::distance(std::begin(c1), std::end(c1));
1414 const size_t n2 = std::distance(std::begin(c2), std::end(c2));
1415 const size_t n = std::min(n1, n2);
1416
1417 if (n == 0)
1418 return init;
1419
1420 if (chunk_size == 0)
1421 chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
1422
1423 auto h1 = parallel_detail::ensure_random_access(c1);
1424 auto h2 = parallel_detail::ensure_random_access(c2);
1425 const auto & d1 = parallel_detail::deref(h1);
1426 const auto & d2 = parallel_detail::deref(h2);
1427
1428 std::vector<std::future<T>> futures;
1429
1430 size_t offset = 0;
1431 while (offset < n)
1432 {
1433 size_t chunk_end = std::min(offset + chunk_size, n);
1434
1435 futures.push_back(pool.enqueue([&d1, &d2, &init, op, offset, chunk_end]()
1436 {
1437 auto it1 = std::begin(d1);
1438 auto it2 = std::begin(d2);
1439 std::advance(it1, offset);
1440 std::advance(it2, offset);
1441
1442 T local = op(init, *it1++, *it2++);
1443 for (size_t i = offset + 1; i < chunk_end; ++i, ++it1, ++it2)
1444 local = op(local, *it1, *it2);
1445 return local;
1446 }));
1447
1448 offset = chunk_end;
1449 }
1450
1451 // Binary reduce the partial results
1452 // We need a binary op for this - derive it from the ternary op
1453 T result = futures[0].get();
1454 for (size_t i = 1; i < futures.size(); ++i)
1455 {
1456 T val = futures[i].get();
1457 // Combine using addition - user should use pfoldl + pzip_maps for complex cases
1458 result = result + val - init; // Compensate for init being added in each chunk
1459 }
1460
1461 return result;
1462 }
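  // Editorial usage sketch: pzip_foldl combines partial results with
  // `result + val - init`, so it is only reliable for addition-like folds; for
  // other reductions the code above suggests pzip_maps followed by pfoldl.
  //
  //   std::vector<int> a = {1, 2, 3}, b = {10, 20, 30};
  //   auto sums = pzip_maps(pool, a, b, [] (int x, int y) { return x + y; });
  //   int dot = pzip_foldl(pool, a, b, 0,
  //                        [] (int acc, int x, int y) { return acc + x * y; });
  //   // sums == {11, 22, 33}, dot == 140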
1463
1464 // =============================================================================
1465 // Parallel Partition
1466 // =============================================================================
1467
1489 template <typename Container, typename Pred>
1490 [[nodiscard]] auto ppartition(ThreadPool & pool, const Container & c, Pred pred,
1491 size_t chunk_size = 0)
1492 {
1493 using T = std::decay_t<decltype(*std::begin(c))>;
1494
1495 const size_t n = std::distance(std::begin(c), std::end(c));
1496 if (n == 0)
1497 return std::make_pair(std::vector<T>{}, std::vector<T>{});
1498
1499 if (chunk_size == 0)
1500 chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
1501
1502 auto data_holder = parallel_detail::ensure_random_access(c);
1503 const auto & data = parallel_detail::deref(data_holder);
1504
1505 using ChunkResult = std::pair<std::vector<T>, std::vector<T>>;
1506 std::vector<std::future<ChunkResult>> futures;
1507
1508 size_t offset = 0;
1509 while (offset < n)
1510 {
1511 size_t chunk_end = std::min(offset + chunk_size, n);
1512
1513 futures.push_back(pool.enqueue([&data, pred, offset, chunk_end]()
1514 {
1515 std::vector<T> yes, no;
1516 auto it = std::begin(data);
1517 std::advance(it, offset);
1518 for (size_t i = offset; i < chunk_end; ++i, ++it)
1519 {
1520 if (pred(*it))
1521 yes.push_back(*it);
1522 else
1523 no.push_back(*it);
1524 }
1525 return std::make_pair(std::move(yes), std::move(no));
1526 }));
1527
1528 offset = chunk_end;
1529 }
1530
1531 // Merge results in order
1532 std::vector<T> yes_result, no_result;
1533 for (auto & f: futures)
1534 {
1535 auto [yes, no] = f.get();
1536 yes_result.insert(yes_result.end(),
1537 std::make_move_iterator(yes.begin()),
1538 std::make_move_iterator(yes.end()));
1539 no_result.insert(no_result.end(),
1540 std::make_move_iterator(no.begin()),
1541 std::make_move_iterator(no.end()));
1542 }
1543
1544 return std::make_pair(std::move(yes_result), std::move(no_result));
1545 }
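  // Editorial usage sketch: ppartition keeps relative order within each of the
  // two output vectors.
  //
  //   auto [evens, odds] = ppartition(pool, v, [] (int x) { return x % 2 == 0; });
  //   // evens == {2, 4}, odds == {1, 3}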
1546
1547 // =============================================================================
1548 // Variadic Parallel Zip Operations (N containers)
1549 // =============================================================================
1550
1551 namespace parallel_zip_detail
1552 {
 1561 template <typename Container>
 1562 struct ContainerHolder
 1563 {
1564 using value_type = std::decay_t<decltype(*std::begin(std::declval<Container &>()))>;
1565 using holder_type = std::conditional_t<
1566 parallel_detail::has_random_access<Container>(),
1567 const Container *,
1568 std::unique_ptr<std::vector<value_type>>>;
 1569
 1570 holder_type data;
 1571 size_t cached_size;
 1572
1573 explicit ContainerHolder(const Container & c)
1574 {
1575 if constexpr (parallel_detail::has_random_access<Container>())
1576 {
1577 data = &c;
1578 // For random access, std::distance is O(1)
1579 cached_size = static_cast<size_t>(std::distance(std::begin(c), std::end(c)));
1580 }
1581 else
1582 {
1583 // Copy to vector (O(n) - unavoidable), then get size from vector (O(1))
1584 data = std::make_unique<std::vector<value_type>>(std::begin(c), std::end(c));
1585 cached_size = data->size();
1586 }
1587 }
1588
1589 decltype(auto) get() const
1590 {
1591 if constexpr (parallel_detail::has_random_access<Container>())
1592 return *data;
1593 else
1594 return *data;
1595 }
1596
1598 [[nodiscard]] size_t size() const noexcept { return cached_size; }
1599
1600 auto begin() const { return std::begin(get()); }
1601 auto end() const { return std::end(get()); }
1602 };
1603
1605 template <typename... Holders, size_t... Is>
1606 size_t min_holder_size_impl(const std::tuple<Holders...> & holders,
1607 std::index_sequence<Is...>)
1608 {
1609 return std::min({std::get<Is>(holders).size()...});
1610 }
1611
1612 template <typename... Holders>
1613 size_t min_holder_size(const std::tuple<Holders...> & holders)
1614 {
1615 return min_holder_size_impl(holders, std::make_index_sequence<sizeof...(Holders)>{});
1616 }
1617
1619 template <typename... Holders, size_t... Is>
1620 auto make_iterators_at(size_t offset, const std::tuple<Holders...> & holders,
1621 std::index_sequence<Is...>)
1622 {
1623 return std::make_tuple([&]()
1624 {
1625 auto it = std::get<Is>(holders).begin();
1626 std::advance(it, offset);
1627 return it;
1628 }()...);
1629 }
1630
1632 template <typename... Iters, size_t... Is>
1633 void advance_all_iters(std::tuple<Iters...> & iters, std::index_sequence<Is...>)
1634 {
1635 (++std::get<Is>(iters), ...);
1636 }
1637
1639 template <typename... Iters, size_t... Is>
1640 auto deref_all_iters(const std::tuple<Iters...> & iters, std::index_sequence<Is...>)
1641 {
1642 return std::make_tuple(*std::get<Is>(iters)...);
1643 }
1644 } // namespace parallel_zip_detail
1645
1678 template <typename Op, typename... Containers>
1679 void pzip_for_each_n(ThreadPool & pool, Op op, const Containers &... cs)
1680 {
1681 static_assert(sizeof...(Containers) >= 2,
1682 "pzip_for_each requires at least 2 containers");
1683
1684 // Convert all containers to random access FIRST
1685 // This is O(n) for non-RA containers, but unavoidable
1686 auto holders = std::make_tuple(parallel_zip_detail::ContainerHolder<Containers>(cs)...);
1687
1688 // Now get min size - O(1) because all holders have cached sizes
1689 const size_t n = parallel_zip_detail::min_holder_size(holders);
1690 if (n == 0)
1691 return;
1692
1693 const size_t chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
1694
1695 std::vector<std::future<void>> futures;
1696
1697 size_t offset = 0;
1698 while (offset < n)
1699 {
1700 size_t chunk_end = std::min(offset + chunk_size, n);
1701
1702 futures.push_back(pool.enqueue([&holders, op, offset, chunk_end]()
1703 {
1704 constexpr size_t N = sizeof...(Containers);
1705 auto iters = parallel_zip_detail::make_iterators_at(
1706 offset, holders, std::make_index_sequence<N>{});
1707
1708 for (size_t i = offset; i < chunk_end; ++i)
1709 {
1710 std::apply(op, parallel_zip_detail::deref_all_iters(
1711 iters, std::make_index_sequence<N>{}));
1712 parallel_zip_detail::advance_all_iters(
1713 iters, std::make_index_sequence<N>{});
1714 }
1715 }));
1716
1717 offset = chunk_end;
1718 }
1719
1720 for (auto & f: futures)
1721 f.get();
1722 }
1723
1755 template <typename Op, typename... Containers>
1756 [[nodiscard]] auto pzip_maps_n(ThreadPool & pool, Op op, const Containers &... cs)
1757 {
1758 static_assert(sizeof...(Containers) >= 2,
1759 "pzip_maps requires at least 2 containers");
1760
1761 // Deduce result type from operation
1762 using ResultT = std::invoke_result_t<Op,
1763 std::decay_t<decltype(*std::begin(cs))>...>;
1764
1765 // Convert all containers to random access FIRST
1766 auto holders = std::make_tuple(parallel_zip_detail::ContainerHolder<Containers>(cs)...);
1767
1768 // Now get min size - O(1) because all holders have cached sizes
1769 const size_t n = parallel_zip_detail::min_holder_size(holders);
1770 if (n == 0)
1771 return std::vector<ResultT>{};
1772
1773 size_t chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
1774
1775 std::vector<ResultT> result(n);
1776 std::vector<std::future<void>> futures;
1777
1778 size_t offset = 0;
1779 while (offset < n)
1780 {
1781 size_t chunk_end = std::min(offset + chunk_size, n);
1782
1783 futures.push_back(pool.enqueue([&result, &holders, op, offset, chunk_end]()
1784 {
1785 constexpr size_t N = sizeof...(Containers);
1786 auto iters = parallel_zip_detail::make_iterators_at(
1787 offset, holders, std::make_index_sequence<N>{});
1788
1789 for (size_t i = offset; i < chunk_end; ++i)
1790 {
1791 result[i] = std::apply(op, parallel_zip_detail::deref_all_iters(
1792 iters, std::make_index_sequence<N>{}));
1793 parallel_zip_detail::advance_all_iters(
1794 iters, std::make_index_sequence<N>{});
1795 }
1796 }));
1797
1798 offset = chunk_end;
1799 }
1800
1801 for (auto & f: futures)
1802 f.get();
1803
1804 return result;
1805 }
1806
1845 template <typename T, typename Op, typename Combiner, typename... Containers>
1846 [[nodiscard]] T pzip_foldl_n(ThreadPool & pool, T init, Op op, Combiner combiner,
1847 const Containers &... cs)
1848 {
1849 static_assert(sizeof...(Containers) >= 2,
1850 "pzip_foldl requires at least 2 containers");
1851
1852 // Convert all containers to random access FIRST
1853 auto holders = std::make_tuple(parallel_zip_detail::ContainerHolder<Containers>(cs)...);
1854
1855 // Now get min size - O(1) because all holders have cached sizes
1856 const size_t n = parallel_zip_detail::min_holder_size(holders);
1857 if (n == 0)
1858 return init;
1859
1860 size_t chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
1861
1862 std::vector<std::future<T>> futures;
1863
1864 size_t offset = 0;
1865 while (offset < n)
1866 {
1867 size_t chunk_end = std::min(offset + chunk_size, n);
1868
1869 futures.push_back(pool.enqueue([&holders, init, op, offset, chunk_end]()
1870 {
1871 constexpr size_t N = sizeof...(Containers);
1872 auto iters = parallel_zip_detail::make_iterators_at(
1873 offset, holders, std::make_index_sequence<N>{});
1874
1875 // First element
1876 auto first_tuple = parallel_zip_detail::deref_all_iters(
1877 iters, std::make_index_sequence<N>{});
1878 T local = std::apply([&op, &init](auto &&... args)
1879 {
1880 return op(init, std::forward<decltype(args)>(args)
1881 ...);
1882 }, first_tuple);
1883 parallel_zip_detail::advance_all_iters(
1884 iters, std::make_index_sequence<N>{});
1885
1886 // Remaining elements
1887 for (size_t i = offset + 1; i < chunk_end; ++i)
1888 {
1889 auto tuple = parallel_zip_detail::deref_all_iters(
1890 iters, std::make_index_sequence<N>{});
1891 local = std::apply([&op, &local](auto &&... args)
1892 {
1893 return op(local,
1894 std::forward<decltype(args)>(args)
1895 ...);
1896 }, tuple);
1897 parallel_zip_detail::advance_all_iters(
1898 iters, std::make_index_sequence<N>{});
1899 }
1900
1901 return local;
1902 }));
1903
1904 offset = chunk_end;
1905 }
1906
1907 // Combine partial results using the combiner
1908 T result = futures[0].get();
1909 for (size_t i = 1; i < futures.size(); ++i)
1910 result = combiner(result, futures[i].get());
1911
1912 return result;
1913 }
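  // Editorial usage sketch for the variadic zip operations, assuming three
  // equally sized std::vector<double> inputs xs, ys, zs:
  //
  //   auto prods = pzip_maps_n(pool,
  //                            [] (double x, double y, double z) { return x * y * z; },
  //                            xs, ys, zs);
  //   double total = pzip_foldl_n(pool, 0.0,
  //                               [] (double acc, double x, double y, double z)
  //                               { return acc + x * y * z; },
  //                               std::plus<double>{},   // combiner for partial results
  //                               xs, ys, zs);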
1914
1943 template <typename Pred, typename... Containers>
1944 [[nodiscard]] bool pzip_all_n(ThreadPool & pool, Pred pred, const Containers &... cs)
1945 {
1946 static_assert(sizeof...(Containers) >= 2,
1947 "pzip_all requires at least 2 containers");
1948
1949 // Convert all containers to random access FIRST
 1950 auto holders = std::make_tuple(
 1951 parallel_zip_detail::ContainerHolder<Containers>(cs)...);
 1952
1953 // Now get min size - O(1) because all holders have cached sizes
1954 const size_t n = parallel_zip_detail::min_holder_size(holders);
1955 if (n == 0)
1956 return true; // Vacuous truth
1957
1958 const size_t chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
1959
1960 std::atomic<bool> found_false{false};
1961 std::vector<std::future<void>> futures;
1962
1963 size_t offset = 0;
1964 while (offset < n)
1965 {
1966 size_t chunk_end = std::min(offset + chunk_size, n);
1967
1968 futures.push_back(pool.enqueue([&holders, pred, &found_false, offset, chunk_end]()
1969 {
1970 if (found_false.load(std::memory_order_relaxed))
1971 return;
1972
1973 constexpr size_t N = sizeof...(Containers);
1974 auto iters = parallel_zip_detail::make_iterators_at(
1975 offset, holders, std::make_index_sequence<N>{});
1976
1977 for (size_t i = offset; i < chunk_end; ++i)
1978 {
1979 auto tuple = parallel_zip_detail::deref_all_iters(
1980 iters, std::make_index_sequence<N>{});
1981 if (! std::apply(pred, tuple))
1982 {
1983 found_false.store(true, std::memory_order_relaxed);
1984 return;
1985 }
1986 if (found_false.load(std::memory_order_relaxed))
1987 return;
1988 parallel_zip_detail::advance_all_iters(
1989 iters, std::make_index_sequence<N>{});
1990 }
1991 }));
1992
1993 offset = chunk_end;
1994 }
1995
1996 for (auto & f: futures)
1997 f.get();
1998
1999 return not found_false.load();
2000 }
2001
2030 template <typename Pred, typename... Containers>
2031 [[nodiscard]] bool pzip_exists_n(ThreadPool & pool, Pred pred, const Containers &... cs)
2032 {
2033 static_assert(sizeof...(Containers) >= 2,
2034 "pzip_exists requires at least 2 containers");
2035
2036 // Convert all containers to random access FIRST
2037 auto holders = std::make_tuple(parallel_zip_detail::ContainerHolder<Containers>(cs)...);
2038
2039 // Now get min size - O(1) because all holders have cached sizes
2040 const size_t n = parallel_zip_detail::min_holder_size(holders);
2041 if (n == 0)
2042 return false;
2043
2044 const size_t chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
2045
2046 std::atomic<bool> found{false};
2047 std::vector<std::future<void>> futures;
2048
2049 size_t offset = 0;
2050 while (offset < n)
2051 {
2052 size_t chunk_end = std::min(offset + chunk_size, n);
2053
2054 futures.push_back(pool.enqueue([&holders, pred, &found, offset, chunk_end]()
2055 {
2056 if (found.load(std::memory_order_relaxed))
2057 return;
2058
2059 constexpr size_t N = sizeof...(Containers);
2060 auto iters = parallel_zip_detail::make_iterators_at(
2061 offset, holders, std::make_index_sequence<N>{});
2062
2063 for (size_t i = offset; i < chunk_end; ++i)
2064 {
2065 auto tuple = parallel_zip_detail::deref_all_iters(
2066 iters, std::make_index_sequence<N>{});
2067 if (std::apply(pred, tuple))
2068 {
2069 found.store(true, std::memory_order_relaxed);
2070 return;
2071 }
2072 if (found.load(std::memory_order_relaxed))
2073 return;
2074 parallel_zip_detail::advance_all_iters(
2075 iters, std::make_index_sequence<N>{});
2076 }
2077 }));
2078
2079 offset = chunk_end;
2080 }
2081
2082 for (auto & f: futures)
2083 f.get();
2084
2085 return found.load();
2086 }
2087
2103 template <typename Pred, typename... Containers>
2104 [[nodiscard]] size_t pzip_count_if_n(ThreadPool & pool, Pred pred,
2105 const Containers &... cs)
2106 {
2107 static_assert(sizeof...(Containers) >= 2,
2108 "pzip_count_if requires at least 2 containers");
2109
2110 // Convert all containers to random access FIRST
2111 auto holders = std::make_tuple(parallel_zip_detail::ContainerHolder<Containers>(cs)...);
2112
2113 // Now get min size - O(1) because all holders have cached sizes
2114 const size_t n = parallel_zip_detail::min_holder_size(holders);
2115 if (n == 0)
2116 return 0;
2117
2118 size_t chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
2119
2120 std::vector<std::future<size_t>> futures;
2121
2122 size_t offset = 0;
2123 while (offset < n)
2124 {
2125 size_t chunk_end = std::min(offset + chunk_size, n);
2126
2127 futures.push_back(pool.enqueue([&holders, pred, offset, chunk_end]()
2128 {
2129 constexpr size_t N = sizeof...(Containers);
2130 auto iters = parallel_zip_detail::make_iterators_at(
2131 offset, holders, std::make_index_sequence<N>{});
2132
2133 size_t count = 0;
2134 for (size_t i = offset; i < chunk_end; ++i)
2135 {
2136 auto tuple = parallel_zip_detail::deref_all_iters(
2137 iters, std::make_index_sequence<N>{});
2138 if (std::apply(pred, tuple))
2139 ++count;
2140 parallel_zip_detail::advance_all_iters(
2141 iters, std::make_index_sequence<N>{});
2142 }
2143 return count;
2144 }));
2145
2146 offset = chunk_end;
2147 }
2148
2149 size_t total = 0;
2150 for (auto & f: futures)
2151 total += f.get();
2152
2153 return total;
2154 }
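  // Editorial usage sketch, assuming two std::vector<int> inputs a and b:
  //
  //   bool le   = pzip_all_n(pool, [] (int x, int y) { return x <= y; }, a, b);
  //   bool eq   = pzip_exists_n(pool, [] (int x, int y) { return x == y; }, a, b);
  //   size_t nm = pzip_count_if_n(pool, [] (int x, int y) { return x != y; }, a, b);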
2155
2156 // =============================================================================
2157 // Parallel Enumerate
2158 // =============================================================================
2159
 2185 template <typename Container, typename Op>
 2186 void penumerate_for_each(ThreadPool & pool, Container & c, Op op,
 2187 size_t chunk_size = 0)
2188 {
2189 const size_t n = std::distance(std::begin(c), std::end(c));
2190 if (n == 0)
2191 return;
2192
2193 if (chunk_size == 0)
2194 chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
2195
2196 std::vector<std::future<void>> futures;
2197
2198 size_t offset = 0;
2199 while (offset < n)
2200 {
2201 size_t chunk_end = std::min(offset + chunk_size, n);
2202
2203 futures.push_back(pool.enqueue([&c, op, offset, chunk_end]()
2204 {
2205 auto it = std::begin(c);
2206 std::advance(it, offset);
2207 for (size_t i = offset; i < chunk_end; ++i, ++it)
2208 op(i, *it);
2209 }));
2210
2211 offset = chunk_end;
2212 }
2213
2214 for (auto & f: futures)
2215 f.get();
2216 }
2217
2230 template <typename Container, typename Op>
2231 void penumerate_for_each(ThreadPool & pool, const Container & c, Op op,
2232 size_t chunk_size = 0)
2233 {
2234 const size_t n = std::distance(std::begin(c), std::end(c));
2235 if (n == 0)
2236 return;
2237
2238 if (chunk_size == 0)
2239 chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
2240
2241 auto data_holder = parallel_detail::ensure_random_access(c);
2242 const auto & data = parallel_detail::deref(data_holder);
2243
2244 std::vector<std::future<void>> futures;
2245
2246 size_t offset = 0;
2247 while (offset < n)
2248 {
2249 size_t chunk_end = std::min(offset + chunk_size, n);
2250
2251 futures.push_back(pool.enqueue([&data, op, offset, chunk_end]()
2252 {
2253 auto it = std::begin(data);
2254 std::advance(it, offset);
2255 for (size_t i = offset; i < chunk_end; ++i, ++it)
2256 op(i, *it);
2257 }));
2258
2259 offset = chunk_end;
2260 }
2261
2262 for (auto & f: futures)
2263 f.get();
2264 }
2265
2294 template <typename Container, typename Op>
2295 [[nodiscard]] auto penumerate_maps(ThreadPool & pool, const Container & c, Op op,
2296 size_t chunk_size = 0)
2297 {
2298 using T = std::decay_t<decltype(*std::begin(c))>;
2299 using ResultT = std::invoke_result_t<Op, size_t, const T &>;
2300
2301 const size_t n = std::distance(std::begin(c), std::end(c));
2302 if (n == 0)
2303 return std::vector<ResultT>{};
2304
2305 if (chunk_size == 0)
2306 chunk_size = parallel_detail::chunk_size(n, pool.num_threads());
2307
2308 auto data_holder = parallel_detail::ensure_random_access(c);
2309 const auto & data = parallel_detail::deref(data_holder);
2310
2311 std::vector<ResultT> result(n);
2312 std::vector<std::future<void>> futures;
2313
2314 size_t offset = 0;
2315 while (offset < n)
2316 {
2317 size_t chunk_end = std::min(offset + chunk_size, n);
2318
2319 futures.push_back(pool.enqueue([&result, &data, op, offset, chunk_end]()
2320 {
2321 auto it = std::begin(data);
2322 std::advance(it, offset);
2323 for (size_t i = offset; i < chunk_end; ++i, ++it)
2324 result[i] = op(i, *it);
2325 }));
2326
2327 offset = chunk_end;
2328 }
2329
2330 for (auto & f: futures)
2331 f.get();
2332
2333 return result;
2334 }
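  // Editorial usage sketch: the op receives the element index as its first
  // argument.
  //
  //   penumerate_for_each(pool, v, [] (size_t i, int & x) { x += int(i); });
  //   auto scaled = penumerate_maps(pool, v, [] (size_t i, int x) { return int(i) * x; });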
2335
2336 // =============================================================================
2337 // Convenience: Default Pool Variants
2338 // =============================================================================
 2339
 2346 inline ThreadPool & parallel_default_pool()
 2347 {
2348 return default_pool();
2349 }
2350
2351 // Convenience macros for using default pool (optional)
2352#ifdef AH_PARALLEL_USE_DEFAULT_POOL
2353
2354#define PMAP(c, op) pmaps(parallel_default_pool(), c, op)
2355#define PFILTER(c, pred) pfilter(parallel_default_pool(), c, pred)
2356#define PFOLD(c, init, op) pfoldl(parallel_default_pool(), c, init, op)
2357#define PFOR_EACH(c, op) pfor_each(parallel_default_pool(), c, op)
2358#define PALL(c, pred) pall(parallel_default_pool(), c, pred)
2359#define PEXISTS(c, pred) pexists(parallel_default_pool(), c, pred)
2360#define PSUM(c) psum(parallel_default_pool(), c)
2361
2362#endif // AH_PARALLEL_USE_DEFAULT_POOL
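  // Editorial usage sketch: with AH_PARALLEL_USE_DEFAULT_POOL defined before
  // including this header, the macros forward to parallel_default_pool():
  //
  //   auto doubled = PMAP(v, [] (int x) { return 2 * x; });
  //   bool ok      = PALL(v, [] (int x) { return x >= 0; });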
2363} // namespace Aleph
2364
2365#endif // AH_PARALLEL_H