// Source path: libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h
// Blob: c074eea9861c1b720127ca7721b26b4d585004af
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_TRANSFORM_REDUCE_H
#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_TRANSFORM_REDUCE_H

#include <__algorithm/pstl_backends/cpu_backends/backend.h>
#include <__config>
#include <__iterator/concepts.h>
#include <__iterator/iterator_traits.h>
#include <__numeric/transform_reduce.h>
#include <__pstl/cpu_algos/cpu_traits.h>
#include <__type_traits/desugars_to.h>
#include <__type_traits/is_arithmetic.h>
#include <__type_traits/is_execution_policy.h>
#include <__utility/move.h>
#include <new>
#include <optional>

#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
#  pragma GCC system_header
#endif

_LIBCPP_PUSH_MACROS
#include <__undef_macros>

#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17

_LIBCPP_BEGIN_NAMESPACE_STD

// Overload of __simd_transform_reduce for plain arithmetic summation.
//
// Participates in overload resolution only when _BinaryOperation desugars to a plus operation on
// (_Tp, _UnaryResult) and both _Tp and _UnaryResult are arithmetic types (see the __enable_if_t below).
//
// Parameters:
//   __n    - number of indices to visit; the loop covers [0, __n).
//   __init - initial value of the accumulator; taken by value and used as the running sum.
//   (unnamed _BinaryOperation) - never invoked in this overload; the reduction is written directly as `+=`.
//   __f    - callable invoked with each index; its result is added to the accumulator.
//
// Returns __init plus the sum of __f(i) for every i in [0, __n). _Backend is not referenced by this overload.
template <typename _Backend,
          typename _DifferenceType,
          typename _Tp,
          typename _BinaryOperation,
          typename _UnaryOperation,
          typename _UnaryResult = invoke_result_t<_UnaryOperation, _DifferenceType>,
          __enable_if_t<__desugars_to_v<__plus_tag, _BinaryOperation, _Tp, _UnaryResult> && is_arithmetic_v<_Tp> &&
                            is_arithmetic_v<_UnaryResult>,
                        int>    = 0>
_LIBCPP_HIDE_FROM_ABI _Tp
__simd_transform_reduce(_DifferenceType __n, _Tp __init, _BinaryOperation, _UnaryOperation __f) noexcept {
  // NOTE(review): _PSTL_PRAGMA_SIMD_REDUCTION is defined outside this file; presumably it expands to a SIMD
  // reduction pragma that must immediately precede the for loop it annotates - confirm before reshaping the loop.
  _PSTL_PRAGMA_SIMD_REDUCTION(+ : __init)
  for (_DifferenceType __i = 0; __i < __n; ++__i)
    __init += __f(__i);
  return __init;
}

// Generic overload of __simd_transform_reduce, selected whenever the arithmetic-plus overload is not
// (the __enable_if_t condition below is the exact negation of that overload's condition).
//
// Parameters:
//   __n         - number of indices to visit; indices [0, __n) are passed to __f.
//   __init      - initial value of the reduction, taken by value and returned as the result.
//   __binary_op - reduction operation combining two partial results.
//   __f         - callable producing the value for a given index.
//
// Strategy: a stack buffer of __lane_size bytes is treated as __block_size partial results of type _Tp.
// Partial result k accumulates the values of indices k, k + __block_size, k + 2 * __block_size, ...
// and all partial results are finally folded into __init. Elements are therefore combined in a different
// order than a plain left-to-right fold. When the input is too short (or only one _Tp fits in the buffer),
// a plain left-to-right fold is used instead.
template <typename _Backend,
          typename _Size,
          typename _Tp,
          typename _BinaryOperation,
          typename _UnaryOperation,
          typename _UnaryResult = invoke_result_t<_UnaryOperation, _Size>,
          __enable_if_t<!(__desugars_to_v<__plus_tag, _BinaryOperation, _Tp, _UnaryResult> && is_arithmetic_v<_Tp> &&
                          is_arithmetic_v<_UnaryResult>),
                        int>    = 0>
_LIBCPP_HIDE_FROM_ABI _Tp
__simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _UnaryOperation __f) noexcept {
  constexpr size_t __lane_size = __pstl::__cpu_traits<_Backend>::__lane_size;
  const _Size __block_size     = __lane_size / sizeof(_Tp);

  // The blocked scheme needs more than two full blocks of input and at least two partial results.
  const bool __use_blocks = __n > 2 * __block_size && __block_size > 1;
  if (!__use_blocks) {
    // Scalar fallback: straightforward left fold over all indices.
    for (_Size __idx = 0; __idx < __n; ++__idx)
      __init = __binary_op(std::move(__init), __f(__idx));
    return __init;
  }

  // Raw, suitably aligned storage; the _Tp objects living in it are created and destroyed by hand below.
  alignas(__lane_size) char __storage[__lane_size];
  _Tp* __partial = reinterpret_cast<_Tp*>(__storage);

  // Construct every partial result from the first two blocks of input.
  _PSTL_PRAGMA_SIMD
  for (_Size __k = 0; __k < __block_size; ++__k)
    ::new (__partial + __k) _Tp(__binary_op(__f(__k), __f(__block_size + __k)));

  // Consume the remaining full blocks, one block per outer iteration.
  const _Size __full_blocks_end = __block_size * (__n / __block_size);
  for (_Size __base = 2 * __block_size; __base < __full_blocks_end; __base += __block_size) {
    _PSTL_PRAGMA_SIMD
    for (_Size __k = 0; __k < __block_size; ++__k)
      __partial[__k] = __binary_op(std::move(__partial[__k]), __f(__base + __k));
  }

  // Consume the trailing elements that do not fill a whole block.
  const _Size __tail_count = __n - __full_blocks_end;
  _PSTL_PRAGMA_SIMD
  for (_Size __k = 0; __k < __tail_count; ++__k)
    __partial[__k] = __binary_op(std::move(__partial[__k]), __f(__full_blocks_end + __k));

  // Fold all partial results into the accumulator, in index order.
  for (_Size __k = 0; __k < __block_size; ++__k)
    __init = __binary_op(std::move(__init), std::move(__partial[__k]));

  // End the lifetime of the manually constructed partial results.
  _PSTL_PRAGMA_SIMD
  for (_Size __k = 0; __k < __block_size; ++__k)
    __partial[__k].~_Tp();

  return __init;
}

// CPU-backend implementation of the two-range transform_reduce.
//
// Parameters:
//   __first1, __last1 - first input range.
//   __first2          - start of the second input range, read in lock-step with the first.
//   __init            - initial value of the reduction.
//   __reduce          - binary operation combining partial results.
//   __transform       - binary operation applied to one element of each range.
//
// Dispatch (decided at compile time):
//   * parallel policy + random-access iterators: forwards to __cpu_traits<__cpu_backend_tag>::__transform_reduce,
//     handing it a per-element transform and a "brick" callable that reduces a sub-range by re-entering this
//     function with the parallel part of the policy removed.
//   * unsequenced policy + random-access iterators: index-based reduction through __simd_transform_reduce.
//   * anything else: the serial std::transform_reduce.
//
// The last two branches return a _Tp converted to optional<_Tp>; the parallel branch returns whatever the
// backend's __transform_reduce returns.
template <class _ExecutionPolicy,
          class _ForwardIterator1,
          class _ForwardIterator2,
          class _Tp,
          class _BinaryOperation1,
          class _BinaryOperation2>
_LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
    __cpu_backend_tag,
    _ForwardIterator1 __first1,
    _ForwardIterator1 __last1,
    _ForwardIterator2 __first2,
    _Tp __init,
    _BinaryOperation1 __reduce,
    _BinaryOperation2 __transform) {
  if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
                __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) {
    return __pstl::__cpu_traits<__cpu_backend_tag>::__transform_reduce(
        __first1,
        std::move(__last1),
        [__first1, __first2, __transform](_ForwardIterator1 __iter) {
          return __transform(*__iter, *(__first2 + (__iter - __first1)));
        },
        std::move(__init),
        // Deliberately passed by copy rather than std::move: the brick lambda below also captures __reduce by
        // copy, and the initialization order of function arguments is unspecified. Moving here could leave the
        // lambda with a moved-from functor. This matches the single-range overload.
        __reduce,
        [__first1, __first2, __reduce, __transform](
            _ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last, _Tp __brick_init) {
          // The lambda is not mutable, so its captures are const and the std::move calls on __reduce and
          // __transform below bind to copies; the captured functors stay intact across invocations.
          auto __res = std::__pstl_transform_reduce<__remove_parallel_policy_t<_ExecutionPolicy>>(
              __cpu_backend_tag{},
              __brick_first,
              std::move(__brick_last),
              __first2 + (__brick_first - __first1),
              std::move(__brick_init),
              std::move(__reduce),
              std::move(__transform));
          // Check the optional before dereferencing it, as the single-range overload does.
          _LIBCPP_ASSERT_INTERNAL(__res, "unseq/seq should never try to allocate!");
          return *std::move(__res);
        });
  } else if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
                       __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value &&
                       __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) {
    // The index lambda captures by reference; it is only used for the duration of this call expression.
    return std::__simd_transform_reduce<__cpu_backend_tag>(
        __last1 - __first1, std::move(__init), std::move(__reduce), [&](__iter_diff_t<_ForwardIterator1> __i) {
          return __transform(__first1[__i], __first2[__i]);
        });
  } else {
    return std::transform_reduce(
        std::move(__first1),
        std::move(__last1),
        std::move(__first2),
        std::move(__init),
        std::move(__reduce),
        std::move(__transform));
  }
}

// CPU-backend implementation of the single-range transform_reduce.
//
// Parameters:
//   __first, __last - input range.
//   __init          - initial value of the reduction.
//   __reduce        - binary operation combining partial results.
//   __transform     - unary operation applied to each element before it is reduced.
//
// Dispatch (decided at compile time):
//   * parallel policy + random-access iterator: forwards to __cpu_traits<__cpu_backend_tag>::__transform_reduce
//     with a per-element transform and a "brick" callable that reduces a sub-range by re-entering this function
//     with the parallel part of the policy removed.
//   * unsequenced policy + random-access iterator: index-based reduction through __simd_transform_reduce.
//   * anything else: the serial std::transform_reduce.
template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class _BinaryOperation, class _UnaryOperation>
_LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce(
    __cpu_backend_tag,
    _ForwardIterator __first,
    _ForwardIterator __last,
    _Tp __init,
    _BinaryOperation __reduce,
    _UnaryOperation __transform) {
  constexpr bool __is_random_access = __has_random_access_iterator_category_or_concept<_ForwardIterator>::value;

  if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && __is_random_access) {
    return __pstl::__cpu_traits<__cpu_backend_tag>::__transform_reduce(
        std::move(__first),
        std::move(__last),
        [__transform](_ForwardIterator __it) { return __transform(*__it); },
        std::move(__init),
        __reduce,
        [__transform, __reduce](auto __chunk_first, auto __chunk_last, _Tp __chunk_init) {
          // Reduce one sub-range with the non-parallel counterpart of the requested policy.
          using _InnerPolicy = __remove_parallel_policy_t<_ExecutionPolicy>;
          auto __chunk_result = std::__pstl_transform_reduce<_InnerPolicy>(
              __cpu_backend_tag{},
              std::move(__chunk_first),
              std::move(__chunk_last),
              std::move(__chunk_init),
              std::move(__reduce),
              std::move(__transform));
          _LIBCPP_ASSERT_INTERNAL(__chunk_result, "unseq/seq should never try to allocate!");
          return *std::move(__chunk_result);
        });
  } else if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> && __is_random_access) {
    // The index lambda keeps its own copy of __first and refers to __transform by reference.
    return std::__simd_transform_reduce<__cpu_backend_tag>(
        __last - __first,
        std::move(__init),
        std::move(__reduce),
        [__first, &__transform](__iter_diff_t<_ForwardIterator> __idx) { return __transform(__first[__idx]); });
  } else {
    return std::transform_reduce(
        std::move(__first),
        std::move(__last),
        std::move(__init),
        std::move(__reduce),
        std::move(__transform));
  }
}

_LIBCPP_END_NAMESPACE_STD

#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17

_LIBCPP_POP_MACROS

#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_TRANSFORM_REDUCE_H