@@ -1169,16 +1169,17 @@ struct __reverse_functor : public walk_vector_or_scalar_base<_Range>
1169
1169
1170
1170
template <typename _IsFull, typename _Idx>
1171
1171
void
1172
- __vector_path_impl (_IsFull __is_full , const _Idx __left_start_idx, _Range __rng) const
1172
+ __vector_path_impl (_IsFull, const _Idx __left_start_idx, _Range __rng) const
1173
1173
{
1174
1174
std::size_t __n = __size;
1175
1175
std::size_t __midpoint = __size / 2 ;
1176
- // If our start is passed the midpoint, then immediately leave as it is guaranteed to be processed by another
1177
- // work-item. There may be some double processing (< 4 elements) between left and right vectors at the
1178
- // "crossover" point within a work item, but allowing this is to happen is likely more performant than
1179
- // additional branching for each work item (see reverse_copy).
1180
- if (__left_start_idx >= __midpoint)
1181
- return ;
1176
+
1177
+ // In the below implementation, we see that _IsFull is ignored in favor of std::true_type{} in all cases.
1178
+ // This relaxation is due to the fact that in-place reverse launches work only over the first half of the
1179
+ // buffer. As long as __size >= __vec_size there is no risk of an OOB accesses or a race condition. There may
1180
+ // exist a single point of double processing between left and right vectors in the last work-item which
1181
+ // reverses middle elements. This extra processing of elements <= __vec_size is more performant than applying
1182
+ // additional branching (such as in reverse_copy).
1182
1183
1183
1184
// 1. Load two vectors that we want to swap: one from the left half of the buffer and one from the right
1184
1185
const _Idx __right_start_idx = __size - __left_start_idx - __base_t ::__preferred_vector_size;
@@ -1187,30 +1188,32 @@ struct __reverse_functor : public walk_vector_or_scalar_base<_Range>
1187
1188
oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType> __rng_right_vector[__base_t ::__preferred_vector_size];
1188
1189
1189
1190
oneapi::dpl::__par_backend_hetero::__vector_load<__base_t ::__preferred_vector_size>{__n}(
1190
- __is_full, __left_start_idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng, __rng_left_vector);
1191
+ std::true_type{}, __left_start_idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng,
1192
+ __rng_left_vector);
1191
1193
oneapi::dpl::__par_backend_hetero::__vector_load<__base_t ::__preferred_vector_size>{__n}(
1192
- __is_full , __right_start_idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng,
1194
+ std::true_type{} , __right_start_idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng,
1193
1195
__rng_right_vector);
1194
- // 2. Reverse vectors in registers. Note that due to indices we have chosen, there will always be a full vector of elements to load
1196
+ // 2. Reverse vectors in registers. Note that due to indices we have chosen, there will always be a full
1197
+ // vector of elements to load
1195
1198
oneapi::dpl::__par_backend_hetero::__vector_reverse<__base_t ::__preferred_vector_size>{}(
1196
1199
std::true_type{}, __left_start_idx, __rng_left_vector);
1197
1200
oneapi::dpl::__par_backend_hetero::__vector_reverse<__base_t ::__preferred_vector_size>{}(
1198
1201
std::true_type{}, __right_start_idx, __rng_right_vector);
1199
1202
// 3. Store the left-half vector to the corresponding right-half indices and vice versa
1200
1203
oneapi::dpl::__par_backend_hetero::__vector_store<__base_t ::__preferred_vector_size>{__n}(
1201
- __is_full , __right_start_idx,
1204
+ std::true_type{} , __right_start_idx,
1202
1205
oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<oneapi::dpl::__internal::__pstl_assign>{},
1203
1206
__rng_left_vector, __rng);
1204
1207
oneapi::dpl::__par_backend_hetero::__vector_store<__base_t ::__preferred_vector_size>{__n}(
1205
- __is_full , __left_start_idx,
1208
+ std::true_type{} , __left_start_idx,
1206
1209
oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<oneapi::dpl::__internal::__pstl_assign>{},
1207
1210
__rng_right_vector, __rng);
1208
1211
// 4. Call destructors of temporary storage
1209
1212
oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t ::__preferred_vector_size>{__n}(
1210
- __is_full , 0 , oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter (),
1213
+ std::true_type{} , 0 , oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter (),
1211
1214
__rng_left_vector);
1212
1215
oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t ::__preferred_vector_size>{__n}(
1213
- __is_full , 0 , oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter (),
1216
+ std::true_type{} , 0 , oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter (),
1214
1217
__rng_right_vector);
1215
1218
}
1216
1219
template <typename _IsFull, typename _Idx>
0 commit comments