Skip to content

Commit 2ccb478

Browse files
committed
Correct optimization in __reverse_functor and improve explanation
Signed-off-by: Matthew Michel <[email protected]>
1 parent 8a387b2 commit 2ccb478

File tree

1 file changed

+17
-14
lines changed

1 file changed

+17
-14
lines changed

include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h

+17-14
Original file line numberDiff line numberDiff line change
@@ -1169,16 +1169,17 @@ struct __reverse_functor : public walk_vector_or_scalar_base<_Range>
11691169

11701170
template <typename _IsFull, typename _Idx>
11711171
void
1172-
__vector_path_impl(_IsFull __is_full, const _Idx __left_start_idx, _Range __rng) const
1172+
__vector_path_impl(_IsFull, const _Idx __left_start_idx, _Range __rng) const
11731173
{
11741174
std::size_t __n = __size;
11751175
std::size_t __midpoint = __size / 2;
1176-
// If our start is passed the midpoint, then immediately leave as it is guaranteed to be processed by another
1177-
// work-item. There may be some double processing (< 4 elements) between left and right vectors at the
1178-
// "crossover" point within a work item, but allowing this is to happen is likely more performant than
1179-
// additional branching for each work item (see reverse_copy).
1180-
if (__left_start_idx >= __midpoint)
1181-
return;
1176+
1177+
// In the below implementation, we see that _IsFull is ignored in favor of std::true_type{} in all cases.
1178+
// This relaxation is due to the fact that in-place reverse launches work only over the first half of the
1179+
// buffer. As long as __size >= __vec_size there is no risk of an OOB accesses or a race condition. There may
1180+
// exist a single point of double processing between left and right vectors in the last work-item which
1181+
// reverses middle elements. This extra processing of elements <= __vec_size is more performant than applying
1182+
// additional branching (such as in reverse_copy).
11821183

11831184
// 1. Load two vectors that we want to swap: one from the left half of the buffer and one from the right
11841185
const _Idx __right_start_idx = __size - __left_start_idx - __base_t::__preferred_vector_size;
@@ -1187,30 +1188,32 @@ struct __reverse_functor : public walk_vector_or_scalar_base<_Range>
11871188
oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType> __rng_right_vector[__base_t::__preferred_vector_size];
11881189

11891190
oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}(
1190-
__is_full, __left_start_idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng, __rng_left_vector);
1191+
std::true_type{}, __left_start_idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng,
1192+
__rng_left_vector);
11911193
oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}(
1192-
__is_full, __right_start_idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng,
1194+
std::true_type{}, __right_start_idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng,
11931195
__rng_right_vector);
1194-
// 2. Reverse vectors in registers. Note that due to indices we have chosen, there will always be a full vector of elements to load
1196+
// 2. Reverse vectors in registers. Note that due to indices we have chosen, there will always be a full
1197+
// vector of elements to load
11951198
oneapi::dpl::__par_backend_hetero::__vector_reverse<__base_t::__preferred_vector_size>{}(
11961199
std::true_type{}, __left_start_idx, __rng_left_vector);
11971200
oneapi::dpl::__par_backend_hetero::__vector_reverse<__base_t::__preferred_vector_size>{}(
11981201
std::true_type{}, __right_start_idx, __rng_right_vector);
11991202
// 3. Store the left-half vector to the corresponding right-half indices and vice versa
12001203
oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}(
1201-
__is_full, __right_start_idx,
1204+
std::true_type{}, __right_start_idx,
12021205
oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<oneapi::dpl::__internal::__pstl_assign>{},
12031206
__rng_left_vector, __rng);
12041207
oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}(
1205-
__is_full, __left_start_idx,
1208+
std::true_type{}, __left_start_idx,
12061209
oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<oneapi::dpl::__internal::__pstl_assign>{},
12071210
__rng_right_vector, __rng);
12081211
// 4. Call destructors of temporary storage
12091212
oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n}(
1210-
__is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter(),
1213+
std::true_type{}, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter(),
12111214
__rng_left_vector);
12121215
oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n}(
1213-
__is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter(),
1216+
std::true_type{}, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter(),
12141217
__rng_right_vector);
12151218
}
12161219
template <typename _IsFull, typename _Idx>

0 commit comments

Comments
 (0)