diff --git a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h index 4d7b81e6a2e..4fc274f2445 100644 --- a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h +++ b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_submitters.h @@ -27,7 +27,7 @@ namespace oneapi::dpl::experimental::kt::gpu::esimd::__impl { //------------------------------------------------------------------------ -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation //------------------------------------------------------------------------ template , _ExecutionPolicy&& __exec, _For auto __future_obj = oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, __buf.all_view()); + unseq_backend::walk1_vector_or_scalar<_ExecutionPolicy, _Function, decltype(__buf.all_view())>{ + __f, static_cast(__n)}, + __n, __buf.all_view()); return __future_obj; } @@ -67,7 +69,9 @@ __pattern_walk2_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, __buf1.all_view(), __buf2.all_view()); + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__buf1.all_view()), + decltype(__buf2.all_view())>{__f, static_cast(__n)}, + __n, __buf1.all_view(), __buf2.all_view()); return __future.__make_future(__first2 + __n); } @@ -91,10 +95,12 @@ __pattern_walk3_async(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _For oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _ForwardIterator3>(); auto __buf3 = __keep3(__first3, __first3 + __n); - auto __future = - oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, - __buf1.all_view(), __buf2.all_view(), __buf3.all_view()); + auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk3_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__buf1.all_view()), + decltype(__buf2.all_view()), decltype(__buf3.all_view())>{ + __f, static_cast(__n)}, + __n, __buf1.all_view(), __buf2.all_view(), __buf3.all_view()); return __future.__make_future(__first3 + __n); } diff --git a/include/oneapi/dpl/internal/binary_search_impl.h b/include/oneapi/dpl/internal/binary_search_impl.h index ef01be4b161..d06065e8c55 100644 --- a/include/oneapi/dpl/internal/binary_search_impl.h +++ b/include/oneapi/dpl/internal/binary_search_impl.h @@ -37,13 +37,19 @@ enum class search_algorithm binary_search }; -template -struct custom_brick +#if _ONEDPL_BACKEND_SYCL +template +struct __custom_brick : oneapi::dpl::unseq_backend::walk_scalar_base<_Range> { Comp comp; T size; bool use_32bit_indexing; + __custom_brick(Comp comp, T size, bool use_32bit_indexing) + : comp(comp), size(size), use_32bit_indexing(use_32bit_indexing) + { + } + template void search_impl(_ItemId idx, _Acc acc) const @@ -68,17 +74,23 @@ struct custom_brick get<2>(acc[idx]) = (value != end_orig) && (get<1>(acc[idx]) == get<0>(acc[value])); } } - - template + template void - operator()(_ItemId idx, _Acc acc) const + __scalar_path_impl(_IsFull, _ItemId idx, _Acc acc) const { if (use_32bit_indexing) search_impl(idx, acc); else search_impl(idx, acc); } + template + void + operator()(_IsFull __is_full, _ItemId idx, _Acc acc) const + { + __scalar_path_impl(__is_full, idx, acc); + } }; +#endif template @@ -155,7 +167,8 @@ lower_bound_impl(__internal::__hetero_tag<_BackendTag>, Policy&& policy, InputIt const bool use_32bit_indexing = size <= std::numeric_limits::max(); __bknd::__parallel_for( _BackendTag{}, ::std::forward(policy), - custom_brick{comp, size, use_32bit_indexing}, + __custom_brick{ + comp, size, use_32bit_indexing}, value_size, zip_vw) .__deferrable_wait(); return result + value_size; @@ -187,7 +200,8 @@ upper_bound_impl(__internal::__hetero_tag<_BackendTag>, Policy&& policy, InputIt const bool use_32bit_indexing = size <= std::numeric_limits::max(); __bknd::__parallel_for( _BackendTag{}, std::forward(policy), - custom_brick{comp, size, use_32bit_indexing}, + __custom_brick{ + comp, size, use_32bit_indexing}, value_size, zip_vw) .__deferrable_wait(); return result + value_size; @@ -217,10 +231,11 @@ binary_search_impl(__internal::__hetero_tag<_BackendTag>, Policy&& policy, Input auto result_buf = keep_result(result, result + value_size); auto zip_vw = make_zip_view(input_buf.all_view(), value_buf.all_view(), result_buf.all_view()); const bool use_32bit_indexing = size <= std::numeric_limits::max(); - __bknd::__parallel_for(_BackendTag{}, std::forward(policy), - custom_brick{ - comp, size, use_32bit_indexing}, - value_size, zip_vw) + __bknd::__parallel_for( + _BackendTag{}, std::forward(policy), + __custom_brick{ + comp, size, use_32bit_indexing}, + value_size, zip_vw) .__deferrable_wait(); return result + value_size; } diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 65bf99c8777..6d8fd17aeaa 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -57,7 +57,10 @@ __pattern_walk1(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ForwardIt auto __buf = __keep(__first, __last); oneapi::dpl::__par_backend_hetero::__parallel_for( - _BackendTag{}, __exec, unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, __buf.all_view()) + _BackendTag{}, __exec, + unseq_backend::walk1_vector_or_scalar<_ExecutionPolicy, _Function, decltype(__buf.all_view())>{ + __f, static_cast(__n)}, + __n, __buf.all_view()) .__deferrable_wait(); } @@ -103,7 +106,9 @@ __pattern_walk2(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ForwardIt auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, __buf1.all_view(), __buf2.all_view()); + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__buf1.all_view()), + decltype(__buf2.all_view())>{__f, static_cast(__n)}, + __n, __buf1.all_view(), __buf2.all_view()); // Call no wait, wait or deferrable wait depending on _WaitMode __future.wait(_WaitMode{}); @@ -130,10 +135,25 @@ _ForwardIterator2 __pattern_swap(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, _Function __f) { - return __pattern_walk2( - __tag, ::std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __f); + auto __n = __last1 - __first1; + if (__n == 0) + return __first2; + + auto __keep1 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _ForwardIterator1>(); + auto __buf1 = __keep1(__first1, __last1); + + auto __keep2 = + oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _ForwardIterator2>(); + auto __buf2 = __keep2(__first2, __first2 + __n); + + auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::__brick_swap<_ExecutionPolicy, _Function, decltype(__buf1.all_view()), + decltype(__buf2.all_view())>{__f, static_cast(__n)}, + __n, __buf1.all_view(), __buf2.all_view()); + __future.wait(__par_backend_hetero::__deferrable_mode{}); + return __first2 + __n; } //------------------------------------------------------------------------ @@ -160,9 +180,12 @@ __pattern_walk3(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ForwardIt auto __keep3 = oneapi::dpl::__ranges::__get_sycl_range<__acc_mode3, _ForwardIterator3>(); auto __buf3 = __keep3(__first3, __first3 + __n); - oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, - __buf1.all_view(), __buf2.all_view(), __buf3.all_view()) + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk3_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__buf1.all_view()), + decltype(__buf2.all_view()), decltype(__buf3.all_view())>{ + __f, static_cast(__n)}, + __n, __buf1.all_view(), __buf2.all_view(), __buf3.all_view()) .__deferrable_wait(); return __first3 + __n; @@ -1564,9 +1587,10 @@ __pattern_reverse(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterato auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator>(); auto __buf = __keep(__first, __last); oneapi::dpl::__par_backend_hetero::__parallel_for( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::__reverse_functor::difference_type>{__n}, __n / 2, - __buf.all_view()) + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::__reverse_functor::difference_type, + decltype(__buf.all_view())>{__n}, + __n / 2, __buf.all_view()) .__deferrable_wait(); } @@ -1590,8 +1614,9 @@ __pattern_reverse_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Bi oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _ForwardIterator>(); auto __buf2 = __keep2(__result, __result + __n); oneapi::dpl::__par_backend_hetero::__parallel_for( - _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::__reverse_copy::difference_type>{__n}, + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), + unseq_backend::__reverse_copy::difference_type, + decltype(__buf1.all_view()), decltype(__buf2.all_view())>{__n}, __n, __buf1.all_view(), __buf2.all_view()) .__deferrable_wait(); @@ -1632,18 +1657,20 @@ __pattern_rotate(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Iterator const auto __shift = __new_first - __first; oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__rotate_wrapper>(__exec), - unseq_backend::__rotate_copy::difference_type>{__n, __shift}, __n, - __buf.all_view(), __temp_rng_w); + unseq_backend::__rotate_copy::difference_type, + decltype(__buf.all_view()), decltype(__temp_rng_w)>{__n, __shift}, + __n, __buf.all_view(), __temp_rng_w); //An explicit wait isn't required here because we are working with a temporary sycl::buffer and sycl accessors and //SYCL runtime makes a dependency graph to prevent the races between two __parallel_for patterns. using _Function = __brick_move<__hetero_tag<_BackendTag>, _ExecutionPolicy>; - auto __brick = unseq_backend::walk_n<_ExecutionPolicy, _Function>{_Function{}}; - auto __temp_rng_rw = oneapi::dpl::__ranges::all_view<_Tp, __par_backend_hetero::access_mode::read_write>(__temp_buf.get_buffer()); - oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __brick, + auto __brick = + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__temp_rng_rw), + decltype(__buf.all_view())>{_Function{}, static_cast(__n)}; + oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __brick, __n, __temp_rng_rw, __buf.all_view()) .__deferrable_wait(); @@ -1677,8 +1704,8 @@ __pattern_rotate_copy(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Bid oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::__rotate_copy::difference_type>{__n, - __shift}, + unseq_backend::__rotate_copy::difference_type, + decltype(__buf1.all_view()), decltype(__buf2.all_view())>{__n, __shift}, __n, __buf1.all_view(), __buf2.all_view()) .__deferrable_wait(); @@ -1936,19 +1963,22 @@ __pattern_shift_left(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Rang if (__n >= __mid) { using _Function = __brick_move<__hetero_tag<_BackendTag>, _ExecutionPolicy>; - auto __brick = oneapi::dpl::unseq_backend::walk_n<_ExecutionPolicy, _Function>{_Function{}}; //TODO: to consider use just "read" access mode for a source range and just "write" - for a destination range. auto __src = oneapi::dpl::__ranges::drop_view_simple<_Range, _DiffType>(__rng, __n); auto __dst = oneapi::dpl::__ranges::take_view_simple<_Range, _DiffType>(__rng, __size_res); + auto __brick = + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, decltype(__src), decltype(__dst)>{ + _Function{}, static_cast(__size_res)}; + oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), __brick, __size_res, __src, __dst) .__deferrable_wait(); } else //2. n < size/2; 'n' parallel copying { - auto __brick = unseq_backend::__brick_shift_left<_ExecutionPolicy, _DiffType>{__size, __n}; + auto __brick = unseq_backend::__brick_shift_left<_ExecutionPolicy, _DiffType, decltype(__rng)>{__size, __n}; oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__shift_left_right>( diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index da7820b91a2..d0af8d513ff 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -57,10 +57,35 @@ __pattern_walk_n(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Function auto __n = oneapi::dpl::__ranges::__get_first_range_size(__rngs...); if (__n > 0) { - oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n, - ::std::forward<_Ranges>(__rngs)...) - .__deferrable_wait(); + constexpr std::size_t __num_ranges = sizeof...(_Ranges); + static_assert(__num_ranges <= 3, "__pattern_walk_n only supports up to three packed range parameters"); + if constexpr (__num_ranges == 1) + { + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk1_vector_or_scalar<_ExecutionPolicy, _Function, std::decay_t<_Ranges>...>{ + __f, static_cast(__n)}, + __n, ::std::forward<_Ranges>(__rngs)...) + .__deferrable_wait(); + } + else if constexpr (__num_ranges == 2) + { + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _Function, std::decay_t<_Ranges>...>{ + __f, static_cast(__n)}, + __n, ::std::forward<_Ranges>(__rngs)...) + .__deferrable_wait(); + } + else + { + oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, ::std::forward<_ExecutionPolicy>(__exec), + unseq_backend::walk3_vectors_or_scalars<_ExecutionPolicy, _Function, std::decay_t<_Ranges>...>{ + __f, static_cast(__n)}, + __n, ::std::forward<_Ranges>(__rngs)...) + .__deferrable_wait(); + } } } @@ -147,20 +172,27 @@ __pattern_swap(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Rang { if (__rng1.size() <= __rng2.size()) { - oneapi::dpl::__internal::__ranges::__pattern_walk_n( - __tag, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__swap1_wrapper>( - ::std::forward<_ExecutionPolicy>(__exec)), - __f, __rng1, __rng2); - return __rng1.size(); + std::size_t __n = __rng1.size(); + auto __exec1 = oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__swap1_wrapper>( + std::forward<_ExecutionPolicy>(__exec)); + auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::move(__exec1), + unseq_backend::__brick_swap, std::decay_t<_Range2>>{ + __f, __n}, + __n, __rng1, __rng2); + __future.wait(__par_backend_hetero::__deferrable_mode{}); + return __n; } - - oneapi::dpl::__internal::__ranges::__pattern_walk_n( - __tag, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__swap2_wrapper>( - ::std::forward<_ExecutionPolicy>(__exec)), - __f, __rng2, __rng1); - return __rng2.size(); + std::size_t __n = __rng2.size(); + auto __exec2 = + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__swap2_wrapper>(std::forward<_ExecutionPolicy>(__exec)); + auto __future = oneapi::dpl::__par_backend_hetero::__parallel_for( + _BackendTag{}, std::move(__exec2), + unseq_backend::__brick_swap, std::decay_t<_Range1>>{__f, + __n}, + __n, __rng2, __rng1); + __future.wait(__par_backend_hetero::__deferrable_mode{}); + return __n; } //------------------------------------------------------------------------ @@ -625,8 +657,9 @@ __pattern_unique_copy(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec _BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_wrapper>( std::forward<_ExecutionPolicy>(__exec)), - unseq_backend::walk_n<_ExecutionPolicy, _CopyBrick>{_CopyBrick{}}, __n, std::forward<_Range1>(__rng), - std::forward<_Range2>(__result)) + unseq_backend::walk2_vectors_or_scalars<_ExecutionPolicy, _CopyBrick, std::decay_t<_Range1>, + std::decay_t<_Range2>>{_CopyBrick{}, static_cast(__n)}, + __n, std::forward<_Range1>(__rng), std::forward<_Range2>(__result)) .get(); return 1; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index c7b72625315..47782444bdf 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -36,6 +36,7 @@ #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" +#include "parallel_backend_sycl_for.h" #include "parallel_backend_sycl_reduce.h" #include "parallel_backend_sycl_merge.h" #include "parallel_backend_sycl_merge_sort.h" @@ -218,57 +219,11 @@ class __scan_single_wg_dynamic_kernel; template class __scan_copy_single_wg_kernel; -//------------------------------------------------------------------------ -// parallel_for - async pattern -//------------------------------------------------------------------------ - -// Use the trick with incomplete type and partial specialization to deduce the kernel name -// as the parameter pack that can be empty (for unnamed kernels) or contain exactly one -// type (for explicitly specified name by the user) -template -struct __parallel_for_submitter; - -template -struct __parallel_for_submitter<__internal::__optional_kernel_name<_Name...>> -{ - template - auto - operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const - { - assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0); - _PRINT_INFO_IN_DEBUG_MODE(__exec); - auto __event = __exec.queue().submit([&__rngs..., &__brick, __count](sycl::handler& __cgh) { - //get an access to data under SYCL buffer: - oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); - - __cgh.parallel_for<_Name...>(sycl::range(__count), [=](sycl::item __item_id) { - auto __idx = __item_id.get_linear_id(); - __brick(__idx, __rngs...); - }); - }); - return __future(__event); - } -}; - -//General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for, -//for some algorithms happens that size of processing range is n, but amount of iterations is n/2. -template -auto -__parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Fp __brick, _Index __count, - _Ranges&&... __rngs) -{ - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - using _ForKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<_CustomName>; - - return __parallel_for_submitter<_ForKernel>()(::std::forward<_ExecutionPolicy>(__exec), __brick, __count, - ::std::forward<_Ranges>(__rngs)...); -} - //------------------------------------------------------------------------ // parallel_transform_scan - async pattern //------------------------------------------------------------------------ -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_scan_submitter; @@ -2139,7 +2094,7 @@ struct __partial_merge_kernel } }; -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_partial_sort_submitter; @@ -2374,7 +2329,8 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ oneapi::dpl::__par_backend_hetero::__parallel_for( oneapi::dpl::__internal::__device_backend_tag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce1_wrapper>(__exec), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), __intermediate_result_end, + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n), _Range2>(__binary_op, __n), + __intermediate_result_end, oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), __intermediate_result_end), std::forward<_Range2>(__values), oneapi::dpl::__ranges::views::all_write(__tmp_out_values)) @@ -2420,7 +2376,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ oneapi::dpl::__internal::__device_backend_tag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce2_wrapper>( std::forward<_ExecutionPolicy>(__exec)), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end)>( + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end), _Range4>( __binary_op, __intermediate_result_end), __result_end, oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), __result_end), diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h new file mode 100644 index 00000000000..46e36d1fafb --- /dev/null +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_for.h @@ -0,0 +1,208 @@ +// -*- C++ -*- +//===-- parallel_backend_sycl_for.h --------------------------------===// +// +// Copyright (C) Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file incorporates work covered by the following copyright and permission +// notice: +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_FOR_H +#define _ONEDPL_PARALLEL_BACKEND_SYCL_FOR_H + +#include +#include +#include +#include + +#include "sycl_defs.h" +#include "parallel_backend_sycl_utils.h" +#include "execution_sycl_defs.h" +#include "unseq_backend_sycl.h" +#include "utils_ranges_sycl.h" + +#include "sycl_traits.h" //SYCL traits specialization for some oneDPL types. + +namespace oneapi +{ +namespace dpl +{ +namespace __par_backend_hetero +{ + +template +class __parallel_for_small_kernel; + +template +class __parallel_for_large_kernel; + +//------------------------------------------------------------------------ +// parallel_for - async pattern +//------------------------------------------------------------------------ + +// Use the trick with incomplete type and partial specialization to deduce the kernel name +// as the parameter pack that can be empty (for unnamed kernels) or contain exactly one +// type (for explicitly specified name by the user) +template +struct __parallel_for_small_submitter; + +template +struct __parallel_for_small_submitter<__internal::__optional_kernel_name<_Name...>> +{ + template + auto + operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const + { + assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0); + _PRINT_INFO_IN_DEBUG_MODE(__exec); + auto __event = __exec.queue().submit([__rngs..., __brick, __count](sycl::handler& __cgh) { + //get an access to data under SYCL buffer: + oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); + + __cgh.parallel_for<_Name...>(sycl::range(__count), [=](sycl::item __item_id) { + auto __idx = __item_id.get_linear_id(); + // For small inputs, do not vectorize or perform multiple iterations per work item. Spread input evenly + // across compute units. + __brick.__scalar_path_impl(std::true_type{}, __idx, __rngs...); + }); + }); + return __future(__event); + } +}; + +template +struct __parallel_for_large_submitter; + +template +struct __parallel_for_large_submitter<__internal::__optional_kernel_name<_Name...>, _RangeTypes...> +{ + // Limit the work-group size to 512 which has empirically yielded the best results across different architectures. + static constexpr std::uint16_t __max_work_group_size = 512; + + // SPIR-V compilation targets show best performance with a stride of the sub-group size. + // Other compilation targets perform best with a work-group size stride. This utility can only be called from the + // device. + static inline std::tuple + __stride_recommender(const sycl::nd_item<1>& __item, std::size_t __count, std::size_t __iters_per_work_item, + std::size_t __adj_elements_per_work_item, std::size_t __work_group_size) + { + const std::size_t __work_group_id = __item.get_group().get_group_linear_id(); + if constexpr (oneapi::dpl::__internal::__is_spirv_target_v) + { + const __dpl_sycl::__sub_group __sub_group = __item.get_sub_group(); + const std::uint32_t __sub_group_size = __sub_group.get_local_linear_range(); + const std::uint32_t __sub_group_id = __sub_group.get_group_linear_id(); + const std::uint32_t __sub_group_local_id = __sub_group.get_local_linear_id(); + + const std::size_t __sub_group_start_idx = + __iters_per_work_item * __adj_elements_per_work_item * + (__work_group_id * __work_group_size + __sub_group_size * __sub_group_id); + const bool __is_full_sub_group = + __sub_group_start_idx + __iters_per_work_item * __adj_elements_per_work_item * __sub_group_size <= + __count; + const std::size_t __work_item_idx = + __sub_group_start_idx + __adj_elements_per_work_item * __sub_group_local_id; + return std::tuple(__work_item_idx, __adj_elements_per_work_item * __sub_group_size, __is_full_sub_group); + } + else + { + const std::size_t __work_group_start_idx = + __work_group_id * __work_group_size * __iters_per_work_item * __adj_elements_per_work_item; + const std::size_t __work_item_idx = + __work_group_start_idx + __item.get_local_linear_id() * __adj_elements_per_work_item; + const bool __is_full_work_group = + __work_group_start_idx + __iters_per_work_item * __work_group_size * __adj_elements_per_work_item <= + __count; + return std::tuple(__work_item_idx, __work_group_size * __adj_elements_per_work_item, __is_full_work_group); + } + } + + // Once there is enough work to launch a group on each compute unit with our chosen __iters_per_item, + // then we should start using this code path. + template + static std::size_t + __estimate_best_start_size(const _ExecutionPolicy& __exec, _Fp __brick) + { + const std::size_t __work_group_size = + oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); + const std::uint32_t __max_cu = oneapi::dpl::__internal::__max_compute_units(__exec); + return __work_group_size * _Fp::__preferred_iters_per_item * __max_cu; + } + + template + auto + operator()(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&... __rngs) const + { + assert(oneapi::dpl::__ranges::__get_first_range_size(__rngs...) > 0); + const std::size_t __work_group_size = + oneapi::dpl::__internal::__max_work_group_size(__exec, __max_work_group_size); + _PRINT_INFO_IN_DEBUG_MODE(__exec); + auto __event = __exec.queue().submit([__rngs..., __brick, __work_group_size, __count](sycl::handler& __cgh) { + //get an access to data under SYCL buffer: + oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); + constexpr static std::uint16_t __iters_per_work_item = _Fp::__preferred_iters_per_item; + const std::size_t __num_groups = oneapi::dpl::__internal::__dpl_ceiling_div( + __count, (__work_group_size * _Fp::__preferred_vector_size * __iters_per_work_item)); + __cgh.parallel_for<_Name...>( + sycl::nd_range(sycl::range<1>(__num_groups * __work_group_size), sycl::range<1>(__work_group_size)), + [=](sycl::nd_item __item) { + auto [__idx, __stride, __is_full] = __stride_recommender( + __item, __count, __iters_per_work_item, _Fp::__preferred_vector_size, __work_group_size); + __strided_loop<__iters_per_work_item> __execute_loop{static_cast(__count)}; + if (__is_full) + { + __execute_loop(std::true_type{}, __idx, __stride, __brick, __rngs...); + } + // If we are not full, then take this branch only if there is work to process. + else if (__idx < __count) + { + __execute_loop(std::false_type{}, __idx, __stride, __brick, __rngs...); + } + }); + }); + return __future(__event); + } +}; + +//General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for, +//for some algorithms happens that size of processing range is n, but amount of iterations is n/2. +template +auto +__parallel_for(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Fp __brick, _Index __count, + _Ranges&&... __rngs) +{ + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + using _ForKernelSmall = + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_small_kernel<_CustomName>>; + using _ForKernelLarge = + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<__parallel_for_large_kernel<_CustomName>>; + + using __small_submitter = __parallel_for_small_submitter<_ForKernelSmall>; + using __large_submitter = __parallel_for_large_submitter<_ForKernelLarge, _Ranges...>; + // Compile two kernels: one for small-to-medium inputs and a second for large. This avoids runtime checks within a + // single kernel that worsen performance for small cases. If the number of iterations of the large submitter is 1, + // then only compile the basic kernel as the two versions are effectively the same. + if constexpr (_Fp::__preferred_iters_per_item > 1 || _Fp::__preferred_vector_size > 1) + { + if (__count >= __large_submitter::__estimate_best_start_size(__exec, __brick)) + { + + return __large_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); + } + } + return __small_submitter{}(std::forward<_ExecutionPolicy>(__exec), __brick, __count, + std::forward<_Ranges>(__rngs)...); +} + +} // namespace __par_backend_hetero +} // namespace dpl +} // namespace oneapi + +#endif diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h index 7baee78b1b1..613cd07f6f2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_fpga.h @@ -48,7 +48,7 @@ namespace __par_backend_hetero //General version of parallel_for, one additional parameter - __count of iterations of loop __cgh.parallel_for, //for some algorithms happens that size of processing range is n, but amount of iterations is n/2. -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_for_fpga_submitter; @@ -71,7 +71,7 @@ struct __parallel_for_fpga_submitter<__internal::__optional_kernel_name<_Name... #pragma unroll(::std::decay <_ExecutionPolicy>::type::unroll_factor) for (auto __idx = 0; __idx < __count; ++__idx) { - __brick(__idx, __rngs...); + __brick.__scalar_path_impl(std::true_type{}, __idx, __rngs...); } }); }); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h index b1a775dd8f4..da454b07bc8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_merge.h @@ -164,7 +164,7 @@ __serial_merge(const _Rng1& __rng1, const _Rng2& __rng2, _Rng3& __rng3, const _I } } -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation template struct __parallel_merge_submitter; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h index a29060a9cca..26a280806ae 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h @@ -111,7 +111,7 @@ __device_reduce_kernel(const _NDItemId __item_id, const _Size __n, const _Size _ //------------------------------------------------------------------------ // parallel_transform_reduce - async patterns -// Please see the comment for __parallel_for_submitter for optional kernel name explanation +// Please see the comment above __parallel_for_small_submitter for optional kernel name explanation //------------------------------------------------------------------------ // Parallel_transform_reduce for a small arrays using a single work group. diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 3d9d923d80d..52db213e462 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -840,6 +840,172 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, } }; +// For use with __lazy_ctor_storage +struct __lazy_load_op +{ + template + void + operator()(_IdxType1 __idx_source, _IdxType2 __idx_dest, _SourceAcc __source_acc, _DestAcc __dest_acc) const + { + __dest_acc[__idx_dest].__setup(__source_acc[__idx_source]); + } +}; + +template +struct __vector_load +{ + static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported"); + std::size_t __full_range_size; + template + void + operator()(/*__is_full*/ std::true_type, _IdxType __start_idx, _LoadOp __load_op, _Rngs&&... __rngs) const + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __i = 0; __i < __vec_size; ++__i) + __load_op(__start_idx + __i, __i, __rngs...); + } + + template + void + operator()(/*__is_full*/ std::false_type, _IdxType __start_idx, _LoadOp __load_op, _Rngs&&... __rngs) const + { + std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__full_range_size - __start_idx}); + for (std::uint8_t __i = 0; __i < __elements; ++__i) + __load_op(__start_idx + __i, __i, __rngs...); + } +}; + +// For use with __lazy_ctor_storage +template +struct __lazy_store_transform_op +{ + _TransformOp __transform; + // Unary transformations into an output buffer + template + void + operator()(_IdxType1 __idx_source, _IdxType2 __idx_dest, _SourceAcc __source_acc, _DestAcc __dest_acc) const + { + __transform(__source_acc[__idx_source].__v, __dest_acc[__idx_dest]); + } + // Binary transformations into an output buffer + template + void + operator()(_IdxType1 __idx_source, _IdxType2 __idx_dest, _Source1Acc __source1_acc, _Source2Acc __source2_acc, + _DestAcc __dest_acc) const + { + __transform(__source1_acc[__idx_source].__v, __source2_acc[__idx_source].__v, __dest_acc[__idx_dest]); + } +}; + +template +struct __vector_walk +{ + static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported"); + std::size_t __full_range_size; + + template + void + operator()(std::true_type, _IdxType __idx, _WalkFunction __f, _Rngs&&... __rngs) const + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __i = 0; __i < __vec_size; ++__i) + { + __f(__rngs[__idx + __i]...); + } + } + // For a non-full vector path, process it sequentially. This will always be the last sub or work group + // if it does not evenly divide into input + template + void + operator()(std::false_type, _IdxType __idx, _WalkFunction __f, _Rngs&&... __rngs) const + { + std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__full_range_size - __idx}); + for (std::uint8_t __i = 0; __i < __elements; ++__i) + { + __f(__rngs[__idx + __i]...); + } + } +}; + +template +struct __vector_store +{ + static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported"); + std::size_t __full_range_size; + + template + void + operator()(std::true_type, _IdxType __start_idx, _StoreOp __store_op, _Rngs&&... __rngs) const + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __i = 0; __i < __vec_size; ++__i) + __store_op(__i, __start_idx + __i, __rngs...); + } + template + void + operator()(std::false_type, _IdxType __start_idx, _StoreOp __store_op, _Rngs&&... __rngs) const + { + std::uint8_t __elements = std::min(std::size_t{__vec_size}, std::size_t{__full_range_size - __start_idx}); + for (std::uint8_t __i = 0; __i < __elements; ++__i) + __store_op(__i, __start_idx + __i, __rngs...); + } +}; + +template +struct __vector_reverse +{ + static_assert(__vec_size <= 4, "Only vector sizes of 4 or less are supported"); + template + void + operator()(/*__is_full*/ std::true_type, const _Idx /*__elements_to_process*/, _Array __array) const + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __i = 0; __i < __vec_size / 2; ++__i) + std::swap(__array[__i].__v, __array[__vec_size - __i - 1].__v); + } + template + void + operator()(/*__is_full*/ std::false_type, const _Idx __elements_to_process, _Array __array) const + { + for (std::uint8_t __i = 0; __i < __elements_to_process / 2; ++__i) + std::swap(__array[__i].__v, __array[__elements_to_process - __i - 1].__v); + } +}; + +// Processes a loop with a given stride. Intended to be used with sub-group / work-group strides for good memory access patterns +// (potentially with vectorization) +template +struct __strided_loop +{ + std::size_t __full_range_size; + template + void + operator()(/*__is_full*/ std::true_type, _IdxType __idx, std::uint16_t __stride, _LoopBodyOp __loop_body_op, + _Ranges&&... __rngs) const + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint8_t __i = 0; __i < __num_strides; ++__i) + { + __loop_body_op(std::true_type{}, __idx, __rngs...); + __idx += __stride; + } + } + template + void + operator()(/*__is_full*/ std::false_type, _IdxType __idx, std::uint16_t __stride, _LoopBodyOp __loop_body_op, + _Ranges&&... __rngs) const + { + // Constrain the number of iterations as much as possible and then pass the knowledge that we are not a full loop to the body operation + const std::uint8_t __adjusted_iters_per_work_item = + oneapi::dpl::__internal::__dpl_ceiling_div(__full_range_size - __idx, __stride); + for (std::uint8_t __i = 0; __i < __adjusted_iters_per_work_item; ++__i) + { + __loop_body_op(std::false_type{}, __idx, __rngs...); + __idx += __stride; + } + } +}; + } // namespace __par_backend_hetero } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h index 7d3fd829cc5..e9de6d55d4d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h @@ -368,7 +368,16 @@ namespace oneapi::dpl::unseq_backend template struct walk_n; -template +template +struct walk1_vector_or_scalar; + +template +struct walk2_vectors_or_scalars; + +template +struct walk3_vectors_or_scalars; + +template struct walk_adjacent_difference; template class __brick_set_op; -template +template struct __brick_reduce_idx; } // namespace oneapi::dpl::unseq_backend @@ -429,9 +438,30 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backen { }; -template +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backend::walk1_vector_or_scalar, + _ExecutionPolicy, _F, _Range)> + : oneapi::dpl::__internal::__are_all_device_copyable<_F> +{ +}; + +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backend::walk2_vectors_or_scalars, + _ExecutionPolicy, _F, _Range1, _Range2)> + : oneapi::dpl::__internal::__are_all_device_copyable<_F> +{ +}; + +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backend::walk3_vectors_or_scalars, + _ExecutionPolicy, _F, _Range1, _Range2, _Range3)> + : oneapi::dpl::__internal::__are_all_device_copyable<_F> +{ +}; + +template struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backend::walk_adjacent_difference, - _ExecutionPolicy, _F)> + _ExecutionPolicy, _F, _Range1, _Range2)> : oneapi::dpl::__internal::__are_all_device_copyable<_F> { }; @@ -543,9 +573,9 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backen { }; -template +template struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::unseq_backend::__brick_reduce_idx, _BinaryOperator, - _Size)> + _Size, _Range)> : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryOperator, _Size> { }; @@ -555,8 +585,8 @@ namespace oneapi::dpl::internal enum class search_algorithm; -template -struct custom_brick; +template +struct __custom_brick; template struct replace_if_fun; @@ -575,8 +605,8 @@ class transform_if_stencil_fun; } // namespace oneapi::dpl::internal -template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::internal::custom_brick, Comp, T, func)> +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::internal::__custom_brick, Comp, T, _Range, func)> : oneapi::dpl::__internal::__are_all_device_copyable { }; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h index e0b57260ee0..4d413e95c33 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h @@ -22,6 +22,7 @@ #include "../../onedpl_config.h" #include "../../utils.h" #include "sycl_defs.h" +#include "utils_ranges_sycl.h" #define _ONEDPL_SYCL_KNOWN_IDENTITY_PRESENT \ (_ONEDPL_SYCL2020_KNOWN_IDENTITY_PRESENT || _ONEDPL_LIBSYCL_KNOWN_IDENTITY_PRESENT) @@ -112,6 +113,194 @@ struct walk_n } }; +// Base class which establishes tuning parameters including vectorization / scalar path decider at compile time +// for walk / for based algorithms +template +struct walk_vector_or_scalar_base +{ + private: + using _ValueTypes = std::tuple...>; + constexpr static std::uint8_t __min_type_size = oneapi::dpl::__internal::__min_nested_type_size<_ValueTypes>::value; + // Empirically determined 'bytes-in-flight' to maximize bandwidth utilization + constexpr static std::uint8_t __bytes_per_item = 16; + // Maximum size supported by compilers to generate vector instructions + constexpr static std::uint8_t __max_vector_size = 4; + + public: + constexpr static bool __can_vectorize = + (oneapi::dpl::__ranges::__is_vectorizable_range>::value && ...) && + (std::is_fundamental_v> && ...) && __min_type_size < 4; + // Vectorize for small types, so we generate 128-byte load / stores in a sub-group + constexpr static std::uint8_t __preferred_vector_size = + __can_vectorize ? oneapi::dpl::__internal::__dpl_ceiling_div(__max_vector_size, __min_type_size) : 1; + constexpr static std::uint8_t __preferred_iters_per_item = + __bytes_per_item / (__min_type_size * __preferred_vector_size); +}; + +// Path that intentionally disables vectorization for algorithms with a scattered access pattern (e.g. binary_search) +template +struct walk_scalar_base +{ + private: + using _ValueTypes = std::tuple...>; + constexpr static std::uint8_t __min_type_size = oneapi::dpl::__internal::__min_nested_type_size<_ValueTypes>::value; + constexpr static std::uint8_t __bytes_per_item = 16; + + public: + constexpr static bool __can_vectorize = false; + // With no vectorization, the vector size is 1 + constexpr static std::uint8_t __preferred_vector_size = 1; + // To achieve full bandwidth utilization, multiple iterations need to be processed by a work item + constexpr static std::uint8_t __preferred_iters_per_item = + __bytes_per_item / (__min_type_size * __preferred_vector_size); +}; + +template +struct walk1_vector_or_scalar : public walk_vector_or_scalar_base<_Range> +{ + private: + using __base_t = walk_vector_or_scalar_base<_Range>; + _F __f; + std::size_t __n; + + public: + walk1_vector_or_scalar(_F __f, std::size_t __n) : __f(__f), __n(__n) {} + + template + void + __vector_path_impl(_IsFull __is_full, const std::size_t __idx, _Range __rng) const + { + oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n}(__is_full, __idx, __f, + __rng); + } + + // _IsFull is ignored here. We assume that boundary checking has been already performed for this index. + template + void + __scalar_path_impl(_IsFull, const std::size_t __idx, _Range __rng) const + { + __f(__rng[__idx]); + } + + template + void + operator()(_IsFull __is_full, const std::size_t __idx, _Range __rng) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng); + else + __scalar_path_impl(__is_full, __idx, __rng); + } +}; + +template +struct walk2_vectors_or_scalars : public walk_vector_or_scalar_base<_Range1, _Range2> +{ + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2>; + _F __f; + std::size_t __n; + + public: + walk2_vectors_or_scalars(_F __f, std::size_t __n) : __f(__f), __n(__n) {} + + template + void + __vector_path_impl(_IsFull __is_full, const std::size_t __idx, _Range1 __rng1, _Range2 __rng2) const + { + using _ValueType1 = oneapi::dpl::__internal::__value_t<_Range1>; + oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1> __rng1_vector[__base_t::__preferred_vector_size]; + // 1. Load input into a vector + oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng1, __rng1_vector); + // 2. Apply functor to vector and store into global memory + oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<_F>{__f}, __rng1_vector, + __rng2); + // 3. Explicitly call destructor of lazy union type + oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n - __idx}( + __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1>::__get_callable_deleter(), + __rng1_vector); + } + + // _IsFull is ignored here. We assume that boundary checking has been already performed for this index. + template + void + __scalar_path_impl(_IsFull, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2) const + { + + __f(__rng1[__idx], __rng2[__idx]); + } + + template + void + operator()(_IsFull __is_full, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2); + else + __scalar_path_impl(__is_full, __idx, __rng1, __rng2); + } +}; + +template +struct walk3_vectors_or_scalars : public walk_vector_or_scalar_base<_Range1, _Range2, _Range3> +{ + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2, _Range3>; + _F __f; + std::size_t __n; + + public: + walk3_vectors_or_scalars(_F __f, std::size_t __n) : __f(__f), __n(__n) {} + + template + void + __vector_path_impl(_IsFull __is_full, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2, _Range3 __rng3) const + { + using _ValueType1 = oneapi::dpl::__internal::__value_t<_Range1>; + using _ValueType2 = oneapi::dpl::__internal::__value_t<_Range2>; + + oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1> __rng1_vector[__base_t::__preferred_vector_size]; + oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType2> __rng2_vector[__base_t::__preferred_vector_size]; + // 1. Load inputs into vectors + oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng1, __rng1_vector); + oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng2, __rng2_vector); + // 2. Apply binary functor to vector and store into global memory + oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<_F>{__f}, __rng1_vector, + __rng2_vector, __rng3); + // 3. Explicitly call destructors of lazy union type + oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n - __idx}( + __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1>::__get_callable_deleter(), + __rng1_vector); + oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n - __idx}( + __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType2>::__get_callable_deleter(), + __rng2_vector); + } + + // _IsFull is ignored here. We assume that boundary checking has been already performed for this index. + template + void + __scalar_path_impl(_IsFull, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2, _Range3 __rng3) const + { + + __f(__rng1[__idx], __rng2[__idx], __rng3[__idx]); + } + + template + void + operator()(_IsFull __is_full, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2, _Range3 __rng3) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2, __rng3); + else + __scalar_path_impl(__is_full, __idx, __rng1, __rng2, __rng3); + } +}; + // If read accessor returns temporary value then __no_op returns lvalue reference to it. // After temporary value destroying it will be a reference on invalid object. // So let's don't call functor in case of __no_op @@ -132,22 +321,62 @@ struct walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op> // walk_adjacent_difference //------------------------------------------------------------------------ -template -struct walk_adjacent_difference +template +struct walk_adjacent_difference : public walk_vector_or_scalar_base<_Range1, _Range2> { + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2>; _F __f; + std::size_t __n; - template + public: + walk_adjacent_difference(_F __f, std::size_t __n) : __f(__f), __n(__n) {} + + template void - operator()(const _ItemId __idx, const _Acc1& _acc_src, _Acc2& _acc_dst) const + __scalar_path_impl(_IsFull, const _ItemId __idx, const _Range1 __rng1, _Range2 __rng2) const { - using ::std::get; - // just copy an element if it is the first one if (__idx == 0) - _acc_dst[__idx] = _acc_src[__idx]; + __rng2[__idx] = __rng1[__idx]; + else + __f(__rng1[__idx + (-1)], __rng1[__idx], __rng2[__idx]); + } + template + void + __vector_path_impl(_IsFull __is_full, const _ItemId __idx, const _Range1 __rng1, _Range2 __rng2) const + { + using _ValueType = oneapi::dpl::__internal::__value_t<_Range1>; + oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType> __rng1_vector[__base_t::__preferred_vector_size + 1]; + // 1. Establish a vector of __preferred_vector_size + 1 where a scalar load is performed on the first element + // followed by a vector load of the specified length. + if (__idx != 0) + __rng1_vector[0].__setup(__rng1[__idx - 1]); + else + __rng1_vector[0].__setup(__rng1[0]); + oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng1, &__rng1_vector[1]); + // 2. Perform a vector store of __preferred_vector_size adjacent differences. + oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<_F>{__f}, __rng1_vector, + &__rng1_vector[1], __rng2); + // A dummy value is first written to global memory followed by an overwrite for the first index. Pulling the vector loads / stores into an if branch + // to better handle this results in performance degradation. + if (__idx == 0) + __rng2[0] = __rng1_vector[0].__v; + // 3. Delete temporary storage + oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n - __idx}( + __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter(), + __rng1_vector); + } + template + void + operator()(_IsFull __is_full, const _ItemId __idx, const _Range1 __rng1, _Range2 __rng2) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2); else - __f(_acc_src[__idx + (-1)], _acc_src[__idx], _acc_dst[__idx]); + __scalar_path_impl(__is_full, __idx, __rng1, __rng2); } }; @@ -927,47 +1156,211 @@ struct __brick_includes //------------------------------------------------------------------------ // reverse //------------------------------------------------------------------------ -template -struct __reverse_functor +template +struct __reverse_functor : public walk_vector_or_scalar_base<_Range> { + private: + using __base_t = walk_vector_or_scalar_base<_Range>; + using _ValueType = oneapi::dpl::__internal::__value_t<_Range>; _Size __size; - template + + public: + __reverse_functor(_Size __size) : __size(__size) {} + + template void - operator()(const _Idx __idx, _Accessor& __acc) const + __vector_path_impl(_IsFull, const std::size_t __left_start_idx, _Range __rng) const + { + std::size_t __n = __size; + std::size_t __midpoint = __size / 2; + + // In the below implementation, we see that _IsFull is ignored in favor of std::true_type{} in all cases. + // This relaxation is due to the fact that in-place reverse launches work only over the first half of the + // buffer. As long as __size >= __vec_size there is no risk of an OOB accesses or a race condition. There may + // exist a single point of double processing between left and right vectors in the last work-item which + // reverses middle elements. This extra processing of elements <= __vec_size is more performant than applying + // additional branching (such as in reverse_copy). + + // 1. Load two vectors that we want to swap: one from the left half of the buffer and one from the right + const std::size_t __right_start_idx = __size - __left_start_idx - __base_t::__preferred_vector_size; + + oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType> __rng_left_vector[__base_t::__preferred_vector_size]; + oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType> __rng_right_vector[__base_t::__preferred_vector_size]; + + oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}( + std::true_type{}, __left_start_idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng, + __rng_left_vector); + oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}( + std::true_type{}, __right_start_idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng, + __rng_right_vector); + // 2. Reverse vectors in registers. Note that due to indices we have chosen, there will always be a full + // vector of elements to load + oneapi::dpl::__par_backend_hetero::__vector_reverse<__base_t::__preferred_vector_size>{}( + std::true_type{}, __left_start_idx, __rng_left_vector); + oneapi::dpl::__par_backend_hetero::__vector_reverse<__base_t::__preferred_vector_size>{}( + std::true_type{}, __right_start_idx, __rng_right_vector); + // 3. Store the left-half vector to the corresponding right-half indices and vice versa + oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}( + std::true_type{}, __right_start_idx, + oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op{}, + __rng_left_vector, __rng); + oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}( + std::true_type{}, __left_start_idx, + oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op{}, + __rng_right_vector, __rng); + // 4. Call destructors of temporary storage + oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n}( + std::true_type{}, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter(), + __rng_left_vector); + oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n}( + std::true_type{}, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter(), + __rng_right_vector); + } + template + void + __scalar_path_impl(_IsFull, const std::size_t __idx, _Range __rng) const { using ::std::swap; - swap(__acc[__idx], __acc[__size - __idx - 1]); + swap(__rng[__idx], __rng[__size - __idx - 1]); + } + template + void + operator()(_IsFull __is_full, const std::size_t __idx, _Range __rng) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng); + else + __scalar_path_impl(__is_full, __idx, __rng); } }; //------------------------------------------------------------------------ // reverse_copy //------------------------------------------------------------------------ -template -struct __reverse_copy +template +struct __reverse_copy : public walk_vector_or_scalar_base<_Range1, _Range2> { + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2>; + using _ValueType = oneapi::dpl::__internal::__value_t<_Range1>; _Size __size; - template + + public: + __reverse_copy(_Size __size) : __size(__size) {} + + template + void + __scalar_path_impl(_IsFull, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const + { + __rng2[__idx] = __rng1[__size - __idx - 1]; + } + template + void + __vector_path_impl(_IsFull __is_full, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const + { + std::size_t __n = __size; + std::size_t __remaining_elements = __n - __idx; + std::uint8_t __elements_to_process = + std::min(static_cast(__base_t::__preferred_vector_size), __remaining_elements); + const std::size_t __output_start = __size - __idx - __elements_to_process; + // 1. Load vector to reverse + oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType> __rng1_vector[__base_t::__preferred_vector_size]; + oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng1, __rng1_vector); + // 2, 3. Reverse in registers and flip the location of the vector in the output buffer + if (__elements_to_process == __base_t::__preferred_vector_size) + { + oneapi::dpl::__par_backend_hetero::__vector_reverse<__base_t::__preferred_vector_size>{}( + __is_full, __elements_to_process, __rng1_vector); + oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}( + __is_full, __output_start, + oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op{}, + __rng1_vector, __rng2); + } + else + { + oneapi::dpl::__par_backend_hetero::__vector_reverse<__base_t::__preferred_vector_size>{}( + std::false_type{}, __elements_to_process, __rng1_vector); + for (std::uint8_t __i = 0; __i < __elements_to_process; ++__i) + __rng2[__output_start + __i] = __rng1_vector[__i].__v; + } + // 3. Cleanup + oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__remaining_elements}( + __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter(), + __rng1_vector); + } + template void - operator()(const _Idx __idx, const _AccessorSrc& __acc1, _AccessorDst& __acc2) const + operator()(_IsFull __is_full, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const { - __acc2[__idx] = __acc1[__size - __idx - 1]; + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2); + else + __scalar_path_impl(__is_full, __idx, __rng1, __rng2); } }; //------------------------------------------------------------------------ // rotate_copy //------------------------------------------------------------------------ -template -struct __rotate_copy +template +struct __rotate_copy : public walk_vector_or_scalar_base<_Range1, _Range2> { + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2>; + using _ValueType = oneapi::dpl::__internal::__value_t<_Range1>; _Size __size; _Size __shift; - template + + public: + __rotate_copy(_Size __size, _Size __shift) : __size(__size), __shift(__shift) {} + + template + void + __vector_path_impl(_IsFull __is_full, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const + { + std::size_t __shifted_idx = __shift + __idx; + std::size_t __wrapped_idx = __shifted_idx % __size; + std::size_t __n = __size; + oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType> __rng1_vector[__base_t::__preferred_vector_size]; + //1. Vectorize loads only if we know the wrap around point is beyond the current vector elements to process + if (__wrapped_idx + __base_t::__preferred_vector_size <= __size) + { + oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}( + __is_full, __wrapped_idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng1, __rng1_vector); + } + else + { + std::size_t __remaining_elements = __idx >= __n ? 0 : __n - __idx; + std::size_t __elements_to_process = + std::min(static_cast(__base_t::__preferred_vector_size), __remaining_elements); + for (std::uint16_t __i = 0; __i != __elements_to_process; ++__i) + __rng1_vector[__i].__setup(__rng1[(__shifted_idx + __i) % __size]); + } + // 2. Store the rotation + oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, + oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op{}, + __rng1_vector, __rng2); + // 3. Delete temporary storage + oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n - __idx}( + __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType>::__get_callable_deleter(), + __rng1_vector); + } + template void - operator()(const _Idx __idx, const _AccessorSrc& __acc1, _AccessorDst& __acc2) const + __scalar_path_impl(_IsFull, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const { - __acc2[__idx] = __acc1[(__shift + __idx) % __size]; + __rng2[__idx] = __rng1[(__shift + __idx) % __size]; + } + template + void + operator()(_IsFull __is_full, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2); + else + __scalar_path_impl(__is_full, __idx, __rng1, __rng2); } }; @@ -1041,15 +1434,24 @@ class __brick_set_op } }; -template +// TODO: The implementation of shift left is reliant on exactly n (shift factor) +// work items being launched by the parallel_for pattern, so it cannot be vectorized +// or process multiple iterations per work items as is. For now, we must ensure that our +// small submitter is launched in the SYCL backend's __parallel_for +template struct __brick_shift_left { + using __base_t = walk_vector_or_scalar_base<_Range>; + using _ValueType = oneapi::dpl::__internal::__value_t<_Range>; + constexpr static bool __can_vectorize = false; + constexpr static std::uint16_t __preferred_vector_size = 1; + constexpr static std::uint16_t __preferred_iters_per_item = 1; _DiffType __size; _DiffType __n; - template + template void - operator()(const _ItemId __idx, _Range&& __rng) const + __scalar_path_impl(_IsFull __is_full, const _ItemId __idx, _Range __rng) const { const _DiffType __i = __idx - __n; //loop invariant for (_DiffType __k = __n; __k < __size; __k += __n) @@ -1058,6 +1460,13 @@ struct __brick_shift_left __rng[__k + __i] = ::std::move(__rng[__k + __idx]); } } + + template + void + operator()(_IsFull __is_full, const _ItemId __idx, _Range __rng) const + { + __scalar_path_impl(__is_full, __idx, __rng); + } }; struct __brick_assign_key_position @@ -1074,14 +1483,14 @@ struct __brick_assign_key_position }; // reduce the values in a segment associated with a key -template -struct __brick_reduce_idx +template +struct __brick_reduce_idx : public walk_scalar_base<_Range> { __brick_reduce_idx(const _BinaryOperator& __b, const _Size __n_) : __binary_op(__b), __n(__n_) {} - template + template auto - reduce(_Idx __segment_begin, _Idx __segment_end, const _Values& __values) const + reduce(std::size_t __segment_begin, std::size_t __segment_end, const _Values& __values) const { using __ret_type = oneapi::dpl::__internal::__decay_with_tuple_specialization_t; __ret_type __res = __values[__segment_begin]; @@ -1090,23 +1499,89 @@ struct __brick_reduce_idx __res = __binary_op(__res, __values[__segment_begin]); return __res; } - - template + template void - operator()(const _ItemId __idx, const _ReduceIdx& __segment_starts, const _Values& __values, - _OutValues& __out_values) const + __scalar_path_impl(_IsFull, const _ItemId __idx, const _ReduceIdx& __segment_starts, const _Values& __values, + _OutValues& __out_values) const { using __value_type = decltype(__segment_starts[__idx]); __value_type __segment_end = (__idx == __segment_starts.size() - 1) ? __value_type(__n) : __segment_starts[__idx + 1]; __out_values[__idx] = reduce(__segment_starts[__idx], __segment_end, __values); } + template + void + operator()(_IsFull __is_full, const _ItemId __idx, const _ReduceIdx& __segment_starts, const _Values& __values, + _OutValues& __out_values) const + { + __scalar_path_impl(__is_full, __idx, __segment_starts, __values, __out_values); + } private: _BinaryOperator __binary_op; _Size __n; }; +// std::swap_ranges is unique in that both sets of provided ranges will be modified. Due to this, +// we define a separate functor from __walk2_vectors_or_scalars with a customized vectorization path. +template +struct __brick_swap : public walk_vector_or_scalar_base<_Range1, _Range2> +{ + private: + using __base_t = walk_vector_or_scalar_base<_Range1, _Range2>; + _F __f; + std::size_t __n; + + public: + __brick_swap(_F __f, std::size_t __n) : __f(__f), __n(__n) {} + + template + void + __vector_path_impl(_IsFull __is_full, const _ItemId __idx, _Range1 __rng1, _Range2 __rng2) const + { + using _ValueType1 = oneapi::dpl::__internal::__value_t<_Range1>; + using _ValueType2 = oneapi::dpl::__internal::__value_t<_Range2>; + oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1> __rng1_vector[__base_t::__preferred_vector_size]; + oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1> __rng2_vector[__base_t::__preferred_vector_size]; + // 1. Load inputs into vectors + oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng1, __rng1_vector); + oneapi::dpl::__par_backend_hetero::__vector_load<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_load_op{}, __rng2, __rng2_vector); + // 2. Swap the two ranges + oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<_F>{__f}, __rng2_vector, + __rng1); + oneapi::dpl::__par_backend_hetero::__vector_store<__base_t::__preferred_vector_size>{__n}( + __is_full, __idx, oneapi::dpl::__par_backend_hetero::__lazy_store_transform_op<_F>{__f}, __rng1_vector, + __rng2); + // 3. Explicitly call destructor of lazy union type + oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n - __idx}( + __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType1>::__get_callable_deleter(), + __rng1_vector); + oneapi::dpl::__par_backend_hetero::__vector_walk<__base_t::__preferred_vector_size>{__n - __idx}( + __is_full, 0, oneapi::dpl::__internal::__lazy_ctor_storage<_ValueType2>::__get_callable_deleter(), + __rng2_vector); + } + + template + void + __scalar_path_impl(_IsFull __is_full, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const + { + __f(__rng1[__idx], __rng2[__idx]); + } + + template + void + operator()(_IsFull __is_full, const std::size_t __idx, const _Range1 __rng1, _Range2 __rng2) const + { + if constexpr (__base_t::__can_vectorize) + __vector_path_impl(__is_full, __idx, __rng1, __rng2); + else + __scalar_path_impl(__is_full, __idx, __rng1, __rng2); + } +}; + } // namespace unseq_backend } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h index 4d16ae4508b..f1bb3cacb63 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/utils_ranges_sycl.h @@ -18,6 +18,9 @@ #include #include +#if _ONEDPL_CPP20_RANGES_PRESENT && _ONEDPL_CPP20_CONCEPTS_PRESENT +#include // std::ranges::contiguous_range +#endif #include "../../utils_ranges.h" #include "../../iterator_impl.h" @@ -754,6 +757,31 @@ __select_backend(const execution::fpga_policy<_Factor, _KernelName>&, _Ranges&&. } #endif +// Check the outer view type type to see if we can vectorize. Any non-contiguous inputs (e.g. reverse +// views, permutation views, etc.) cannot be vectorized. If C++20 ranges are present, then we can +// use the std::ranges::contiguous_range concept. +template +struct __is_vectorizable_range +{ + constexpr static bool value = +#if _ONEDPL_CPP20_RANGES_PRESENT && _ONEDPL_CPP20_CONCEPTS_PRESENT + std::ranges::contiguous_range<_Rng>; +#else + false; +#endif +}; +// If the outer view is a guard view, then the input is passed directly as a pointer and we can use. +template +struct __is_vectorizable_range> : std::true_type +{ +}; +// If all_view is passed, then we are processing a sycl::buffer directly which is contiguous and can +// be used. +template +struct __is_vectorizable_range> : std::true_type +{ +}; + } // namespace __ranges } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/hetero/histogram_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/histogram_impl_hetero.h index 6970f57f4d7..937608bfc39 100644 --- a/include/oneapi/dpl/pstl/hetero/histogram_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/histogram_impl_hetero.h @@ -143,7 +143,9 @@ __pattern_histogram(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Rando auto __init_event = oneapi::dpl::__par_backend_hetero::__parallel_for( _BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__hist_fill_zeros_wrapper>(__exec), - unseq_backend::walk_n<_ExecutionPolicy, decltype(__fill_func)>{__fill_func}, __num_bins, __bins); + unseq_backend::walk1_vector_or_scalar<_ExecutionPolicy, decltype(__fill_func), decltype(__bins)>{ + __fill_func, static_cast(__num_bins)}, + __num_bins, __bins); if (__n > 0) { diff --git a/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h index d040e828eef..cf9533f8113 100644 --- a/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/numeric_impl_hetero.h @@ -263,9 +263,12 @@ __pattern_adjacent_difference(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy& oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _ForwardIterator2>(); auto __buf2 = __keep2(__d_first, __d_last); - using _Function = unseq_backend::walk_adjacent_difference<_ExecutionPolicy, decltype(__fn)>; + using _Function = + unseq_backend::walk_adjacent_difference<_ExecutionPolicy, decltype(__fn), decltype(__buf1.all_view()), + decltype(__buf2.all_view())>; - oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, __exec, _Function{__fn}, __n, + oneapi::dpl::__par_backend_hetero::__parallel_for(_BackendTag{}, __exec, + _Function{__fn, static_cast(__n)}, __n, __buf1.all_view(), __buf2.all_view()) .__deferrable_wait(); } diff --git a/include/oneapi/dpl/pstl/utils.h b/include/oneapi/dpl/pstl/utils.h index 9c32178a78c..83b2cadf6ec 100644 --- a/include/oneapi/dpl/pstl/utils.h +++ b/include/oneapi/dpl/pstl/utils.h @@ -25,6 +25,7 @@ #include #include #include +#include #if _ONEDPL_BACKEND_SYCL # include "hetero/dpcpp/sycl_defs.h" @@ -782,6 +783,40 @@ union __lazy_ctor_storage { __v.~_Tp(); } + static auto + __get_callable_deleter() + { + return [](__lazy_ctor_storage& __storage) { __storage.__destroy(); }; + } +}; + +// To implement __min_nested_type_size, a general utility with an internal tuple +// specialization, we need to forward declare our internal tuple first as tuple_impl.h +// already includes this header. +template +struct tuple; + +// Returns the smallest type within a set of potentially nested template types. This function +// recursively explores std::tuple and oneapi::dpl::__internal::tuple for the smallest type. +// For all other types, its size is used directly. +// E.g. If we consider the type: T = tuple, int, double>, +// then __min_nested_type_size::value returns sizeof(short). +template +struct __min_nested_type_size +{ + constexpr static std::size_t value = sizeof(_T); +}; + +template +struct __min_nested_type_size> +{ + constexpr static std::size_t value = std::min({__min_nested_type_size<_Ts>::value...}); +}; + +template +struct __min_nested_type_size> +{ + constexpr static std::size_t value = std::min({__min_nested_type_size<_Ts>::value...}); }; } // namespace __internal diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index 322d93d4824..2147b9c9027 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -45,10 +45,12 @@ test_device_copyable() static_assert(sycl::is_device_copyable_v, "constant_iterator_device_copyable is not device copyable"); - //custom_brick - static_assert(sycl::is_device_copyable_v>, - "custom_brick is not device copyable with device copyable types"); + //__custom_brick + static_assert( + sycl::is_device_copyable_v< + oneapi::dpl::internal::__custom_brick>, + "__custom_brick is not device copyable with device copyable types"); //replace_if_fun static_assert( sycl::is_device_copyable_v>, @@ -76,11 +78,23 @@ test_device_copyable() static_assert(sycl::is_device_copyable_v< oneapi::dpl::unseq_backend::walk_n>, "walk_n is not device copyable with device copyable types"); + //walk1_vector_or_scalar + static_assert(sycl::is_device_copyable_v>, + "walk1_vector_or_scalar is not device copyable with device copyable types"); + //walk2_vectors_or_scalars + static_assert(sycl::is_device_copyable_v>, + "walk2_vectors_or_scalars is not device copyable with device copyable types"); + //walk3_vectors_or_scalars + static_assert(sycl::is_device_copyable_v>, + "walk3_vectors_or_scalars is not device copyable with device copyable types"); //walk_adjacent_difference - static_assert( - sycl::is_device_copyable_v< - oneapi::dpl::unseq_backend::walk_adjacent_difference>, - "walk_adjacent_difference is not device copyable with device copyable types"); + static_assert(sycl::is_device_copyable_v>, + "walk_adjacent_difference is not device copyable with device copyable types"); //transform_reduce static_assert( sycl::is_device_copyable_v< @@ -148,8 +162,8 @@ test_device_copyable() int_device_copyable, int_device_copyable, std::true_type>>, "__brick_set_op is not device copyable with device copyable types"); // __brick_reduce_idx - static_assert(sycl::is_device_copyable_v< - oneapi::dpl::unseq_backend::__brick_reduce_idx>, + static_assert(sycl::is_device_copyable_v>, "__brick_reduce_idx is not device copyable with device copyable types"); //__gen_transform_input @@ -306,11 +320,11 @@ test_non_device_copyable() static_assert(!sycl::is_device_copyable_v, "iterator is device copyable"); static_assert(!sycl::is_device_copyable_v, "range_non_device_copyable is device copyable"); - //custom_brick - static_assert( - !sycl::is_device_copyable_v>, - "custom_brick is device copyable with non device copyable types"); + //__custom_brick + static_assert(!sycl::is_device_copyable_v>, + "__custom_brick is device copyable with non device copyable types"); //replace_if_fun static_assert(!sycl::is_device_copyable_v< oneapi::dpl::internal::replace_if_fun>, @@ -339,10 +353,26 @@ test_non_device_copyable() static_assert(!sycl::is_device_copyable_v< oneapi::dpl::unseq_backend::walk_n>, "walk_n is device copyable with non device copyable types"); + //walk1_vector_or_scalar + static_assert(!sycl::is_device_copyable_v>, + "walk1_vector_or_scalar is device copyable with non device copyable types"); + //walk2_vectors_or_scalars + static_assert( + !sycl::is_device_copyable_v< + oneapi::dpl::unseq_backend::walk2_vectors_or_scalars>, + "walk2_vectors_or_scalars is device copyable with non device copyable types"); + //walk3_vectors_or_scalars + static_assert(!sycl::is_device_copyable_v>, + "walk3_vectors_or_scalars is device copyable with non device copyable types"); //walk_adjacent_difference static_assert( !sycl::is_device_copyable_v< - oneapi::dpl::unseq_backend::walk_adjacent_difference>, + oneapi::dpl::unseq_backend::walk_adjacent_difference>, "walk_adjacent_difference is device copyable with non device copyable types"); //transform_reduce static_assert( @@ -411,8 +441,8 @@ test_non_device_copyable() int_device_copyable, int_device_copyable, std::true_type>>, "__brick_set_op is device copyable with non device copyable types"); //__brick_reduce_idx - static_assert(!sycl::is_device_copyable_v< - oneapi::dpl::unseq_backend::__brick_reduce_idx>, + static_assert(!sycl::is_device_copyable_v>, "__brick_reduce_idx is device copyable with non device copyable types"); //__gen_transform_input diff --git a/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse.pass.cpp index 13f03fdb3dd..41afe0693e1 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse.pass.cpp @@ -69,7 +69,7 @@ template void test() { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); Sequence actual(max_len); @@ -99,6 +99,7 @@ int main() { test(); + test(); test(); test(); test>(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse_copy.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse_copy.pass.cpp index 44f9dc6fe7c..cbe63db7e47 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse_copy.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/alg.reverse/reverse_copy.pass.cpp @@ -82,7 +82,7 @@ template void test() { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); Sequence actual(max_len); Sequence data(max_len, [](::std::size_t i) { return T1(i); }); @@ -100,6 +100,7 @@ main() { // clang-3.8 fails to correctly auto vectorize the loop in some cases of different types of container's elements, // for example: std::int32_t and std::int8_t. This issue isn't detected for clang-3.9 and newer versions. + test(); test(); test(); test(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/copy_move.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/copy_move.pass.cpp index 9ad4cfc13bc..7dfc95b1485 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/copy_move.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/copy_move.pass.cpp @@ -125,8 +125,9 @@ template void test(T trash, Convert convert) { + size_t max_n = TestUtils::get_pattern_for_max_n(); // Try sequences of various lengths. - for (size_t n = 0; n <= 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + for (size_t n = 0; n <= max_n; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) { // count is number of output elements, plus a handful // more for sake of detecting buffer overruns. @@ -166,6 +167,9 @@ main() test(-666, [](size_t j) { return std::int32_t(j); }); test(-666.0, [](size_t j) { return float64_t(j); }); + test(42, [](size_t j) { return std::uint16_t(j); }); + test(42, [](size_t j) { return std::uint8_t(j); }); + #if !TEST_DPCPP_BACKEND_PRESENT /*TODO: copy support of a class with no default constructor*/ test>(Wrapper(-666.0), [](std::int32_t j) { return Wrapper(j); }); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/fill.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/fill.pass.cpp index 66c8e321627..6c707b62079 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/fill.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/fill.pass.cpp @@ -101,10 +101,12 @@ int main() { - const ::std::size_t N = 100000; + const std::size_t N = TestUtils::get_pattern_for_max_n(); for (::std::size_t n = 0; n < N; n = n < 16 ? n + 1 : size_t(3.1415 * n)) { + test_fill_by_type(n); + test_fill_by_type(n); test_fill_by_type(n); test_fill_by_type(n); } diff --git a/test/parallel_api/algorithm/alg.modifying.operations/generate.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/generate.pass.cpp index 3ee13107011..bcd65426526 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/generate.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/generate.pass.cpp @@ -82,7 +82,8 @@ template void test_generate_by_type() { - for (size_t n = 0; n <= 100000; n = n < 16 ? n + 1 : size_t(3.1415 * n)) + size_t max_n = TestUtils::get_pattern_for_max_n(); + for (size_t n = 0; n <= max_n; n = n < 16 ? n + 1 : size_t(3.1415 * n)) { Sequence in(n, [](size_t) -> T { return T(0); }); //fill by zero @@ -123,6 +124,7 @@ struct test_non_const_generate_n int main() { + test_generate_by_type(); test_generate_by_type(); test_generate_by_type(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/replace.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/replace.pass.cpp index 92820d2012f..91291851dcb 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/replace.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/replace.pass.cpp @@ -114,7 +114,7 @@ template void test(Pred pred) { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); const T1 value = T1(0); const T1 new_value = T1(666); @@ -162,6 +162,7 @@ int main() { test(oneapi::dpl::__internal::__equal_value(666)); + test([](const std::uint8_t& elem) { return elem % 3 < 2; }); test([](const std::uint16_t& elem) { return elem % 3 < 2; }); test([](const float64_t& elem) { return elem * elem - 3.5 * elem > 10; }); //test([](const copy_int& val) { return val.value / 5 > 2; }); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/replace_copy.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/replace_copy.pass.cpp index 50851e3d299..b89c3aa3f40 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/replace_copy.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/replace_copy.pass.cpp @@ -75,8 +75,9 @@ template void test(T trash, const T& old_value, const T& new_value, Predicate pred, Convert convert) { + const size_t max_n = TestUtils::get_pattern_for_max_n(); // Try sequences of various lengths. - for (size_t n = 0; n <= 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + for (size_t n = 0; n <= max_n; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) { Sequence in(n, [&](size_t k) -> T { return convert(n ^ k); }); Sequence out(n, [=](size_t) { return trash; }); @@ -123,6 +124,10 @@ main() test(-666, 42, 99, [](const std::int32_t& x) { return x != 42; }, [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? 42 : -1 - std::int32_t(j); }); + test(123, 42, 99, [](const std::uint8_t& x) { return x != 42; }, + [](size_t j) { return ((j + 1) % 5 & 2) != 0 ? 42 : 255; }); + + #if !TEST_DPCPP_BACKEND_PRESENT test(Number(42, OddTag()), Number(2001, OddTag()), Number(2017, OddTag()), IsMultiple(3, OddTag()), [](std::int32_t j) { return ((j + 1) % 3 & 2) != 0 ? Number(2001, OddTag()) : Number(j, OddTag()); }); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/rotate.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/rotate.pass.cpp index 2147999ce3f..a6f4600c2a8 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/rotate.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/rotate.pass.cpp @@ -132,7 +132,7 @@ template void test() { - const std::int32_t max_len = 100000; + const std::int32_t max_len = TestUtils::get_pattern_for_max_n(); Sequence actual(max_len, [](::std::size_t i) { return T(i); }); Sequence data(max_len, [](::std::size_t i) { return T(i); }); @@ -154,6 +154,8 @@ test() int main() { + test(); + test(); test(); #if !TEST_DPCPP_BACKEND_PRESENT test>(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/rotate_copy.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/rotate_copy.pass.cpp index edf5f56651c..1df986deb6a 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/rotate_copy.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/rotate_copy.pass.cpp @@ -100,7 +100,7 @@ void test() { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); Sequence actual(max_len, [](::std::size_t i) { return T1(i); }); @@ -125,6 +125,8 @@ test() int main() { + test(); + test(); test(); test(); test(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/swap_ranges.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/swap_ranges.pass.cpp index afd10f55f50..f23ea69e05b 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/swap_ranges.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/swap_ranges.pass.cpp @@ -115,7 +115,7 @@ template void test() { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); Sequence data(max_len); Sequence actual(max_len); @@ -132,6 +132,7 @@ main() { test>(); test>(); + test(); test(); test(); diff --git a/test/parallel_api/algorithm/alg.modifying.operations/transform_binary.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/transform_binary.pass.cpp index 113c24be7c8..5deae6ea667 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/transform_binary.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/transform_binary.pass.cpp @@ -104,7 +104,7 @@ test(Predicate pred, _IteratorAdapter adap = {}) #if PSTL_USE_DEBUG && ONEDPL_USE_OPENMP_BACKEND 10000; #else - 100000; + TestUtils::get_pattern_for_max_n(); #endif for (size_t n = 0; n <= max_n; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) { @@ -153,5 +153,7 @@ main() //test case for zip iterator test<50, std::int32_t, std::int32_t, std::int32_t>(TheOperationZip(1), _ZipIteratorAdapter{}); + test<60, std::uint16_t, std::uint16_t, std::int32_t>(TheOperation(1)); + return done(); } diff --git a/test/parallel_api/algorithm/alg.modifying.operations/transform_unary.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/transform_unary.pass.cpp index 20c1403f6b9..19fe906935c 100644 --- a/test/parallel_api/algorithm/alg.modifying.operations/transform_unary.pass.cpp +++ b/test/parallel_api/algorithm/alg.modifying.operations/transform_unary.pass.cpp @@ -84,7 +84,8 @@ template <::std::size_t CallNumber, typename Tin, typename Tout, typename _Op = void test() { - for (size_t n = 0; n <= 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + size_t max_n = TestUtils::get_pattern_for_max_n(); + for (size_t n = 0; n <= max_n; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) { Sequence in(n, [](std::int32_t k) { return k % 5 != 1 ? 3 * k - 7 : 0; }); diff --git a/test/parallel_api/algorithm/alg.nonmodifying/for_each.pass.cpp b/test/parallel_api/algorithm/alg.nonmodifying/for_each.pass.cpp index 526c1c45fcf..c23fd3013de 100644 --- a/test/parallel_api/algorithm/alg.nonmodifying/for_each.pass.cpp +++ b/test/parallel_api/algorithm/alg.nonmodifying/for_each.pass.cpp @@ -85,7 +85,8 @@ template void test() { - for (size_t n = 0; n <= 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + const size_t max_n = TestUtils::get_pattern_for_max_n(); + for (size_t n = 0; n <= max_n; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) { Sequence in_out(n, Gen()); Sequence expected(n, Gen()); diff --git a/test/parallel_api/algorithm/alg.nonmodifying/transform_if.pass.cpp b/test/parallel_api/algorithm/alg.nonmodifying/transform_if.pass.cpp index 973ca3658f8..97198b72f76 100644 --- a/test/parallel_api/algorithm/alg.nonmodifying/transform_if.pass.cpp +++ b/test/parallel_api/algorithm/alg.nonmodifying/transform_if.pass.cpp @@ -162,10 +162,11 @@ void test() { const ::std::int64_t init_val = 999; - for (size_t n = 1; n <= 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + const size_t max_n = TestUtils::get_pattern_for_max_n(); + for (size_t n = 1; n <= max_n; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) { { - Sequence<_Type> in1(n, [=](size_t k) { return (3 * k); }); + Sequence<_Type> in1(n, [=](size_t k) { return (3 * k) % std::numeric_limits<_Type>::max(); }); Sequence<_Type> in2(n, [=](size_t k) { return k % 2 == 0 ? 1 : 0; }); Sequence<_Type> out(n, [=](size_t) { return init_val; }); @@ -178,7 +179,7 @@ test() #endif } { - Sequence<_Type> in1(n, [=](size_t k) { return k; }); + Sequence<_Type> in1(n, [=](size_t k) { return k % std::numeric_limits<_Type>::max(); }); Sequence<_Type> out(n, [=](size_t) { return init_val; }); invoke_on_all_policies<2>()(test_transform_if_unary<_Type>(), in1.begin(), in1.end(), out.begin(), @@ -196,10 +197,11 @@ void test_inplace() { const ::std::int64_t init_val = 999; - for (size_t n = 1; n <= 100000; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) + const size_t max_n = TestUtils::get_pattern_for_max_n(); + for (size_t n = 1; n <= max_n; n = n <= 16 ? n + 1 : size_t(3.1415 * n)) { { - Sequence<_Type> in1(n, [=](size_t k) { return k; }); + Sequence<_Type> in1(n, [=](size_t k) { return k % std::numeric_limits<_Type>::max(); }); Sequence<_Type> out(n, [=](size_t) { return 0; }); invoke_on_all_policies<4>()(test_transform_if_unary_inplace<_Type>(), in1.begin(), in1.end(), out.begin(), @@ -211,11 +213,15 @@ test_inplace() int main() { - test<::std::int32_t>(); - test<::std::int64_t>(); - - test_inplace<::std::int32_t>(); - test_inplace<::std::int64_t>(); + test(); + test(); + test(); + test(); + + test_inplace(); + test_inplace(); + test_inplace(); + test_inplace(); return done(); } diff --git a/test/parallel_api/numeric/numeric.ops/adjacent_difference.pass.cpp b/test/parallel_api/numeric/numeric.ops/adjacent_difference.pass.cpp index a617d3205d0..08b93265541 100644 --- a/test/parallel_api/numeric/numeric.ops/adjacent_difference.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/adjacent_difference.pass.cpp @@ -139,7 +139,7 @@ template void test(Pred pred) { - const ::std::size_t max_len = 100000; + const std::size_t max_len = TestUtils::get_pattern_for_max_n(); const T2 value = T2(77); const T1 trash = T1(31); @@ -165,6 +165,7 @@ int main() { test([](std::uint32_t a, std::uint32_t b) { return a - b; }); + test([](std::uint16_t a, std::uint16_t b) { return a > b ? a - b : b - a; }); test([](std::int64_t a, std::int64_t b) { return a / (b + 1); }); test([](float32_t a, float32_t b) { return (a + b) / 2; }); #if !TEST_DPCPP_BACKEND_PRESENT diff --git a/test/support/utils.h b/test/support/utils.h index 666d5ca3646..bef28868b1b 100644 --- a/test/support/utils.h +++ b/test/support/utils.h @@ -1013,6 +1013,28 @@ generate_arithmetic_data(T* input, std::size_t size, std::uint32_t seed) input[j] = input[i]; } } + +// Utility that models __estimate_best_start_size in the SYCL backend parallel_for +// to ensure large enough inputs are used to test the large submitter path. +// A multiplier to the max n is added to ensure we get a few separate test inputs for +// this path. +std::size_t +get_pattern_for_max_n() +{ +#if TEST_DPCPP_BACKEND_PRESENT + sycl::queue q = TestUtils::get_test_queue(); + sycl::device d = q.get_device(); + constexpr std::size_t max_iters_per_item = 16; + constexpr std::size_t multiplier = 4; + std::size_t __max_n = multiplier * max_iters_per_item * d.get_info() * + d.get_info(); + __max_n = std::min(std::size_t{10000000}, __max_n); + return __max_n; +#else + return TestUtils::max_n; +#endif +} + } /* namespace TestUtils */ #endif // _UTILS_H