[SYCL][CUDA] Implement root group barrier (#14828)
This PR adds an algorithm for performing a GPU-wide barrier in the CUDA backend.

Rough outline of the algorithm:
- The `0th` thread of each workgroup performs an `atomic.add(1)`.
- The same thread checks the atomic result with `ld.acquire` in a loop
until it equals the total number of workgroups.
- All threads call a group-wide `barrier.sync`.

One caveat is that the atomic's start value is never initialized, so if
the barrier is called several times in a kernel, on the second iteration
the start value already holds the result of the previous barrier. That is
why we actually spin the while loop on `current value % totalWgroups != 0`
rather than comparing against an absolute count (see the sketch below).
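
For illustration, here is a minimal CUDA sketch of the same scheme (not the
patch itself). The `counter` pointer stands in for the runtime-provided
atomic location, and the explicit fences model the `.release`/`.acquire`
semantics of the PTX instructions used in the patch:

__device__ void rootGroupBarrierSketch(unsigned int *counter) {
  unsigned int totalWgroups = gridDim.x * gridDim.y * gridDim.z;
  if (threadIdx.x + threadIdx.y + threadIdx.z == 0) {
    __threadfence();        // release: publish this workgroup's prior writes
    atomicAdd(counter, 1u); // models atom.add.release.gpu.u32
    unsigned int current;
    do { // models the ld.acquire.gpu.u32 spin
      current = *(volatile unsigned int *)counter;
    } while (current % totalWgroups != 0);
    __threadfence(); // acquire: order the spin load before later reads
  }
  __syncthreads(); // group-wide barrier.sync releases the remaining threads
}

Because the counter is never reset, each invocation simply advances it to the
next multiple of `totalWgroups`, which is what the modulo test relies on.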
konradkusiak97 authored Aug 1, 2024
1 parent 6532637 commit 132f763
Showing 2 changed files with 54 additions and 2 deletions.
44 changes: 44 additions & 0 deletions libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl
@@ -45,10 +45,54 @@ _CLC_OVERLOAD _CLC_DEF void __spirv_MemoryBarrier(unsigned int memory,
_CLC_OVERLOAD _CLC_DEF _CLC_CONVERGENT void
__spirv_ControlBarrier(unsigned int scope, unsigned int memory,
                       unsigned int semantics) {
  unsigned int order = semantics & 0x1F;
  if (scope == Subgroup) {
    // use a full mask as barriers are required to be convergent and exited
    // threads can safely be in the mask
    __nvvm_bar_warp_sync(0xFFFFFFFF);
  } else if (scope == Device && memory == Device &&
             order == SequentiallyConsistent &&
             __clc_nvvm_reflect_arch() >= 700) {
    unsigned int env1, env2;
    __asm__ __volatile__("mov.u32 %0, %%envreg1;" : "=r"(env1));
    __asm__ __volatile__("mov.u32 %0, %%envreg2;" : "=r"(env2));
    long long envreg1 = env1;
    long long envreg2 = env2;
    // Bit field insert operation. Place 32 bits of envreg2 next to 32 bits
    // of envreg1: s64[envreg2][envreg1]. The resulting value is the address
    // in the device global memory region where atomic operations can be
    // performed.
    long long atomicAddr;
    __asm__ __volatile__("bfi.b64 %0, %1, %2, 32, 32;"
                         : "=l"(atomicAddr)
                         : "l"(envreg1), "l"(envreg2));
    if (!atomicAddr) {
      __builtin_trap();
    } else {
      unsigned int tidX = __nvvm_read_ptx_sreg_tid_x();
      unsigned int tidY = __nvvm_read_ptx_sreg_tid_y();
      unsigned int tidZ = __nvvm_read_ptx_sreg_tid_z();
      if (tidX + tidY + tidZ == 0) {
        // Increment address by 4 to get the precise region initialized to 0.
        atomicAddr += 4;
        unsigned int nctaidX = __nvvm_read_ptx_sreg_nctaid_x();
        unsigned int nctaidY = __nvvm_read_ptx_sreg_nctaid_y();
        unsigned int nctaidZ = __nvvm_read_ptx_sreg_nctaid_z();
        unsigned int totalNctaid = nctaidX * nctaidY * nctaidZ;

        // Do atomic.add(1) for each CTA and spin on ld.acquire in a loop
        // until all CTAs have performed the addition.
        unsigned int prev, current;
        __asm__ __volatile__("atom.add.release.gpu.u32 %0,[%1],1;"
                             : "=r"(prev)
                             : "l"(atomicAddr));
        do {
          __asm__ __volatile__("ld.acquire.gpu.u32 %0,[%1];"
                               : "=r"(current)
                               : "l"(atomicAddr));
        } while (current % totalNctaid != 0);
      }
      __nvvm_barrier_sync(0);
    }
  } else {
    __syncthreads();
  }
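
For reference, `bfi.b64 d, a, b, pos, len` inserts the low `len` bits of `a`
into `b` starting at bit `pos`, so the instance above concatenates the two
32-bit environment registers into a single 64-bit address. A plain-C model of
that instance (an illustrative helper, not part of the patch):

#include <stdint.h>

// Model of "bfi.b64 d, a, b, 32, 32": keep b's low 32 bits and replace its
// high 32 bits with the low 32 bits of a.
static inline int64_t bfi_b64_32_32(int64_t a, int64_t b) {
  return (b & 0xFFFFFFFFll) | ((a & 0xFFFFFFFFll) << 32);
}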
12 changes: 10 additions & 2 deletions sycl/test-e2e/GroupAlgorithm/root_group.cpp
@@ -1,6 +1,6 @@
// Fails with opencl non-cpu, enable when fixed.
// XFAIL: (opencl && !cpu && !accelerator)
// RUN: %{build} -I . -o %t.out
// RUN: %{build} -I . -o %t.out %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
// RUN: %{run} %t.out

// Disabled temporarily while investigation into the failure is ongoing.
@@ -10,6 +10,7 @@
#include <cstdlib>
#include <type_traits>

#include <sycl/builtins.hpp>
#include <sycl/detail/core.hpp>
#include <sycl/ext/oneapi/experimental/root_group.hpp>
#include <sycl/group_barrier.hpp>
@@ -53,10 +54,17 @@ void testRootGroup() {
  sycl::accessor data{dataBuf, h};
  h.parallel_for<
      class RootGroupKernel>(range, props, [=](sycl::nd_item<1> it) {
    volatile float X = 1.0f;
    volatile float Y = 1.0f;
    auto root = it.ext_oneapi_get_root_group();
    data[root.get_local_id()] = root.get_local_id();
    sycl::group_barrier(root);

    // Delay half of the workgroups with extra work to check that the barrier
    // synchronizes the whole device.
    if (it.get_group(0) % 2 == 0) {
      X += sycl::sin(X);
      Y += sycl::cos(Y);
    }
    root =
        sycl::ext::oneapi::experimental::this_work_item::get_root_group<1>();
    int sum = data[root.get_local_id()] +
