diff --git a/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl b/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl
index 1cf3fb8750c2..eb011986659e 100644
--- a/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl
+++ b/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl
@@ -45,10 +45,54 @@ _CLC_OVERLOAD _CLC_DEF void __spirv_MemoryBarrier(unsigned int memory,
 _CLC_OVERLOAD _CLC_DEF _CLC_CONVERGENT void
 __spirv_ControlBarrier(unsigned int scope, unsigned int memory,
                        unsigned int semantics) {
+  unsigned int order = semantics & 0x1F;
   if (scope == Subgroup) {
     // use a full mask as barriers are required to be convergent and exited
     // threads can safely be in the mask
     __nvvm_bar_warp_sync(0xFFFFFFFF);
+  } else if (scope == Device && memory == Device &&
+             order == SequentiallyConsistent &&
+             __clc_nvvm_reflect_arch() >= 700) {
+    unsigned int env1, env2;
+    __asm__ __volatile__("mov.u32 %0, %%envreg1;" : "=r"(env1));
+    __asm__ __volatile__("mov.u32 %0, %%envreg2;" : "=r"(env2));
+    long long envreg1 = env1;
+    long long envreg2 = env2;
+    // Bit field insert operation. Place 32 bits of envreg2 next to 32 bits of
+    // envreg1: s64[envreg2][envreg1]. The resulting value is the address in
+    // device global memory region, where atomic operations can be performed.
+    long long atomicAddr;
+    __asm__ __volatile__("bfi.b64 %0, %1, %2, 32, 32;"
+                         : "=l"(atomicAddr)
+                         : "l"(envreg1), "l"(envreg2));
+    if (!atomicAddr) {
+      __builtin_trap();
+    } else {
+      unsigned int tidX = __nvvm_read_ptx_sreg_tid_x();
+      unsigned int tidY = __nvvm_read_ptx_sreg_tid_y();
+      unsigned int tidZ = __nvvm_read_ptx_sreg_tid_z();
+      if (tidX + tidY + tidZ == 0) {
+        // Increment address by 4 to get the precise region initialized to 0.
+        atomicAddr += 4;
+        unsigned int nctaidX = __nvvm_read_ptx_sreg_nctaid_x();
+        unsigned int nctaidY = __nvvm_read_ptx_sreg_nctaid_y();
+        unsigned int nctaidZ = __nvvm_read_ptx_sreg_nctaid_z();
+        unsigned int totalNctaid = nctaidX * nctaidY * nctaidZ;
+
+        // Do atomic.add(1) for each CTA and spin ld.acquire in a loop until all
+        // CTAs have performed the addition
+        unsigned int prev, current;
+        __asm__ __volatile__("atom.add.release.gpu.u32 %0,[%1],1;"
+                             : "=r"(prev)
+                             : "l"(atomicAddr));
+        do {
+          __asm__ __volatile__("ld.acquire.gpu.u32 %0,[%1];"
+                               : "=r"(current)
+                               : "l"(atomicAddr));
+        } while (current % totalNctaid != 0);
+      }
+      __nvvm_barrier_sync(0);
+    }
   } else {
     __syncthreads();
   }
diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
index 983f8e7ca003..d8393f35c625 100644
--- a/sycl/test-e2e/GroupAlgorithm/root_group.cpp
+++ b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
@@ -1,6 +1,6 @@
 // Fails with opencl non-cpu, enable when fixed.
 // XFAIL: (opencl && !cpu && !accelerator)
-// RUN: %{build} -I . -o %t.out
+// RUN: %{build} -I . -o %t.out %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
 // RUN: %{run} %t.out
 
 // Disabled temporarily while investigation into the failure is ongoing.
@@ -10,6 +10,7 @@
 #include
 #include
 
+#include
 #include
 #include
 #include
@@ -53,10 +54,17 @@ void testRootGroup() {
   sycl::accessor data{dataBuf, h};
   h.parallel_for<
       class RootGroupKernel>(range, props, [=](sycl::nd_item<1> it) {
+    volatile float X = 1.0f;
+    volatile float Y = 1.0f;
     auto root = it.ext_oneapi_get_root_group();
     data[root.get_local_id()] = root.get_local_id();
     sycl::group_barrier(root);
-
+    // Delay half of the workgroups with extra work to check that the barrier
+    // synchronizes the whole device.
+    if (it.get_group(0) % 2 == 0) {
+      X += sycl::sin(X);
+      Y += sycl::cos(Y);
+    }
     root =
         sycl::ext::oneapi::experimental::this_work_item::get_root_group<1>();
     int sum = data[root.get_local_id()] +
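
For reference, the grid-wide barrier scheme implemented in barrier.cl above can be sketched in standalone CUDA: one thread per CTA increments a device-global counter with release semantics and spins on acquire reads until every CTA of the grid has arrived. This is an illustration only, not part of the patch; the names (grid_barrier, arrival_counter) are hypothetical, __threadfence()/atomicAdd stand in for the atom.add.release/ld.acquire PTX emitted by the patch, and it assumes all CTAs of the grid are co-resident on the device, as the sm_70 root-group path does.

// grid_barrier_sketch.cu -- illustrative sketch only, not part of this patch.
#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical helper mirroring the scheme in barrier.cl: thread 0 of each
// CTA bumps a device-global counter, then spins until the counter shows that
// every CTA of the grid has arrived. The modulo test lets the same counter be
// reused for subsequent barriers, as in the patch.
__device__ void grid_barrier(unsigned int *arrival_counter) {
  unsigned int total_ctas = gridDim.x * gridDim.y * gridDim.z;
  __syncthreads();                   // whole CTA has reached the barrier
  if (threadIdx.x + threadIdx.y + threadIdx.z == 0) {
    __threadfence();                 // release: publish prior writes device-wide
    atomicAdd(arrival_counter, 1u);  // signal this CTA's arrival
    // Spin on an atomic read until all CTAs have incremented the counter.
    while (atomicAdd(arrival_counter, 0u) % total_ctas != 0)
      ;
    __threadfence();                 // acquire: order reads issued after the barrier
  }
  __syncthreads();                   // release the remaining threads of the CTA
}

__global__ void kernel(unsigned int *arrival_counter, int *data) {
  if (threadIdx.x == 0)
    data[blockIdx.x] = (int)blockIdx.x;  // each CTA publishes one value
  grid_barrier(arrival_counter);
  // After the barrier any CTA may read data written by any other CTA.
  if (threadIdx.x == 0 && blockIdx.x == 0)
    printf("last CTA wrote %d\n", data[gridDim.x - 1]);
}

int main() {
  const int num_ctas = 8;  // all CTAs must be resident on the device at once
  unsigned int *counter;
  int *data;
  cudaMalloc(&counter, sizeof(unsigned int));
  cudaMemset(counter, 0, sizeof(unsigned int));
  cudaMalloc(&data, num_ctas * sizeof(int));
  kernel<<<num_ctas, 64>>>(counter, data);
  cudaDeviceSynchronize();
  cudaFree(data);
  cudaFree(counter);
  return 0;
}

In ordinary application code, CUDA cooperative groups (cudaLaunchCooperativeKernel plus cooperative_groups::this_grid().sync()) are the supported way to get the same grid-wide synchronization.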