pytorch
diff --git a/Diff for: ‎fbgemm_gpu/include/fbgemm_gpu/utils/kernel_launch.cuh
+122 b/Diff for: ‎fbgemm_gpu/include/fbgemm_gpu/utils/kernel_launch.cuh
+122
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <ATen/ATen.h>
+#include <iostream>
+#include <type_traits>
+#include "fbgemm_gpu/utils/tensor_accessor_builder.h"
+
+namespace fbgemm_gpu::utils {
+
+// Helper that widens a value to uint64_t. It is used further down in this
+// header so that the product of the launch dimensions is computed in 64-bit
+// arithmetic, and it is #undef'd again before the namespace closes.
+#define U64(x) static_cast<uint64_t>(x)
+
+////////////////////////////////////////////////////////////////////////////////
+// Helpers to detect TensorAccessorBuilder type (regardless of template params)
+////////////////////////////////////////////////////////////////////////////////
+
+// Type trait: reports whether a type is some instantiation of
+// TensorAccessorBuilder, whatever its template arguments are.
+//
+// Primary template - any type that the partial specialization below does not
+// match is reported as "not a builder".
+template <typename Other>
+struct is_tensor_accessor_builder : std::bool_constant<false> {};
+
+// Partial specialization - matches TensorAccessorBuilder<T, N, INB, P, PT>
+// for any choice of the five template arguments and reports it as a builder.
+template <
+    typename T,
+    size_t N,
+    size_t INB,
+    bool P,
+    template <typename> class PT>
+struct is_tensor_accessor_builder<TensorAccessorBuilder<T, N, INB, P, PT>>
+    : std::bool_constant<true> {};
+
+// Convenience variable template mirroring the trait's ::value member.
+template <typename Candidate>
+inline constexpr bool is_tensor_accessor_builder_v =
+    is_tensor_accessor_builder<Candidate>::value;
+
+////////////////////////////////////////////////////////////////////////////////
+// Transform Kernel Argument
+//
+// Transform certain arguments before passing them to the kernel invocation
+////////////////////////////////////////////////////////////////////////////////
+
+// Prepares a single launch argument for the kernel call.
+//
+//   context - label describing the launch; only consulted when the argument
+//             is a TensorAccessorBuilder and FBGEMM_GPU_MEMCHECK is defined.
+//   arg     - the argument as supplied by the caller.
+//
+// Returns whatever arg.build(...) yields when arg is a TensorAccessorBuilder;
+// otherwise returns arg itself, perfectly forwarded.
+template <typename T>
+decltype(auto) transform_kernel_arg(const std::string_view& context, T&& arg) {
+  using ArgT = std::decay_t<T>;
+
+  if constexpr (!is_tensor_accessor_builder_v<ArgT>) {
+    // Not a builder: pass the argument through with its value category intact.
+    return std::forward<T>(arg);
+  } else {
+    // A builder is expanded into the accessor it describes. When compiled
+    // with FBGEMM_GPU_MEMCHECK turned ON, the context string is handed to
+    // build(); per the original note, this is the mechanism that lets failed
+    // checks and assertions log the kernel function name.
+    //
+    // NOTE(review): context.data() is passed as a raw pointer, and a
+    // string_view is not required to be null-terminated - confirm that every
+    // caller supplies a terminated string (e.g. a string literal).
+#ifdef FBGEMM_GPU_MEMCHECK
+    return arg.build(context.data());
+#else
+    return arg.build();
+#endif
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Launch the kernel with all the ceremonial routines
+////////////////////////////////////////////////////////////////////////////////
+
+// Launches `kernel` with the given execution configuration and checks the
+// launch for errors afterwards.
+//
+//   context - label for this launch; it appears in the ROCm size-check error
+//             message and is handed to transform_kernel_arg for each argument.
+//   kernel  - the kernel to invoke with the <<<...>>> launch syntax.
+//   grid    - grid dimensions (first launch parameter).
+//   block   - block dimensions (second launch parameter).
+//   Ns      - third launch parameter (dynamic shared memory size).
+//   stream  - stream on which the kernel is launched.
+//   args    - kernel arguments; each one is passed through
+//             transform_kernel_arg before reaching the kernel.
+template <typename KernelFunc, typename... Args>
+inline void launch_kernel(
+    const std::string_view& context,
+    const KernelFunc& kernel,
+    const dim3 grid,
+    const dim3 block,
+    const size_t Ns,
+    cudaStream_t stream,
+    Args&&... args) {
+#ifdef USE_ROCM
+  // Per the original note, ROCm has a limit of 2^32 elements per kernel
+  // launch but, unlike CUDA, doesn't automatically work around the problem;
+  // see:
+  //  https://github.com/ROCm/hip/issues/2253
+  //
+  // The total thread count is therefore computed in 64-bit arithmetic and
+  // validated before launching.
+  //
+  // NOTE(review): the bound compared against is 0xFFFFFFFF (2^32 - 1), so a
+  // launch of exactly 2^32 - 1 threads is rejected as well, although the
+  // message speaks of 2^32 - confirm which bound is intended.
+  const uint64_t blocks_in_grid = U64(grid.x) * U64(grid.y) * U64(grid.z);
+  const uint64_t threads_in_block =
+      U64(block.x) * U64(block.y) * U64(block.z);
+  const uint64_t total_threads = blocks_in_grid * threads_in_block;
+
+  TORCH_CHECK(
+      total_threads < U64(0xFFFFFFFF),
+      "[ ",
+      context,
+      " ]: ",
+      "Kernel launch grid size ",
+      total_threads,
+      " is greater than the ROCm limit of 2^32");
+#endif
+
+  // Every argument is routed through transform_kernel_arg (together with the
+  // context, for debugging) before it is forwarded to the kernel.
+  kernel<<<grid, block, Ns, stream>>>(
+      transform_kernel_arg(context, std::forward<Args>(args))...);
+
+  // Check for CUDA errors raised by the launch.
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
+// U64 is only a local helper for this header; remove it so the macro does not
+// leak into files that include this one.
+#undef U64
+
+} // namespace fbgemm_gpu::utils
+
+// Launches KERNEL through fbgemm_gpu::utils::launch_kernel on the current
+// CUDA stream, with 0 passed as the third launch parameter and the
+// stringified KERNEL expression used as the debugging context.
+//
+//   KERNEL - the kernel to launch (also stringified to form the context).
+//   GRID   - grid dimensions.
+//   BLOCK  - block dimensions.
+//   ...    - the kernel arguments.
+//
+// The constexpr reference to the kernel is added to enable better compilation
+// error messages upon template mismatch.
+//
+// The expansion is enclosed in its own brace scope so that the helper
+// reference does not leak into the caller's scope. This allows the macro to
+// be used several times in the same block, and keeps both statements together
+// when the macro is the body of an unbraced `if` or a `case` label. A plain
+// brace block (rather than do/while(0)) is used on purpose so that call sites
+// compile with or without a trailing semicolon. The reference has a
+// deliberately unusual name so that it cannot shadow a kernel or argument
+// that happens to be called `kernel`.
+#define FBGEMM_LAUNCH_KERNEL(KERNEL, GRID, BLOCK, ...)        \
+  {                                                           \
+    constexpr decltype(KERNEL)& fbgemm_kernel_ref_ = KERNEL;  \
+    fbgemm_gpu::utils::launch_kernel(                         \
+        #KERNEL,                                              \
+        fbgemm_kernel_ref_,                                   \
+        GRID,                                                 \
+        BLOCK,                                                \
+        0,                                                    \
+        at::cuda::getCurrentCUDAStream(),                     \
+        __VA_ARGS__);                                         \
+  }