From 3c5a3dc9cae6552fc6c58659d75077055f899126 Mon Sep 17 00:00:00 2001
From: Dragan Mladjenovic <[email protected]>
Date: Wed, 19 Feb 2025 10:36:32 -0600
Subject: [PATCH 5/6] [ROCm] Pass correct warp size to Triton pipeline

---
 xla/backends/gpu/codegen/triton/compilation_pipeline.h | 2 +-
 .../gpu/codegen/triton/compilation_pipeline_cuda.cc | 6 ++----
 .../gpu/codegen/triton/compilation_pipeline_rocm.cc | 7 +++----
 .../gpu/codegen/triton/compilation_pipeline_stub.cc | 2 +-
 xla/backends/gpu/codegen/triton/fusion_emitter.cc | 4 +---
 .../gpu/codegen/triton/fusion_emitter_stub_test.cc | 2 +-
 xla/service/gpu/ir_emitter_unnested.cc | 7 ++++---
 7 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/xla/backends/gpu/codegen/triton/compilation_pipeline.h b/xla/backends/gpu/codegen/triton/compilation_pipeline.h
index 9acd6fee99..c9e65798a5 100644
--- a/xla/backends/gpu/codegen/triton/compilation_pipeline.h
+++ b/xla/backends/gpu/codegen/triton/compilation_pipeline.h
@@ -41,7 +41,7 @@ namespace gpu {
 // parameter which would give a hint to Triton which cluster dims we prefer to
 // use, but that's not the case currently.
 absl::Status CreateTritonPipeline(
- mlir::OpPassManager* pm, std::string arch_name, int num_warps, int num_ctas,
+ mlir::OpPassManager* pm, const se::DeviceDescription& device_info, int num_warps, int num_ctas,
 int num_stages, mlir::triton::nvidia_gpu::ClusterInfo& out_cluster_info,
 bool is_xla_fusion);

diff --git a/xla/backends/gpu/codegen/triton/compilation_pipeline_cuda.cc b/xla/backends/gpu/codegen/triton/compilation_pipeline_cuda.cc
index b57300ea88..e0fcf5bfd1 100644
--- a/xla/backends/gpu/codegen/triton/compilation_pipeline_cuda.cc
+++ b/xla/backends/gpu/codegen/triton/compilation_pipeline_cuda.cc
@@ -43,13 +43,11 @@ namespace mt = ::mlir::triton;
 namespace mt_xla = ::mlir::triton::xla;

 absl::Status CreateTritonPipeline(mlir::OpPassManager* pm,
- std::string arch_name, int num_warps,
+ const se::DeviceDescription& device_info, int num_warps,
 int num_ctas, int num_stages,
 mt::nvidia_gpu::ClusterInfo& out_cluster_info,
 bool is_xla_fusion) {
- TF_ASSIGN_OR_RETURN(
- const stream_executor::CudaComputeCapability cc,
- stream_executor::CudaComputeCapability::FromString(arch_name));
+ auto cc = device_info.cuda_compute_capability();
 const int ccAsInt = cc.major * 10 + cc.minor;
 const int threadsPerWarp = 32;

diff --git a/xla/backends/gpu/codegen/triton/compilation_pipeline_rocm.cc b/xla/backends/gpu/codegen/triton/compilation_pipeline_rocm.cc
index 03fc4bb230..64a493ed2b 100644
--- a/xla/backends/gpu/codegen/triton/compilation_pipeline_rocm.cc
+++ b/xla/backends/gpu/codegen/triton/compilation_pipeline_rocm.cc
@@ -58,13 +58,12 @@ using ::mlir::Value;
 using mlir::ValueRange;

 absl::Status CreateTritonPipeline(mlir::OpPassManager* pm,
- std::string arch_name, int num_warps,
+ const se::DeviceDescription& device_info, int num_warps,
 int num_ctas, int num_stages,
 mt::nvidia_gpu::ClusterInfo& out_cluster_info,
 bool is_xla_fusion) {
- // TODO(ROCm): Check why some test fail when threadsPerWarp is set to 64.
- const int threadsPerWarp = 32;
- auto cc = se::RocmComputeCapability(std::move(arch_name));
+ const int threadsPerWarp = device_info.threads_per_warp();
+ auto cc = device_info.rocm_compute_capability();

 if (is_xla_fusion) {
 pm->addPass(mt_xla::CreateInt4ToPackedInt4RewritePass());
diff --git a/xla/backends/gpu/codegen/triton/compilation_pipeline_stub.cc b/xla/backends/gpu/codegen/triton/compilation_pipeline_stub.cc
index d91acda7f5..ce7517a6b5 100644
--- a/xla/backends/gpu/codegen/triton/compilation_pipeline_stub.cc
+++ b/xla/backends/gpu/codegen/triton/compilation_pipeline_stub.cc
@@ -23,7 +23,7 @@ namespace xla {
 namespace gpu {

 absl::Status CreateTritonPipeline(
- mlir::OpPassManager* pm, std::string arch_name, int num_warps, int num_ctas,
+ mlir::OpPassManager* pm, const se::DeviceDescription& device_info, int num_warps, int num_ctas,
 int num_stages, mlir::triton::nvidia_gpu::ClusterInfo& out_cluster_info,
 bool is_xla_fusion) {
 return absl::UnimplementedError("not supported for this build configuration");
diff --git a/xla/backends/gpu/codegen/triton/fusion_emitter.cc b/xla/backends/gpu/codegen/triton/fusion_emitter.cc
index 02644b9dc4..d164ffa9e4 100644
--- a/xla/backends/gpu/codegen/triton/fusion_emitter.cc
+++ b/xla/backends/gpu/codegen/triton/fusion_emitter.cc
@@ -1544,8 +1544,6 @@ absl::StatusOr<TritonWrapperResult> CompileTritonToLLVM(
 mlir::ModuleOp triton_module, llvm::Module* llvm_module,
 mlir::MLIRContext& mlir_context, bool is_xla_fusion, bool emit_kernel) {
 const auto& cc = device_info.gpu_compute_capability();
- std::string arch_name =
- std::visit([](auto& cc) { return cc.ToString(); }, cc);
 if (std::holds_alternative<se::CudaComputeCapability>(cc)) {
 auto ccCuda = std::get<se::CudaComputeCapability>(cc);
 if (!ccCuda.IsAtLeastAmpere()) {
@@ -1606,7 +1604,7 @@ absl::StatusOr<TritonWrapperResult> CompileTritonToLLVM(
 pm.addPass(CreateConvertIndexTypePass());

 mlir::triton::nvidia_gpu::ClusterInfo cluster_info;
- if (!CreateTritonPipeline(&pm, arch_name, block_level_parameters.num_warps,
+ if (!CreateTritonPipeline(&pm, device_info, block_level_parameters.num_warps,
 block_level_parameters.num_ctas,
 block_level_parameters.num_stages, cluster_info,
 is_xla_fusion)
diff --git a/xla/backends/gpu/codegen/triton/fusion_emitter_stub_test.cc b/xla/backends/gpu/codegen/triton/fusion_emitter_stub_test.cc
index 20accf012b..26b0d91fee 100644
--- a/xla/backends/gpu/codegen/triton/fusion_emitter_stub_test.cc
+++ b/xla/backends/gpu/codegen/triton/fusion_emitter_stub_test.cc
@@ -51,7 +51,7 @@ TEST(TritonStub, CallStubApi) {
 mlir::OpPassManager pm;
 ::mlir::triton::nvidia_gpu::ClusterInfo cluster_info;

- EXPECT_FALSE(CreateTritonPipeline(&pm, "", 1, 1, 1, cluster_info,
+ EXPECT_FALSE(CreateTritonPipeline(&pm, {}, 1, 1, 1, cluster_info,
 /*is_xla_fusion=*/true)
 .ok());
 EXPECT_EQ(GetLibdevicePath({}, {}), "");
diff --git a/xla/service/gpu/ir_emitter_unnested.cc b/xla/service/gpu/ir_emitter_unnested.cc
index fcedefa8f3..75d970b7ae 100644
--- a/xla/service/gpu/ir_emitter_unnested.cc
+++ b/xla/service/gpu/ir_emitter_unnested.cc
@@ -1434,9 +1434,10 @@ absl::Status IrEmitterUnnested::EmitTritonCustomCall(
 KernelArguments::Create(ir_emitter_context_->buffer_assignment(), instr,
 instr->operands(),
 /*dedup=*/false));
- auto launch_dimensions =
- LaunchDimensions(se::BlockDim(call.grid_x, call.grid_y, call.grid_z),
- se::ThreadDim(call.num_warps * 32));
+ auto launch_dimensions = LaunchDimensions(
+ se::BlockDim(call.grid_x, call.grid_y, call.grid_z),
+ se::ThreadDim(call.num_warps *
+ ir_emitter_context_->gpu_device_info().threads_per_warp()));

 std::string sanitized_kernel_name =
 GetSanitizedUniqueName(*ir_emitter_context_, kernel_name);
-- 
2.43.0
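Not part of the patch: a minimal, self-contained C++ sketch of the arithmetic the fix changes. FakeDeviceDescription is a hypothetical stand-in for se::DeviceDescription; only its threads_per_warp() accessor mirrors the real API used in the hunks above. The point is that launch dimensions derived from a hardcoded 32 under-count threads on ROCm parts whose wavefront size is 64, which is why EmitTritonCustomCall now multiplies num_warps by the device's reported warp size.

#include <cstdint>
#include <iostream>

// Hypothetical stand-in for se::DeviceDescription; only threads_per_warp()
// mirrors the accessor the patch relies on.
struct FakeDeviceDescription {
  int warp_size = 32;
  int threads_per_warp() const { return warp_size; }
};

// Threads per block as computed after the fix: num_warps times the device's
// warp size, instead of the previous hardcoded num_warps * 32.
int64_t ThreadDimX(const FakeDeviceDescription& device_info, int num_warps) {
  return static_cast<int64_t>(num_warps) * device_info.threads_per_warp();
}

int main() {
  FakeDeviceDescription cuda{/*warp_size=*/32};
  FakeDeviceDescription rocm{/*warp_size=*/64};  // typical ROCm wavefront size
  constexpr int kNumWarps = 4;
  std::cout << "CUDA threads per block: " << ThreadDimX(cuda, kNumWarps) << "\n";  // 128
  std::cout << "ROCm threads per block: " << ThreadDimX(rocm, kNumWarps) << "\n";  // 256
  // The pre-patch computation always yielded num_warps * 32 (128 here),
  // under-sizing thread blocks for kernels compiled with threadsPerWarp = 64.
  return 0;
}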