[XLA] Compute GPU stats in postsubmit and update run schedule for nightly jobs

juliagmt-google · Google-ML-Automation · commit 47247e8afaa7 · 2025-03-11T10:48:05.000-07:00
PiperOrigin-RevId: 735808871
diff --git a/.github/workflows/benchmark_postsubmit.yml b/.github/workflows/benchmark_postsubmit.yml
@@ -0,0 +1,135 @@
+# Copyright 2025 The OpenXLA Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+# Postsubmit benchmarks: run for every push to main, or on manual dispatch.
+name: Postsubmit Benchmarks
+permissions:
+  contents: read
+on:
+  workflow_dispatch:
+    inputs:
+      halt-for-connection:
+        description: 'Should this workflow run wait for a remote connection?'
+        type: choice
+        required: true
+        default: 'no'
+        options:
+        - 'yes'
+        - 'no'
+  push:
+    branches:
+      - main
+
+concurrency:
+  # head_ref is only set for pull_request events; push and manual runs fall
+  # back to the full ref, so each branch gets its own group.
+  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
+  # github.ref is the fully qualified ref ("refs/heads/main", never "main").
+  # Keep in-progress runs on main alive; cancel superseded runs elsewhere.
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
+jobs:
+  Tests:
+    name: ${{ matrix.job_info.pretty_name }}
+    runs-on: ${{ matrix.job_info.pool }}
+    container: ${{ matrix.job_info.container }}
+    timeout-minutes: 10
+    defaults:
+      run:
+        shell: bash
+    strategy:
+      # Let every matrix entry run to completion so that one failing build
+      # does not hide the results of the others.
+      fail-fast: false
+      matrix:
+        # One entry per benchmark machine:
+        #   pool           - runner label used for runs-on
+        #   container      - image the job's steps run in
+        #   pretty_name    - job display name; the "Run build.py" step also
+        #                    passes it (plus "_github_actions") as --build,
+        #                    so it is an identifier, not just a label
+        #   bazel_arch_dir - bazel-out subdirectory holding the built binaries
+        #   platform       - "CPU" or "GPU"; picks the branch in "Run HLO tests"
+        job_info:
+          - pool: "linux-x86-n2-16"
+            container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest"
+            pretty_name: "XLA Linux x86 CPU 16 vcpu Presubmit"
+            bazel_arch_dir: "k8-opt"
+            platform: "CPU"
+          - pool: "linux-arm64-c4a-16"
+            container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build-arm64:latest"
+            pretty_name: "XLA Linux ARM64 CPU 16 vcpu Presubmit"
+            bazel_arch_dir: "aarch64-opt"
+            platform: "CPU"
+          - pool: "linux-x86-n2-128"
+            container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest"
+            pretty_name: "XLA Linux x86 CPU 128 vcpu Presubmit"
+            bazel_arch_dir: "k8-opt"
+            platform: "CPU"
+          # NOTE(review): the pool label says "l4" while pretty_name says "T4";
+          # confirm which GPU this runner actually provides.
+          - pool: "linux-x86-g2-16-l4-1gpu"
+            container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest"
+            pretty_name: "XLA Linux x86 GPU T4 16 vcpu Presubmit"
+            bazel_arch_dir: "k8-opt"
+            platform: "GPU"
+    steps:
+      # Log the runner's hardware and OS details for later debugging.
+      - name: Print machine specs
+        run: |
+          # CPU information
+          lscpu
+          # Memory information
+          free -h
+          # Disk space information
+          df -h
+          # Kernel information
+          uname -a
+
+      # Dump the run's GitHub context into the log for debugging.
+      # Free-form, externally influenced values (the event payload, PR branch
+      # names) are handed over as environment variables instead of being
+      # expanded into the script text: the payload contains quotes and may
+      # contain $(...) or backticks (e.g. in commit messages), which would
+      # otherwise break the echo or be executed by the shell.
+      - name: Print GitHub Context
+        env:
+          EVENT_JSON: ${{ toJSON(github.event) }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          PR_HEAD_REF: ${{ github.event.pull_request.head.ref }}
+          PR_BASE_REF: ${{ github.event.pull_request.base.ref }}
+        run: |
+          echo "GitHub SHA: ${{ github.sha }}"
+          echo "GitHub Ref: ${{ github.ref }}"
+          echo "GitHub Ref Name: ${{ github.ref_name }}"
+          echo "GitHub Repository: ${{ github.repository }}"
+          echo "GitHub Run ID: ${{ github.run_id }}"
+          echo "GitHub Run Number: ${{ github.run_number }}"
+          echo "GitHub Workflow: ${{ github.workflow }}"
+          echo "GitHub Actor: ${{ github.actor }}"
+          echo "GitHub Event Name: ${{ github.event_name }}"
+          echo "GitHub Event: $EVENT_JSON"
+          # Only reached if a pull_request trigger is added to this workflow.
+          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
+            echo "Pull Request Number: $PR_NUMBER"
+            echo "Pull Request Head Ref: $PR_HEAD_REF"
+            echo "Pull Request Base Ref: $PR_BASE_REF"
+          fi
+
+      # Fetch the repository sources. The action is pinned to a full commit
+      # SHA; the trailing comment records the release tag it corresponds to.
+      - name: Checkout OpenXLA
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+
+      # build.py selects what to build from the --build value, which is the
+      # matrix entry's pretty_name with "_github_actions" appended.
+      - name: "Run build.py"
+        run: |
+          ./build_tools/ci/build.py --build="${{ matrix.job_info.pretty_name }}_github_actions"
+
+      # Run the corresponding HLO tests based on platform.
+      # CPU: run_hlo_module on the CPU test module.
+      # GPU: hlo_runner_main_gpu with profiling enabled, then
+      #      compute_gpu_device_stats_main_gpu on the XSpace profile it dumped.
+      - name: Run HLO tests
+        run: |
+          platform="${{ matrix.job_info.platform }}"
+          # Tool binaries are looked up under the matrix entry's bazel-out dir.
+          tools_dir="./bazel-out/${{ matrix.job_info.bazel_arch_dir }}/bin/xla/tools"
+
+          if [[ "$platform" == "CPU" ]]; then
+            binary_path="$tools_dir/run_hlo_module"
+            test_hlo_file="xla/tools/hlo_opt/tests/cpu_hlo.hlo"
+            echo "Running CPU test with binary: $binary_path"
+            "$binary_path" --input_format=hlo --reference_platform="" --platform="$platform" "$test_hlo_file"
+          elif [[ "$platform" == "GPU" ]]; then
+            test_hlo_file="xla/tools/hlo_opt/tests/gpu_hlo_backend.hlo"
+            # Absolute path of the profile dump. Built with $(pwd) in a dedicated
+            # variable; PWD itself is maintained by bash and is not reassigned.
+            xspace_file="$(pwd)/${test_hlo_file}_xspace.pb"
+            echo "Running GPU test with binary: $tools_dir"
+            "$tools_dir/multihost_hlo_runner/hlo_runner_main_gpu" --device_type=gpu --log_output=True --use_spmd_partitioning --profile_execution=True --xla_gpu_dump_xspace_to="$xspace_file" "$test_hlo_file"
+            "$tools_dir/compute_gpu_device_stats_main_gpu" --input="$xspace_file"
+          else
+            echo "Unsupported platform: $platform"
+            exit 1
+          fi
diff --git a/.github/workflows/benchmark_presubmit.yml b/.github/workflows/benchmark_presubmit.yml
@@ -27,7 +27,6 @@ on:
         - 'yes'
         - 'no'
   pull_request:
-  push:
     branches:
       - main
 
diff --git a/.github/workflows/cpu_benchmarks_nightly.yml b/.github/workflows/cpu_benchmarks_nightly.yml
@@ -18,7 +18,7 @@ permissions:
 on:
   workflow_dispatch:  # Allows manual triggering
   schedule:
-    - cron: '0 */6 * * *'  # Run every 6 hours (at minute 0 of hours 0, 6, 12, 18)
+    - cron: "0 0 * * *"  # Run once a day at 00:00 UTC (GitHub evaluates schedules in UTC)
 
 jobs:
   Tests:
diff --git a/.github/workflows/gpu_benchmarks_nightly.yml b/.github/workflows/gpu_benchmarks_nightly.yml
@@ -18,7 +18,8 @@ permissions:
 on:
   workflow_dispatch:  # Allows manual triggering
   schedule:
-    - cron: '0 */6 * * *'  # Run every 6 hours (at minute 0 of hours 0, 6, 12, 18)
+    - cron: "0 0 * * *"  # Run once a day at 00:00 UTC (GitHub evaluates schedules in UTC)
+
 
 jobs:
   Tests:
diff --git a/build_tools/ci/build.py b/build_tools/ci/build.py
@@ -57,6 +57,7 @@
 )
 _XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS = (
     "//xla/tools/multihost_hlo_runner:hlo_runner_main_gpu",
+    "//xla/tools:compute_gpu_device_stats_main_gpu",
 )
 _KOKORO_ARTIFACTS_DIR = os.environ.get(
     "KOKORO_ARTIFACTS_DIR", "$KOKORO_ARTIFACTS_DIR"
diff --git a/build_tools/ci/golden_commands.txt b/build_tools/ci/golden_commands.txt
@@ -45,8 +45,8 @@ bazel analyze-profile profile.json.gz
 # END BuildType.XLA_LINUX_X86_CPU_GITHUB_ACTIONS
 # BEGIN BuildType.XLA_LINUX_X86_GPU_T4_16_VCPU_PRESUBMIT_GITHUB_ACTIONS
 nvidia-smi
-parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//build_tools/ci:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=false --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu
-bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//build_tools/ci:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=false --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu
+parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//build_tools/ci:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=false --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_gpu_device_stats_main_gpu
+bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//build_tools/ci:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=false --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_gpu_device_stats_main_gpu
 bazel analyze-profile profile.json.gz
 # END BuildType.XLA_LINUX_X86_GPU_T4_16_VCPU_PRESUBMIT_GITHUB_ACTIONS
 # BEGIN BuildType.XLA_LINUX_X86_GPU_T4_GITHUB_ACTIONS
diff --git a/xla/tools/BUILD b/xla/tools/BUILD
@@ -802,6 +802,20 @@ xla_cc_binary(
     ],
 )
 
+# Binary built from compute_gpu_device_stats_main.cc and tagged "gpu", a tag
+# the GPU benchmark build's --build_tag_filters include. The postsubmit
+# benchmark workflow runs it with --input=<xspace.pb> on the profile dumped by
+# hlo_runner_main_gpu.
+xla_cc_binary(
+    name = "compute_gpu_device_stats_main_gpu",
+    srcs = ["compute_gpu_device_stats_main.cc"],
+    tags = ["gpu"],
+    deps = [
+        ":compute_gpu_device_stats",
+        "//xla:debug_options_flags",
+        "//xla/tsl/util:command_line_flags",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/strings",
+        "@tsl//tsl/platform:platform_port",
+    ],
+)
+
 tsl_pybind_extension(
     name = "collective_perf_table_gen_bindings",
     srcs = ["collective_perf_table_gen_bindings.cc"],

Original file line number	Diff line number	Diff line change
`@@ -57,6 +57,7 @@`
`57`	`57`	`)`
`58`	`58`	`_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS = (`
`59`	`59`	`"//xla/tools/multihost_hlo_runner:hlo_runner_main_gpu",`
	`60`	`+ "//xla/tools:compute_gpu_device_stats_main_gpu",`
`60`	`61`	`)`
`61`	`62`	`_KOKORO_ARTIFACTS_DIR = os.environ.get(`
`62`	`63`	`"KOKORO_ARTIFACTS_DIR", "$KOKORO_ARTIFACTS_DIR"`