Skip to content

Commit 47247e8

Browse files
[XLA] Compute GPU stats in postsubmit and update run schedule for nightly jobs
PiperOrigin-RevId: 735808871
1 parent 822cd3b commit 47247e8

7 files changed

+155
-5
lines changed
+135
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
# Copyright 2025 The OpenXLA Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
# ============================================================================
15+
# Benchmark workflow that runs after changes land: it is triggered by pushes to
# `main` and by manual dispatch only (there is no pull_request trigger), so it
# is named as a postsubmit workflow. A distinct name also keeps
# `github.workflow` from colliding with the presubmit benchmark workflow.
name: Postsubmit Benchmarks
permissions:
  contents: read
on:
  workflow_dispatch:
    inputs:
      halt-for-connection:
        description: 'Should this workflow run wait for a remote connection?'
        type: choice
        required: true
        default: 'no'
        options:
          - 'yes'
          - 'no'
  push:
    branches:
      - main
32+
33+
# One concurrency group per workflow and ref (the PR head ref when present).
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
  # github.ref is the fully qualified ref (e.g. "refs/heads/main"), so it must
  # be compared against the full ref. Comparing against the bare branch name
  # "main" is always true and would cancel in-progress runs on main as well.
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
36+
37+
jobs:
  Tests:
    strategy:
      # Let every matrix entry run to completion so results for all builds are
      # visible even when one of them fails.
      fail-fast: false
      matrix:
        # Each entry describes one runner pool. `pretty_name` doubles as the
        # job's display name and as the build key handed to build.py, and
        # `bazel_arch_dir` / `platform` select the binary and HLO test to run.
        job_info:
          - pool: "linux-x86-n2-16"
            container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest"
            pretty_name: "XLA Linux x86 CPU 16 vcpu Presubmit"
            bazel_arch_dir: "k8-opt"
            platform: "CPU"
          - pool: "linux-arm64-c4a-16"
            container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build-arm64:latest"
            pretty_name: "XLA Linux ARM64 CPU 16 vcpu Presubmit"
            bazel_arch_dir: "aarch64-opt"
            platform: "CPU"
          - pool: "linux-x86-n2-128"
            container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest"
            pretty_name: "XLA Linux x86 CPU 128 vcpu Presubmit"
            bazel_arch_dir: "k8-opt"
            platform: "CPU"
          # NOTE(review): the pool name says "l4" while pretty_name says "T4".
          # pretty_name is passed to build.py as the build key, so it is left
          # untouched here -- confirm which GPU this pool actually provides.
          - pool: "linux-x86-g2-16-l4-1gpu"
            container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest"
            pretty_name: "XLA Linux x86 GPU T4 16 vcpu Presubmit"
            bazel_arch_dir: "k8-opt"
            platform: "GPU"
    name: ${{ matrix.job_info.pretty_name }}
    runs-on: ${{ matrix.job_info.pool }}
    container: ${{ matrix.job_info.container }}
    defaults:
      run:
        shell: bash
    # Applies to the whole job, including the build step.
    timeout-minutes: 10
80+
steps:
# Log the runner's hardware and OS details for later reference.
- name: Print machine specs
  run: |
    # CPU information
    lscpu
    # Memory information
    free -h
    # Disk space information
    df -h
    # Kernel information
    uname -a
87+
88+
# Log the GitHub context this run was started with.
#
# Context values are passed to the script as environment variables instead of
# being interpolated into the script text with ${{ }}. Interpolation happens
# before bash parses the script, so values such as the event JSON (which holds
# commit messages) or a PR branch name could break the quoting or inject
# commands if they contain `"`, `$(...)` or backticks.
- name: Print GitHub Context
  env:
    GITHUB_EVENT_JSON: ${{ toJSON(github.event) }}
    PR_NUMBER: ${{ github.event.pull_request.number }}
    PR_HEAD_REF: ${{ github.event.pull_request.head.ref }}
    PR_BASE_REF: ${{ github.event.pull_request.base.ref }}
  run: |
    # The GITHUB_* variables below are default variables set by the runner.
    echo "GitHub SHA: ${GITHUB_SHA}"
    echo "GitHub Ref: ${GITHUB_REF}"
    echo "GitHub Ref Name: ${GITHUB_REF_NAME}"
    echo "GitHub Repository: ${GITHUB_REPOSITORY}"
    echo "GitHub Run ID: ${GITHUB_RUN_ID}"
    echo "GitHub Run Number: ${GITHUB_RUN_NUMBER}"
    echo "GitHub Workflow: ${GITHUB_WORKFLOW}"
    echo "GitHub Actor: ${GITHUB_ACTOR}"
    echo "GitHub Event Name: ${GITHUB_EVENT_NAME}"
    echo "GitHub Event: ${GITHUB_EVENT_JSON}"
    if [[ "${GITHUB_EVENT_NAME}" == "pull_request" ]]; then
      echo "Pull Request Number: ${PR_NUMBER}"
      echo "Pull Request Head Ref: ${PR_HEAD_REF}"
      echo "Pull Request Base Ref: ${PR_BASE_REF}"
    fi
105+
106+
# Fetch the repository; the action is pinned to a full commit SHA.
- name: Checkout OpenXLA
  uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1

# Build the targets for this matrix entry. The build key is the entry's
# pretty_name with a "_github_actions" suffix.
- name: Run build.py
  run: |
    ./build_tools/ci/build.py --build="${{ matrix.job_info.pretty_name }}_github_actions"
112+
113+
# Run the HLO test that matches this matrix entry's platform:
#   CPU: run_hlo_module on the CPU test HLO.
#   GPU: hlo_runner_main_gpu with profiling enabled, then
#        compute_gpu_device_stats_main_gpu on the dumped xspace profile.
# Any other platform value fails the step.
- name: Run HLO tests
  run: |
    bazel_arch_dir="${{ matrix.job_info.bazel_arch_dir }}"  # Bazel output dir from the matrix
    platform="${{ matrix.job_info.platform }}"
    tools_dir="./bazel-out/${bazel_arch_dir}/bin/xla/tools"

    if [[ "${platform}" == "CPU" ]]; then
      binary_path="${tools_dir}/run_hlo_module"
      test_hlo_file="xla/tools/hlo_opt/tests/cpu_hlo.hlo"
      echo "Running CPU test with binary: ${binary_path}"
      "${binary_path}" --input_format=hlo --reference_platform="" \
        --platform="${platform}" "${test_hlo_file}"
    elif [[ "${platform}" == "GPU" ]]; then
      test_hlo_file="xla/tools/hlo_opt/tests/gpu_hlo_backend.hlo"
      # Absolute path of the profile written by the runner and read back by the
      # stats tool. Held in its own variable: PWD is maintained by bash itself
      # and must not be assigned to.
      xspace_file="$(pwd)/${test_hlo_file}_xspace.pb"
      echo "Running GPU test with binary: ${tools_dir}"
      "${tools_dir}/multihost_hlo_runner/hlo_runner_main_gpu" --device_type=gpu \
        --log_output=True --use_spmd_partitioning --profile_execution=True \
        --xla_gpu_dump_xspace_to="${xspace_file}" "${test_hlo_file}"
      "${tools_dir}/compute_gpu_device_stats_main_gpu" --input="${xspace_file}"
    else
      echo "Unsupported platform: ${platform}"
      exit 1
    fi

.github/workflows/benchmark_presubmit.yml

-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ on:
2727
- 'yes'
2828
- 'no'
2929
pull_request:
30-
push:
3130
branches:
3231
- main
3332

.github/workflows/cpu_benchmarks_nightly.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ permissions:
1818
on:
1919
workflow_dispatch: # Allows manual triggering
2020
schedule:
21-
- cron: '0 */6 * * *' # Run every 6 hours (at minute 0 of hours 0, 6, 12, 18)
21+
- cron: "0 0 * * *" # Run at midnight every day
2222

2323
jobs:
2424
Tests:

.github/workflows/gpu_benchmarks_nightly.yml

+2-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ permissions:
1818
on:
1919
workflow_dispatch: # Allows manual triggering
2020
schedule:
21-
- cron: '0 */6 * * *' # Run every 6 hours (at minute 0 of hours 0, 6, 12, 18)
21+
- cron: "0 0 * * *" # Run at midnight every day
22+
2223

2324
jobs:
2425
Tests:

build_tools/ci/build.py

+1
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
)
5858
_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS = (
5959
"//xla/tools/multihost_hlo_runner:hlo_runner_main_gpu",
60+
"//xla/tools:compute_gpu_device_stats_main_gpu",
6061
)
6162
_KOKORO_ARTIFACTS_DIR = os.environ.get(
6263
"KOKORO_ARTIFACTS_DIR", "$KOKORO_ARTIFACTS_DIR"

build_tools/ci/golden_commands.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,8 @@ bazel analyze-profile profile.json.gz
4545
# END BuildType.XLA_LINUX_X86_CPU_GITHUB_ACTIONS
4646
# BEGIN BuildType.XLA_LINUX_X86_GPU_T4_16_VCPU_PRESUBMIT_GITHUB_ACTIONS
4747
nvidia-smi
48-
parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//build_tools/ci:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=false --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu
49-
bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//build_tools/ci:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=false --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu
48+
parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//build_tools/ci:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=false --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_gpu_device_stats_main_gpu
49+
bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//build_tools/ci:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=false --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_gpu_device_stats_main_gpu
5050
bazel analyze-profile profile.json.gz
5151
# END BuildType.XLA_LINUX_X86_GPU_T4_16_VCPU_PRESUBMIT_GITHUB_ACTIONS
5252
# BEGIN BuildType.XLA_LINUX_X86_GPU_T4_GITHUB_ACTIONS

xla/tools/BUILD

+14
Original file line numberDiff line numberDiff line change
@@ -802,6 +802,20 @@ xla_cc_binary(
802802
],
803803
)
804804

805+
# Binary target tagged "gpu", built from compute_gpu_device_stats_main.cc and
# depending on the :compute_gpu_device_stats target of this package.
xla_cc_binary(
806+
name = "compute_gpu_device_stats_main_gpu",
807+
srcs = ["compute_gpu_device_stats_main.cc"],
808+
tags = ["gpu"],
809+
deps = [
810+
":compute_gpu_device_stats",
811+
"//xla:debug_options_flags",
812+
"//xla/tsl/util:command_line_flags",
813+
"@com_google_absl//absl/log",
814+
"@com_google_absl//absl/strings",
815+
"@tsl//tsl/platform:platform_port",
816+
],
817+
)
818+
805819
tsl_pybind_extension(
806820
name = "collective_perf_table_gen_bindings",
807821
srcs = ["collective_perf_table_gen_bindings.cc"],

0 commit comments

Comments
 (0)