-
Notifications
You must be signed in to change notification settings - Fork 321
130 lines (118 loc) · 4.65 KB
/
RunTests.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This workflow builds TPU and GPU test images, runs the unit and integration tests on each device type, then deletes the images and reports failed non-PR runs.
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
name: Tests

# Triggers: every pull request, pushes to main, manual dispatch, and a
# recurring schedule.
on:
  pull_request:
  push:
    branches:
      - main
  workflow_dispatch:
  schedule:
    # Fires at minute 0 of every 4th hour.
    - cron: "0 */4 * * *"
jobs:
prelim:
  # Preflight on the self-hosted runner: confirm gsutil is available, then
  # prune unused Docker data before any image is built.
  runs-on:
    - self-hosted
  steps:
    - name: Test gsutil installation
      # Exits with status 24 when gsutil cannot be found on PATH.
      run: >-
        which gsutil >/dev/null 2>&1 || { echo >&2 "gsutil is required but not installed. Aborting"; exit 24;}
    - name: Cleanup old docker images
      run: >-
        docker system prune --all --force
tpu_image:
  # Calls the reusable image build/upload workflow with the TPU settings,
  # after the prelim job has completed.
  uses: ./.github/workflows/build_upload_internal.yml
  needs:
    - prelim
  with:
    build_mode: stable_stack
    base_image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/tpu:latest
    device_type: tpu
    device_name: v4-8
gpu_image:
  # Calls the reusable image build/upload workflow with the GPU settings,
  # after the prelim job has completed.
  uses: ./.github/workflows/build_upload_internal.yml
  needs:
    - prelim
  with:
    build_mode: pinned
    base_image: gcr.io/tpu-prod-env-multipod/maxtext_gpu_jax_pinned:latest
    device_type: gpu
    device_name: a100-40gb-4
tpu_unit_tests:
  # TPU unit tests: everything under tests/ that is neither GPU-only nor
  # marked as an integration test. Waits for the TPU image job.
  uses: ./.github/workflows/run_tests_internal.yml
  needs:
    - tpu_image
  with:
    device_type: tpu
    device_name: v4-8
    test_directory: "tests"
    pytest_marker: "not gpu_only and not integration_test"
    container_resource_option: '--privileged'
    xla_python_client_mem_fraction: 0.75
    tf_force_gpu_allow_growth: false
tpu_integration_tests:
  # TPU integration tests: tests under tests/integration_tests carrying the
  # integration_test marker, excluding GPU-only ones. Waits for the TPU image job.
  uses: ./.github/workflows/run_tests_internal.yml
  needs:
    - tpu_image
  with:
    device_type: tpu
    device_name: v4-8
    test_directory: "tests/integration_tests"
    pytest_marker: "not gpu_only and integration_test"
    container_resource_option: '--privileged'
    xla_python_client_mem_fraction: 0.75
    tf_force_gpu_allow_growth: false
gpu_unit_tests:
  # GPU unit tests: everything under tests/ that is neither TPU-only nor
  # marked as an integration test. Waits for the GPU image job.
  uses: ./.github/workflows/run_tests_internal.yml
  needs:
    - gpu_image
  with:
    device_type: gpu
    device_name: a100-40gb-4
    test_directory: "tests"
    pytest_marker: "not tpu_only and not integration_test"
    container_resource_option: '--shm-size 2g --runtime=nvidia --gpus all --privileged'
    xla_python_client_mem_fraction: 0.65
    tf_force_gpu_allow_growth: true
gpu_integration_tests:
  # GPU integration tests: tests under tests/integration_tests carrying the
  # integration_test marker, excluding TPU-only ones. Waits for the GPU image job.
  uses: ./.github/workflows/run_tests_internal.yml
  needs:
    - gpu_image
  with:
    device_type: gpu
    device_name: a100-40gb-4
    test_directory: "tests/integration_tests"
    pytest_marker: "not tpu_only and integration_test"
    container_resource_option: '--shm-size 2g --runtime=nvidia --gpus all --privileged'
    xla_python_client_mem_fraction: 0.65
    tf_force_gpu_allow_growth: true
clean_up:
  # Deletes the per-run GPU and TPU images (tagged with github.run_id) once all
  # four test jobs have finished, whatever their outcome.
  if: ${{ always() }}  # always execute, regardless of previous jobs or steps.
  needs: [gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests]
  name: "Clean up"
  runs-on: ["self-hosted"]
  # This job only runs gcloud; it never touches GitHub issues, so the token is
  # kept read-only here.
  permissions:
    contents: read
  steps:
    - name: Delete GPU image
      run: gcloud container images delete gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:gpu --force-delete-tags --quiet
    - name: Delete TPU image
      # Steps are skipped by default once an earlier step fails; always() keeps
      # a failed GPU deletion (e.g. the image was never built) from leaking the
      # TPU image.
      if: ${{ always() }}
      run: gcloud container images delete gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:tpu --force-delete-tags --quiet
notify:
  # Creates an issue, or modifies the last open existing issue, when a non-PR
  # run has a failed job; otherwise just logs that nothing was done.
  name: Notify failed build
  # prelim and the image jobs are listed as well: when one of them fails the
  # test jobs end up skipped rather than failed, and the check below would
  # otherwise miss it.
  needs: [prelim, tpu_image, gpu_image, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests]
  # A job with `needs` is skipped as soon as any needed job fails unless its
  # condition contains a status-check function, so without this line the
  # failure notification could never run.
  if: ${{ !cancelled() }}
  runs-on: ["self-hosted"]
  permissions:
    contents: read
    issues: write  # for failed-build-issue
  steps:
    - name: Check whether one of the jobs failed
      # Inspect the results of the needed jobs explicitly instead of relying on
      # failure(), and never file issues for pull-request runs.
      if: ${{ contains(needs.*.result, 'failure') && github.event.pull_request == null }}
      uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b  # v1.2.0
      with:
        github-token: ${{ secrets.GITHUB_TOKEN }}
    - name: Log message if dependent job succeeded
      # Exact negation of the condition above, so exactly one of the two steps runs.
      if: ${{ !(contains(needs.*.result, 'failure') && github.event.pull_request == null) }}
      run: echo "Conditions for creating/updating issue not met. Skipping."