# _transformer_engine_eks.yaml
# Reusable workflow (triggered only through `workflow_call`) for the
# TransformerEngine tests on the EKS cluster. All four inputs are required
# strings supplied by the calling workflow.
name: ~test TransformerEngine tests on EKS cluster (8xH100)

on:
  workflow_call:
    inputs:
      JAX_IMAGE:
        type: string
        description: 'URI of image to run tests on'
        required: true
      JOB_NAME:
        type: string
        description: 'Job name identifying the unique GitHub Actions run'
        required: true
      S3_BUCKET:
        type: string
        description: 'AWS S3 bucket to which logs will be uploaded for processing in CI'
        required: true
      CI_NAME:
        type: string
        description: 'Name of the CI'
        required: true
# GITHUB_TOKEN scopes granted to the jobs of this workflow.
permissions:
  contents: read  # to fetch code
  actions: write  # to cancel previous workflows
  packages: write  # to upload container
jobs:
  # Renders a Kubernetes job manifest from a template, submits it through the
  # repository's local actions, then pulls the pytest report from S3 and turns
  # it into a sitrep/badge pair that is uploaded as an artifact.
  te-test-eks:
    env:
      # Directory holding the manifest template and the per-test scripts.
      TE_EKS_FILES_PREFIX: .github/eks-workflow-files/transformer-engine
      # Used as the K8s job name and as the S3 sub-folder for this run.
      RUN_NAME: ${{ inputs.JOB_NAME }}-${{ matrix.N_GPU }}gpu-${{ matrix.TEST }}
      BADGE_SUFFIX: "${{ matrix.TEST }} (${{ matrix.N_GPU }})"
    strategy:
      fail-fast: true
      matrix:
        include:
          - TEST: unittest
            N_GPU: 8
    runs-on: [eks]  # cluster with H100 nodes
    steps:
      - name: Check out the repository
        uses: actions/checkout@v4

      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Local action; its `token-name` output is substituted into the
      # manifest below as SECRETS_NAME.
      - name: K8s GHCR store and delete token
        id: store-token
        uses: ./.github/actions/store-delete-k8s-ghcr

      - name: Configure job manifest
        run: |
          set -x
          K8S_JOB_TEMPLATE="${{ env.TE_EKS_FILES_PREFIX }}/test.yml"
          K8S_JOB_MANIFEST="${{ env.TE_EKS_FILES_PREFIX }}/${{ matrix.TEST }}.yml"
          SCRIPT="${{ env.TE_EKS_FILES_PREFIX }}/scripts/${{ matrix.TEST }}.sh"

          if ! [ -f "$K8S_JOB_TEMPLATE" ]; then
            echo "Template manifest does not exist at $K8S_JOB_TEMPLATE"
            exit 1
          fi
          # sed's `r` command silently ignores a missing file, which would
          # yield a manifest without any test script; fail loudly instead.
          if ! [ -f "$SCRIPT" ]; then
            echo "Test script does not exist at $SCRIPT"
            exit 1
          fi

          # paste in script to manifest from file
          # update placeholder values
          # `|` is the substitution delimiter because image URIs may contain
          # `@` (digest-pinned references) as well as `/` and `:`.
          sed '/SCRIPT/ {
            r '"$SCRIPT"'
            d
          }' "$K8S_JOB_TEMPLATE" | \
          sed "s|JOB_NAME|${{ env.RUN_NAME }}|g" | \
          sed "s|IMAGE_URI|${{ inputs.JAX_IMAGE }}|g" | \
          sed "s|SECRETS_NAME|${{ steps.store-token.outputs.token-name }}|g" | \
          sed "s|N_GPU|${{ matrix.N_GPU }}|g" | \
          tee "$K8S_JOB_MANIFEST"

      - name: Submit & delete transformer engine unit test job
        uses: ./.github/actions/submit-delete-k8s-job
        with:
          job-config-file: ${{ env.TE_EKS_FILES_PREFIX }}/${{ matrix.TEST }}.yml
          job-name: ${{ env.RUN_NAME }}

      - name: Download and process logs from S3
        id: s3-logs-process
        run: |
          LOCAL_DIR="${{ inputs.CI_NAME }}-output"
          REPORT_JSON=pytest-report.jsonl

          mkdir -p "$LOCAL_DIR"
          aws s3 cp "s3://${{ inputs.S3_BUCKET }}/${{ inputs.CI_NAME }}/${{ env.RUN_NAME }}/" "$LOCAL_DIR/" --recursive

          # Count report records by outcome: one output line per matching
          # record, so `wc -l` yields the count (0 if the report is missing).
          passed_tests=$(jq --slurp '.[] | select(.outcome == "passed") | .outcome' "$LOCAL_DIR/$REPORT_JSON" | wc -l || true)
          failed_tests=$(jq --slurp '.[] | select(.outcome == "failed") | .outcome' "$LOCAL_DIR/$REPORT_JSON" | wc -l || true)
          total_tests=$((failed_tests + passed_tests))

          echo "Passed tests: $passed_tests"
          echo "Failed tests: $failed_tests"
          echo "Total tests: $total_tests"
          echo "PASSED_TESTS=$passed_tests" >> "$GITHUB_OUTPUT"
          echo "FAILED_TESTS=$failed_tests" >> "$GITHUB_OUTPUT"
          echo "TOTAL_TESTS=$total_tests" >> "$GITHUB_OUTPUT"

      - name: Generate sitrep
        id: sitrep
        if: ${{ !cancelled() }}
        shell: bash -x -e {0}
        run: |
          # bring in utility functions
          source .github/workflows/scripts/to_json.sh

          badge_label='TransformerEngine EKS ${{ env.BADGE_SUFFIX }}'

          # These are empty when the log-processing step did not run.
          total_tests="${{ steps.s3-logs-process.outputs.TOTAL_TESTS }}"
          failed_tests="${{ steps.s3-logs-process.outputs.FAILED_TESTS }}"
          passed_tests="${{ steps.s3-logs-process.outputs.PASSED_TESTS }}"
          errors="0"
          summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests."
          badge_message="Passed $passed_tests out of $total_tests."
          badge_color="brightgreen"
          # Default to 0 for the comparison only, so an empty value does not
          # trigger an "integer expression expected" error.
          if [ "${failed_tests:-0}" -gt 0 ]; then
            badge_color="red"
          fi

          # to_json comes from the sourced script and is given variable
          # names; presumably it serialises those variables -- confirm there.
          to_json \
            summary \
            errors total_tests passed_tests failed_tests \
            badge_label badge_color badge_message \
          > sitrep.json

          # The assignments prefix the to_json call, so they are scoped to
          # that single invocation.
          schemaVersion=1 \
          label="${badge_label}" \
          message="Passed $passed_tests out of $total_tests." \
          color="$badge_color" \
          to_json schemaVersion label message color \
          > badge-transformer-engine-test.json

      - name: Upload artifacts
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: "artifact-multigpu-test-${{ env.RUN_NAME }}"
          # The last entry must match LOCAL_DIR of the S3 download step.
          path: |
            sitrep.json
            badge-transformer-engine-test.json
            ${{ inputs.CI_NAME }}-output/*