Skip to content

Commit a1cc0d1

Browse files
authored
Update _test_t5x_fmha.yaml
1 parent 609e4a6 commit a1cc0d1

File tree

1 file changed

+19
-19
lines changed

1 file changed

+19
-19
lines changed

.github/workflows/_test_t5x_fmha.yaml

+19-19
Original file line number | Diff line number | Diff line change
@@ -22,12 +22,12 @@ on:
2222
type: string
2323
description: 'Name of the endpoint JSON file for shields.io badge'
2424
required: false
25-
default: 'badge-t5x-fmha-mgmn-test.json'
25+
default: 'badge-fmha-t5x-mgmn-test.json'
2626
ARTIFACT_NAME:
2727
type: string
2828
description: 'Name of the artifact zip file'
2929
required: false
30-
default: 'artifact-t5x-fmha-mgmn-test'
30+
default: 'artifact-fmha-t5x-mgmn-test'
3131
FW_NAME:
3232
type: string
3333
description: 'Name of the framework being used'
@@ -39,15 +39,15 @@ on:
3939
value: ${{ jobs.sitrep.outputs.STATUS }}
4040

4141
jobs:
42-
t5x-fmha-multi-gpu:
42+
fmha-t5x-multi-gpu:
4343
strategy:
4444
matrix:
4545
N_GPU: [1, 2, 4, 8]
4646
fail-fast: false
4747

4848
runs-on: ubuntu-22.04
4949
env:
50-
BADGE_FILENAME_FULL: t5x-fmha-multi-gpu
50+
BADGE_FILENAME_FULL: fmha-t5x-multi-gpu
5151
steps:
5252
- name: Print environment variables
5353
run: env
@@ -76,7 +76,7 @@ jobs:
7676
run: |
7777
IMAGE="$(echo ${{inputs.T5X_IMAGE}} | sed 's/\//#/')"
7878
TEST_CASE_NAME=1P${{ matrix.N_GPU }}G
79-
JOB_NAME=${{ inputs.FW_NAME }}-FMHA-${GITHUB_RUN_ID}-${TEST_CASE_NAME}
79+
JOB_NAME=FMHA-${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-${TEST_CASE_NAME}
8080
LOG_FILE=/nfs/cluster/${JOB_NAME}.log
8181
MODEL_PATH=/nfs/cluster/${JOB_NAME}
8282
BATCH_SIZE=$((${{ inputs.BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }}))
@@ -153,7 +153,7 @@ jobs:
153153
output/ || true
154154
rsync -rtz --progress \
155155
output/ \
156-
${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.FW_NAME }}-FMHA-${GITHUB_RUN_ID}/ || true
156+
${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/FMHA-${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}/ || true
157157
158158
- name: Write SLURM job status to file
159159
shell: bash -x -e {0}
@@ -181,15 +181,15 @@ jobs:
181181
if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
182182
badge_message='error'
183183
badge_color=red
184-
summary="T5x FMHA Multi GPU ${{ steps.meta.outputs.TEST_CASE_NAME }}: $badge_message"
184+
summary="FMHA T5x Multi GPU ${{ steps.meta.outputs.TEST_CASE_NAME }}: $badge_message"
185185
else
186186
badge_message="${passed_tests}/${total_tests} passed"
187187
if [[ ${failed_tests} == 0 ]]; then
188188
badge_color=brightgreen
189189
else
190190
badge_color=yellow
191191
fi
192-
summary="T5x FMHA Multi GPU ${{ steps.meta.outputs.TEST_CASE_NAME }}: $badge_message"
192+
summary="FMHA T5x Multi GPU ${{ steps.meta.outputs.TEST_CASE_NAME }}: $badge_message"
193193
fi
194194
195195
to_json \
@@ -211,7 +211,7 @@ jobs:
211211
name: ${{ steps.meta.outputs.JOB_NAME }}
212212
path: output/*
213213

214-
t5x-fmha-multi-node:
214+
fmha-t5x-multi-node:
215215
strategy:
216216
matrix:
217217
N_GPU: [1, 2, 4, 8]
@@ -220,7 +220,7 @@ jobs:
220220

221221
runs-on: ubuntu-22.04
222222
env:
223-
BADGE_FILENAME_FULL: t5x-fmha-multi-node
223+
BADGE_FILENAME_FULL: fmha-t5x-multi-node
224224
steps:
225225
- name: Print environment variables
226226
run: env
@@ -250,7 +250,7 @@ jobs:
250250
IMAGE="$(echo ${{inputs.T5X_IMAGE}} | sed 's/\//#/')"
251251
TEST_CASE_NAME=${{ matrix.N_GPU }}G${{ matrix.N_NODE }}N
252252
TOTAL_TASKS=$((${{ matrix.N_GPU }} * ${{ matrix.N_NODE }}))
253-
JOB_NAME=${{ inputs.FW_NAME }}-FMHA-${GITHUB_RUN_ID}-${TEST_CASE_NAME}
253+
JOB_NAME=FMHA-${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-${TEST_CASE_NAME}
254254
LOG_FILE=/nfs/cluster/${JOB_NAME}.log
255255
MODEL_PATH=/nfs/cluster/${JOB_NAME}
256256
BATCH_SIZE=$((${{ inputs.BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }} * ${{ matrix.N_NODE }}))
@@ -330,7 +330,7 @@ jobs:
330330
output/ || true
331331
rsync -rtz --progress \
332332
output/ \
333-
${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.FW_NAME }}-FMHA-${GITHUB_RUN_ID}/ || true
333+
${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/FMHA-${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}/ || true
334334
335335
- name: Write SLURM job status to file
336336
shell: bash -x -e {0}
@@ -358,15 +358,15 @@ jobs:
358358
if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
359359
badge_message='error'
360360
badge_color=red
361-
summary="T5x FMHA Multi NODE ${{ steps.meta.outputs.TEST_CASE_NAME }}: $badge_message"
361+
summary="FMHA T5x Multi NODE ${{ steps.meta.outputs.TEST_CASE_NAME }}: $badge_message"
362362
else
363363
badge_message="${passed_tests}/${total_tests} passed"
364364
if [[ ${failed_tests} == 0 ]]; then
365365
badge_color=brightgreen
366366
else
367367
badge_color=yellow
368368
fi
369-
summary="T5x FMHA Multi NODE ${{ steps.meta.outputs.TEST_CASE_NAME }}: $badge_message"
369+
summary="FMHA T5x Multi NODE ${{ steps.meta.outputs.TEST_CASE_NAME }}: $badge_message"
370370
fi
371371
372372
to_json \
@@ -389,7 +389,7 @@ jobs:
389389
path: output/*
390390

391391
metrics:
392-
needs: [t5x-fmha-multi-node, t5x-fmha-multi-gpu]
392+
needs: [fmha-t5x-multi-node, fmha-t5x-multi-gpu]
393393
runs-on: ubuntu-22.04
394394

395395
steps:
@@ -422,7 +422,7 @@ jobs:
422422
423423
summary:
424424
runs-on: ubuntu-22.04
425-
needs: [t5x-fmha-multi-node, t5x-fmha-multi-gpu]
425+
needs: [fmha-t5x-multi-node, fmha-t5x-multi-gpu]
426426
if: "!cancelled()"
427427
steps:
428428
- name: Generate TensorBoard query URL
@@ -432,18 +432,18 @@ jobs:
432432
433433
## T5X MGMN training
434434
435-
[view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=${{ inputs.FW_NAME }}-FMHA-${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per)
435+
[view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=FMHA-${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per)
436436
437437
EOF
438438
) | tee $GITHUB_STEP_SUMMARY
439439
440440
outcome:
441-
needs: [t5x-fmha-multi-node, t5x-fmha-multi-gpu]
441+
needs: [fmha-t5x-multi-node, fmha-t5x-multi-gpu]
442442
runs-on: ubuntu-22.04
443443
if: "!cancelled()"
444444
steps:
445445
- name: Sets workflow status based on test outputs
446446
run: |
447-
if [[ ${{ needs.sitrep.outputs.STATUS }} != 'success' ]]; then
447+
if [[ ${{ needs.metrics.outputs.STATUS }} != 'success' ]]; then
448448
exit 1
449449
fi

0 commit comments

Comments
 (0)