22
22
type : string
23
23
description : ' Name of the endpoint JSON file for shields.io badge'
24
24
required : false
25
- default : 'badge-t5x-fmha-mgmn-test.json'
25
+ default : 'badge-fmha-t5x-mgmn-test.json'
26
26
ARTIFACT_NAME :
27
27
type : string
28
28
description : ' Name of the artifact zip file'
29
29
required : false
30
- default : 'artifact-t5x-fmha-mgmn-test'
30
+ default : 'artifact-fmha-t5x-mgmn-test'
31
31
FW_NAME :
32
32
type : string
33
33
description : ' Name of the framework being used'
39
39
value : ${{ jobs.sitrep.outputs.STATUS }}
40
40
41
41
jobs :
42
- t5x-fmha-multi-gpu :
42
+ fmha-t5x-multi-gpu :
43
43
strategy :
44
44
matrix :
45
45
N_GPU : [1, 2, 4, 8]
46
46
fail-fast : false
47
47
48
48
runs-on : ubuntu-22.04
49
49
env :
50
- BADGE_FILENAME_FULL : t5x-fmha-multi-gpu
50
+ BADGE_FILENAME_FULL : fmha-t5x-multi-gpu
51
51
steps :
52
52
- name : Print environment variables
53
53
run : env
76
76
run : |
77
77
IMAGE="$(echo ${{inputs.T5X_IMAGE}} | sed 's/\//#/')"
78
78
TEST_CASE_NAME=1P${{ matrix.N_GPU }}G
79
- JOB_NAME=${{ inputs.FW_NAME }}-FMHA-${GITHUB_RUN_ID}-${TEST_CASE_NAME}
79
+ JOB_NAME=FMHA-${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-${TEST_CASE_NAME}
80
80
LOG_FILE=/nfs/cluster/${JOB_NAME}.log
81
81
MODEL_PATH=/nfs/cluster/${JOB_NAME}
82
82
BATCH_SIZE=$((${{ inputs.BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }}))
@@ -153,7 +153,7 @@ jobs:
153
153
output/ || true
154
154
rsync -rtz --progress \
155
155
output/ \
156
- ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.FW_NAME }}-FMHA-${GITHUB_RUN_ID}/ || true
156
+ ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/FMHA-${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}/ || true
157
157
158
158
- name : Write SLURM job status to file
159
159
shell : bash -x -e {0}
@@ -181,15 +181,15 @@ jobs:
181
181
if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
182
182
badge_message='error'
183
183
badge_color=red
184
- summary="T5x FMHA Multi GPU ${{ steps.meta.outputs.TEST_CASE_NAME }}: $badge_message"
184
+ summary="FMHA T5x Multi GPU ${{ steps.meta.outputs.TEST_CASE_NAME }}: $badge_message"
185
185
else
186
186
badge_message="${passed_tests}/${total_tests} passed"
187
187
if [[ ${failed_tests} == 0 ]]; then
188
188
badge_color=brightgreen
189
189
else
190
190
badge_color=yellow
191
191
fi
192
- summary="T5x FMHA Multi GPU ${{ steps.meta.outputs.TEST_CASE_NAME }}: $badge_message"
192
+ summary="FMHA T5x Multi GPU ${{ steps.meta.outputs.TEST_CASE_NAME }}: $badge_message"
193
193
fi
194
194
195
195
to_json \
@@ -211,7 +211,7 @@ jobs:
211
211
name : ${{ steps.meta.outputs.JOB_NAME }}
212
212
path : output/*
213
213
214
- t5x-fmha-multi-node :
214
+ fmha-t5x-multi-node :
215
215
strategy :
216
216
matrix :
217
217
N_GPU : [1, 2, 4, 8]
@@ -220,7 +220,7 @@ jobs:
220
220
221
221
runs-on : ubuntu-22.04
222
222
env :
223
- BADGE_FILENAME_FULL : t5x-fmha-multi-node
223
+ BADGE_FILENAME_FULL : fmha-t5x-multi-node
224
224
steps :
225
225
- name : Print environment variables
226
226
run : env
@@ -250,7 +250,7 @@ jobs:
250
250
IMAGE="$(echo ${{inputs.T5X_IMAGE}} | sed 's/\//#/')"
251
251
TEST_CASE_NAME=${{ matrix.N_GPU }}G${{ matrix.N_NODE }}N
252
252
TOTAL_TASKS=$((${{ matrix.N_GPU }} * ${{ matrix.N_NODE }}))
253
- JOB_NAME=${{ inputs.FW_NAME }}-FMHA-${GITHUB_RUN_ID}-${TEST_CASE_NAME}
253
+ JOB_NAME=FMHA-${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-${TEST_CASE_NAME}
254
254
LOG_FILE=/nfs/cluster/${JOB_NAME}.log
255
255
MODEL_PATH=/nfs/cluster/${JOB_NAME}
256
256
BATCH_SIZE=$((${{ inputs.BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }} * ${{ matrix.N_NODE }}))
@@ -330,7 +330,7 @@ jobs:
330
330
output/ || true
331
331
rsync -rtz --progress \
332
332
output/ \
333
- ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.FW_NAME }}-FMHA-${GITHUB_RUN_ID}/ || true
333
+ ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/FMHA-${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}/ || true
334
334
335
335
- name : Write SLURM job status to file
336
336
shell : bash -x -e {0}
@@ -358,15 +358,15 @@ jobs:
358
358
if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
359
359
badge_message='error'
360
360
badge_color=red
361
- summary="T5x FMHA Multi NODE ${{ steps.meta.outputs.TEST_CASE_NAME }}: $badge_message"
361
+ summary="FMHA T5x Multi NODE ${{ steps.meta.outputs.TEST_CASE_NAME }}: $badge_message"
362
362
else
363
363
badge_message="${passed_tests}/${total_tests} passed"
364
364
if [[ ${failed_tests} == 0 ]]; then
365
365
badge_color=brightgreen
366
366
else
367
367
badge_color=yellow
368
368
fi
369
- summary="T5x FMHA Multi NODE ${{ steps.meta.outputs.TEST_CASE_NAME }}: $badge_message"
369
+ summary="FMHA T5x Multi NODE ${{ steps.meta.outputs.TEST_CASE_NAME }}: $badge_message"
370
370
fi
371
371
372
372
to_json \
@@ -389,7 +389,7 @@ jobs:
389
389
path : output/*
390
390
391
391
metrics :
392
- needs : [t5x-fmha-multi-node, t5x-fmha-multi-gpu]
392
+ needs : [fmha-t5x-multi-node, fmha-t5x-multi-gpu]
393
393
runs-on : ubuntu-22.04
394
394
395
395
steps :
@@ -422,7 +422,7 @@ jobs:
422
422
423
423
summary :
424
424
runs-on : ubuntu-22.04
425
- needs : [t5x-fmha-multi-node, t5x-fmha-multi-gpu]
425
+ needs : [fmha-t5x-multi-node, fmha-t5x-multi-gpu]
426
426
if : " !cancelled()"
427
427
steps :
428
428
- name : Generate TensorBoard query URL
@@ -432,18 +432,18 @@ jobs:
432
432
433
433
## T5X MGMN training
434
434
435
- [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=${{ inputs.FW_NAME }}-FMHA-${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per)
435
+ [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=FMHA-${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per)
436
436
437
437
EOF
438
438
) | tee $GITHUB_STEP_SUMMARY
439
439
440
440
outcome :
441
- needs : [t5x-fmha-multi-node, t5x-fmha-multi-gpu]
441
+ needs : [fmha-t5x-multi-node, fmha-t5x-multi-gpu]
442
442
runs-on : ubuntu-22.04
443
443
if : " !cancelled()"
444
444
steps :
445
445
- name : Sets workflow status based on test outputs
446
446
run : |
447
- if [[ ${{ needs.sitrep.outputs.STATUS }} != 'success' ]]; then
447
+ if [[ ${{ needs.metrics.outputs.STATUS }} != 'success' ]]; then
448
448
exit 1
449
449
fi
0 commit comments