@@ -37,7 +37,6 @@ permissions:
37
37
packages : write # to upload container
38
38
39
39
jobs :
40
-
41
40
build-base :
42
41
uses : ./.github/workflows/_build_base.yaml
43
42
with :
66
65
URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }}
67
66
secrets : inherit
68
67
69
- build-triton :
70
- needs : build-jax
71
- if : inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64
72
- uses : ./.github/workflows/_build.yaml
73
- with :
74
- ARCHITECTURE : ${{ inputs.ARCHITECTURE }}
75
- ARTIFACT_NAME : artifact-triton-build
76
- BADGE_FILENAME : badge-triton-build
77
- BUILD_DATE : ${{ inputs.BUILD_DATE }}
78
- BASE_IMAGE : ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
79
- CONTAINER_NAME : triton
80
- DOCKERFILE : .github/container/Dockerfile.triton
81
- RUNNER_SIZE : large
82
- EXTRA_BUILD_ARGS : |
83
- URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }}
84
- secrets : inherit
85
-
86
68
build-equinox :
87
69
needs : build-jax
88
70
uses : ./.github/workflows/_build.yaml
@@ -176,22 +158,35 @@ jobs:
176
158
URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }}
177
159
secrets : inherit
178
160
161
# Build the AXLearn container through the shared reusable build workflow,
# using the mealkit image produced by build-jax as its base image.
build-axlearn:
  uses: ./.github/workflows/_build.yaml
  needs: build-jax
  secrets: inherit
  with:
    # Image identity and the Dockerfile it is built from.
    CONTAINER_NAME: axlearn
    DOCKERFILE: .github/container/Dockerfile.axlearn
    BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
    # Build parameters forwarded from this workflow's inputs.
    ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
    BUILD_DATE: ${{ inputs.BUILD_DATE }}
    RUNNER_SIZE: large
    # Artifact and badge names handed to the build workflow.
    ARTIFACT_NAME: artifact-axlearn-build
    BADGE_FILENAME: badge-axlearn-build
174
+
179
175
collect-docker-tags :
180
176
runs-on : ubuntu-22.04
181
- if : " !cancelled()"
177
+ if : ${{ !cancelled() }}
182
178
needs :
183
179
- build-base
184
180
- build-jax
185
- - build-triton
186
181
- build-equinox
187
182
- build-maxtext
188
183
- build-levanter
189
184
- build-upstream-t5x
190
185
- build-rosetta-t5x
191
186
- build-gemma
187
+ - build-axlearn
192
188
outputs :
193
189
TAGS : ${{ steps.collect-tags.outputs.TAGS }}
194
-
195
190
steps :
196
191
- name : Save docker tags as a JSON object
197
192
id : collect-tags
@@ -200,7 +195,21 @@ jobs:
200
195
[\
201
196
{"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\
202
197
{"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
198
+ {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\
199
+ {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\
200
+ {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\
201
+ {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\
202
+ {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\
203
+ {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\
204
+ {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\
203
205
{"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
206
+ {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\
207
+ {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\
208
+ {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\
209
+ {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
210
+ {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
211
+ {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\
212
+ {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\
204
213
{}\
205
214
]
206
215
EOF
@@ -399,9 +408,8 @@ jobs:
399
408
runs-on : eks
400
409
env :
401
410
JAX_DOCKER_IMAGE : ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
402
- JOB_NAME : ${{ github.run_id }}-${{ github.run_attempt }}-jax
403
- POSTPROCESS_JOB_NAME : ${{ github.run_id }}-${{ github.run_attempt }}-postprocess
404
- TOKEN_NAME : ${{ github.run_id }}-${{ github.run_attempt }}-token
411
+ JOB_NAME : ${{ github.run_id }}-nsys-jax
412
+ POSTPROCESS_JOB_NAME : ${{ github.run_id }}-nsys-jax-postprocess
405
413
steps :
406
414
- name : Check out the repository
407
415
uses : actions/checkout@v4
@@ -411,59 +419,37 @@ jobs:
411
419
registry : ghcr.io
412
420
username : ${{ github.repository_owner }}
413
421
password : ${{ secrets.GITHUB_TOKEN }}
414
- - name : Store GitHub Container Registry token as Kubernetes secret
415
- run : |
416
- kubectl create secret generic \
417
- ${{ github.run_id }}-${{ github.run_attempt }}-token \
418
- --from-file=.dockerconfigjson=$HOME/.docker/config.json \
419
- --type=kubernetes.io/dockerconfigjson
422
+ - name : K8s GHCR store and delete token
423
+ id : store-token
424
+ uses : ./.github/actions/store-delete-k8s-ghcr
420
425
- name : Configure Kubernetes job
421
426
run : |
422
427
yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
423
428
| select(di == 1).metadata.name = strenv(JOB_NAME)
424
- | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
429
+ | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
425
430
| select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
426
431
| select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \
427
432
.github/eks-workflow-files/job.yml
428
433
git diff .github/eks-workflow-files/job.yml
429
434
- name : Submit Kubernetes job
430
- run : kubectl apply -f .github/eks-workflow-files/job.yml
431
- - name : Wait for Kubernetes job to start
432
- run : |
433
- while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-jax --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
434
- sleep 2
435
- done
436
- - name : Stream Kubernetes job output
437
- run : kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-jax
438
- # Clean up in case of errors as well as success
439
- - name : Delete Kubernetes job
440
- if : always()
441
- run : kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-jax
435
+ uses : ./.github/actions/submit-delete-k8s-job
436
+ with :
437
+ job-config-file : .github/eks-workflow-files/job.yml
438
+ job-name : ${{ env.JOB_NAME }}
442
439
- name : Configure post-processing job
443
440
run : |
444
441
export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip"
445
442
yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME)
446
443
| .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE)
447
- | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
444
+ | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
448
445
| .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \
449
446
.github/eks-workflow-files/post-process-job.yml
450
447
git diff .github/eks-workflow-files/post-process-job.yml
451
- - name : Submit post-processing Kubernetes job
452
- run : kubectl apply -f .github/eks-workflow-files/post-process-job.yml
453
- - name : Wait for post-processing Kubernetes job to start
454
- run : |
455
- while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
456
- sleep 2
457
- done
458
- - name : Stream post-processing Kubernetes job output
459
- run : kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-postprocess
460
- # Clean up in case of errors as well as success
461
- - name : Delete post-processing Kubernetes job
462
- if : always()
463
- run : kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-postprocess
464
- - name : Delete GitHub Container Registry token
465
- if : always()
466
- run : kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token
448
+ - name : Submit post process Kubernetes job
449
+ uses : ./.github/actions/submit-delete-k8s-job
450
+ with :
451
+ job-config-file : .github/eks-workflow-files/post-process-job.yml
452
+ job-name : ${{ env.POSTPROCESS_JOB_NAME }}
467
453
468
454
# test-equinox:
469
455
# needs: build-equinox
@@ -663,3 +649,136 @@ jobs:
663
649
with :
664
650
MAXTEXT_IMAGE : ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
665
651
secrets : inherit
652
+
653
# Run the AXLearn test job on the EKS runner (amd64 only), download its
# summary and logs from S3, and publish a sitrep, a badge and the logs as a
# workflow artifact.
test-axlearn-eks:
  needs: build-axlearn
  if: inputs.ARCHITECTURE == 'amd64'
  runs-on: eks
  env:
    AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
    JOB_NAME: axlearn-${{ github.run_id }}
  steps:
    - name: Check out the repository
      uses: actions/checkout@v4
    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}
    - name: K8s GHCR store and delete token
      id: store-token
      uses: ./.github/actions/store-delete-k8s-ghcr
    - name: Configure axlearn test job
      run: |
        # Replace placeholders in axlearn-job.yml with environment variables
        yq -i ea '
          select(di == 0).metadata.name = strenv(JOB_NAME)
          | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
          | select(di == 0).spec.template.spec.containers[1].env[0].value = "${{ github.run_id }}"
          | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
        .github/eks-workflow-files/axlearn/axlearn-job.yml
        # Show the substituted values in the job log
        git diff .github/eks-workflow-files/axlearn/axlearn-job.yml
    - name: Submit & delete axlearn test
      uses: ./.github/actions/submit-delete-k8s-job
      with:
        job-config-file: .github/eks-workflow-files/axlearn/axlearn-job.yml
        job-name: ${{ env.JOB_NAME }}
    - name: Download logs from S3
      id: log-s3
      run: |
        mkdir -p axlearn-output
        aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt axlearn-output/
        aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/ axlearn-output/ --recursive --exclude "*" --include "*.log"

        # grep -c exits non-zero when it counts zero matches; `|| true` keeps
        # the step from aborting in that case.
        passed_tests=$(grep -c ": PASSED" axlearn-output/summary.txt || true)
        failed_tests=$(grep -c ": FAILED" axlearn-output/summary.txt || true)
        total_tests=$((failed_tests + passed_tests))

        echo "Passed tests: $passed_tests"
        echo "Failed tests: $failed_tests"
        echo "Total tests: $total_tests"
        echo "PASSED_TESTS=$passed_tests" >> "$GITHUB_OUTPUT"
        echo "FAILED_TESTS=$failed_tests" >> "$GITHUB_OUTPUT"
        echo "TOTAL_TESTS=$total_tests" >> "$GITHUB_OUTPUT"
    - name: Generate sitrep
      id: sitrep
      if: ${{ !cancelled() }}
      shell: bash -x -e {0}
      run: |
        # bring in utility functions
        source .github/workflows/scripts/to_json.sh

        badge_label='Axlearn EKS Unit'

        total_tests="${{ steps.log-s3.outputs.TOTAL_TESTS }}"
        failed_tests="${{ steps.log-s3.outputs.FAILED_TESTS }}"
        passed_tests="${{ steps.log-s3.outputs.PASSED_TESTS }}"
        errors=0

        # This step also runs when an earlier step failed. In that case the
        # log-s3 outputs are empty: report an error instead of a green badge.
        if [ -z "$total_tests" ]; then
          total_tests=0
          errors=1
        fi
        failed_tests="${failed_tests:-0}"
        passed_tests="${passed_tests:-0}"

        summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests."
        badge_message="Passed $passed_tests out of $total_tests."
        badge_color="brightgreen"
        if [ "$failed_tests" -gt 0 ] || [ "$errors" -gt 0 ]; then
          badge_color="red"
        fi

        # to_json comes from to_json.sh and is given variable names;
        # presumably it serializes them to JSON (confirm in the script).
        to_json \
          summary \
          errors total_tests passed_tests failed_tests \
          badge_label badge_color badge_message \
        > sitrep.json

        schemaVersion=1 \
        label="${badge_label}" \
        message="${badge_message}" \
        color=$badge_color \
        to_json schemaVersion label message color \
        > badge-axlearn-test.json

    - name: Upload artifacts
      if: ${{ !cancelled() }}
      uses: actions/upload-artifact@v4
      with:
        name: artifact-axlearn-test
        path: |
          sitrep.json
          badge-axlearn-test.json
          axlearn-output/*
747
+
748
# The fuji test runs for only 20 minutes: as of 2025-02-24 it is not possible
# to set the `max_steps` value. This will be handled with custom Python code.
test-axlearn-fuji-models-eks:
  needs: build-axlearn
  if: inputs.ARCHITECTURE == 'amd64'
  runs-on: eks
  env:
    AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
    JOB_NAME: axlearn-fuji-3b-${{ github.run_id }}
  steps:
    - name: Check out the repository
      uses: actions/checkout@v4
    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}
    - name: K8s GHCR store and delete token
      id: store-token
      uses: ./.github/actions/store-delete-k8s-ghcr
    - name: Configure axlearn test job
      run: |
        # Fill in the job name, container image and image pull secret
        yq -i ea '
          select(di == 0).metadata.name = strenv(JOB_NAME)
          | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
          | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
        .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
        # Show the substituted values in the job log
        git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml

    - name: Submit & delete axlearn test
      uses: ./.github/actions/submit-delete-k8s-job
      with:
        # Plain scalar: no quoting needed, and no stray leading whitespace
        job-config-file: .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
        job-name: ${{ env.JOB_NAME }}
784
+
0 commit comments