Skip to content

Commit a5d1514

Browse files
committed
Merge main
1 parent 85e00ce commit a5d1514

File tree

1 file changed

+179
-60
lines changed

1 file changed

+179
-60
lines changed

.github/workflows/_ci.yaml

+179-60
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ permissions:
3737
packages: write # to upload container
3838

3939
jobs:
40-
4140
build-base:
4241
uses: ./.github/workflows/_build_base.yaml
4342
with:
@@ -66,23 +65,6 @@ jobs:
6665
URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }}
6766
secrets: inherit
6867

69-
build-triton:
70-
needs: build-jax
71-
if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64
72-
uses: ./.github/workflows/_build.yaml
73-
with:
74-
ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
75-
ARTIFACT_NAME: artifact-triton-build
76-
BADGE_FILENAME: badge-triton-build
77-
BUILD_DATE: ${{ inputs.BUILD_DATE }}
78-
BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
79-
CONTAINER_NAME: triton
80-
DOCKERFILE: .github/container/Dockerfile.triton
81-
RUNNER_SIZE: large
82-
EXTRA_BUILD_ARGS: |
83-
URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }}
84-
secrets: inherit
85-
8668
build-equinox:
8769
needs: build-jax
8870
uses: ./.github/workflows/_build.yaml
@@ -176,22 +158,35 @@ jobs:
176158
URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }}
177159
secrets: inherit
178160

161+
build-axlearn:
  # Builds the axlearn container by calling the shared _build.yaml reusable
  # workflow, using the mealkit image produced by build-jax as its base.
  needs: [build-jax]
  uses: ./.github/workflows/_build.yaml
  secrets: inherit
  with:
    # What to build
    CONTAINER_NAME: axlearn
    DOCKERFILE: .github/container/Dockerfile.axlearn
    BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
    # Build parameters forwarded from this workflow's inputs
    ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
    BUILD_DATE: ${{ inputs.BUILD_DATE }}
    RUNNER_SIZE: large
    # Artifact and badge names handed to the build workflow
    ARTIFACT_NAME: artifact-axlearn-build
    BADGE_FILENAME: badge-axlearn-build
174+
179175
collect-docker-tags:
180176
runs-on: ubuntu-22.04
181-
if: "!cancelled()"
177+
if: ${{ !cancelled() }}
182178
needs:
183179
- build-base
184180
- build-jax
185-
- build-triton
186181
- build-equinox
187182
- build-maxtext
188183
- build-levanter
189184
- build-upstream-t5x
190185
- build-rosetta-t5x
191186
- build-gemma
187+
- build-axlearn
192188
outputs:
193189
TAGS: ${{ steps.collect-tags.outputs.TAGS }}
194-
195190
steps:
196191
- name: Save docker tags as a JSON object
197192
id: collect-tags
@@ -200,7 +195,21 @@ jobs:
200195
[\
201196
{"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\
202197
{"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
198+
{"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\
199+
{"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\
200+
{"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\
201+
{"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\
202+
{"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\
203+
{"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\
204+
{"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\
203205
{"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
206+
{"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\
207+
{"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\
208+
{"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\
209+
{"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
210+
{"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
211+
{"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\
212+
{"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\
204213
{}\
205214
]
206215
EOF
@@ -399,9 +408,8 @@ jobs:
399408
runs-on: eks
400409
env:
401410
JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
402-
JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-jax
403-
POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-postprocess
404-
TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token
411+
JOB_NAME: ${{ github.run_id }}-nsys-jax
412+
POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess
405413
steps:
406414
- name: Check out the repository
407415
uses: actions/checkout@v4
@@ -411,59 +419,37 @@ jobs:
411419
registry: ghcr.io
412420
username: ${{ github.repository_owner }}
413421
password: ${{ secrets.GITHUB_TOKEN }}
414-
- name: Store GitHub Container Registry token as Kubernetes secret
415-
run: |
416-
kubectl create secret generic \
417-
${{ github.run_id }}-${{ github.run_attempt }}-token \
418-
--from-file=.dockerconfigjson=$HOME/.docker/config.json \
419-
--type=kubernetes.io/dockerconfigjson
422+
- name: K8s GHCR store and delete token
423+
id: store-token
424+
uses: ./.github/actions/store-delete-k8s-ghcr
420425
- name: Configure Kubernetes job
421426
run: |
422427
yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
423428
| select(di == 1).metadata.name = strenv(JOB_NAME)
424-
| select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
429+
| select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
425430
| select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
426431
| select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \
427432
.github/eks-workflow-files/job.yml
428433
git diff .github/eks-workflow-files/job.yml
429434
- name: Submit Kubernetes job
430-
run: kubectl apply -f .github/eks-workflow-files/job.yml
431-
- name: Wait for Kubernetes job to start
432-
run: |
433-
while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-jax --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
434-
sleep 2
435-
done
436-
- name: Stream Kubernetes job output
437-
run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-jax
438-
# Clean up in case of errors as well as success
439-
- name: Delete Kubernetes job
440-
if: always()
441-
run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-jax
435+
uses: ./.github/actions/submit-delete-k8s-job
436+
with:
437+
job-config-file: .github/eks-workflow-files/job.yml
438+
job-name: ${{ env.JOB_NAME }}
442439
- name: Configure post-processing job
443440
run: |
444441
export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip"
445442
yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME)
446443
| .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE)
447-
| .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
444+
| .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
448445
| .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \
449446
.github/eks-workflow-files/post-process-job.yml
450447
git diff .github/eks-workflow-files/post-process-job.yml
451-
- name: Submit post-processing Kubernetes job
452-
run: kubectl apply -f .github/eks-workflow-files/post-process-job.yml
453-
- name: Wait for post-processing Kubernetes job to start
454-
run: |
455-
while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
456-
sleep 2
457-
done
458-
- name: Stream post-processing Kubernetes job output
459-
run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-postprocess
460-
# Clean up in case of errors as well as success
461-
- name: Delete post-processing Kubernetes job
462-
if: always()
463-
run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-postprocess
464-
- name: Delete GitHub Container Registry token
465-
if: always()
466-
run: kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token
448+
- name: Submit post process Kubernetes job
449+
uses: ./.github/actions/submit-delete-k8s-job
450+
with:
451+
job-config-file: .github/eks-workflow-files/post-process-job.yml
452+
job-name: ${{ env.POSTPROCESS_JOB_NAME }}
467453

468454
# test-equinox:
469455
# needs: build-equinox
@@ -663,3 +649,136 @@ jobs:
663649
with:
664650
MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
665651
secrets: inherit
652+
653+
test-axlearn-eks:
  # Runs the axlearn test job on the EKS runner (amd64 only), downloads the
  # summary and logs from S3, then publishes a sitrep, a badge and the logs
  # as a workflow artifact.
  needs: build-axlearn
  if: inputs.ARCHITECTURE == 'amd64'
  runs-on: eks
  env:
    AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
    JOB_NAME: axlearn-${{ github.run_id }}
  steps:
    - name: Check out the repository
      uses: actions/checkout@v4
    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}
    - name: K8s GHCR store and delete token
      id: store-token
      uses: ./.github/actions/store-delete-k8s-ghcr
    - name: Configure axlearn test job
      run: |
        # Replace placeholders in axlearn-job.yml with environment variables
        yq -i ea '
          select(di == 0).metadata.name = strenv(JOB_NAME)
          | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
          | select(di == 0).spec.template.spec.containers[1].env[0].value = "${{ github.run_id }}"
          | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
        .github/eks-workflow-files/axlearn/axlearn-job.yml
        git diff .github/eks-workflow-files/axlearn/axlearn-job.yml
    - name: Submit & delete axlearn test
      uses: ./.github/actions/submit-delete-k8s-job
      with:
        job-config-file: ".github/eks-workflow-files/axlearn/axlearn-job.yml"
        job-name: ${{ env.JOB_NAME }}
    - name: Download logs from S3
      id: log-s3
      run: |
        mkdir -p axlearn-output
        aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt axlearn-output/
        aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/ axlearn-output/ --recursive --exclude "*" --include "*.log"

        # `grep -c` still prints 0 when nothing matches but exits non-zero;
        # `|| true` keeps the step from aborting in that case.
        passed_tests=$(grep -c ": PASSED" axlearn-output/summary.txt || true)
        failed_tests=$(grep -c ": FAILED" axlearn-output/summary.txt || true)
        total_tests=$((failed_tests + passed_tests))

        echo "Passed tests: $passed_tests"
        echo "Failed tests: $failed_tests"
        echo "Total tests: $total_tests"
        echo "PASSED_TESTS=$passed_tests" >> "$GITHUB_OUTPUT"
        echo "FAILED_TESTS=$failed_tests" >> "$GITHUB_OUTPUT"
        echo "TOTAL_TESTS=$total_tests" >> "$GITHUB_OUTPUT"
    - name: Generate sitrep
      id: sitrep
      if: ${{ !cancelled() }}
      shell: bash -x -e {0}
      run: |
        # bring in utility functions
        source .github/workflows/scripts/to_json.sh

        badge_label='Axlearn EKS Unit'

        # This step also runs when log-s3 failed or was skipped, in which case
        # its outputs are empty strings. Fall back to 0 so the numeric tests
        # below stay valid instead of erroring and leaving a green badge.
        total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS || 0 }}
        failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS || 0 }}
        passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS || 0 }}
        errors="0"
        summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests."
        badge_message="Passed $passed_tests out of $total_tests."
        badge_color="brightgreen"
        # A run that produced no results at all must not be reported as green.
        if [ "$failed_tests" -gt 0 ] || [ "$total_tests" -eq 0 ]; then
          badge_color="red"
        fi

        to_json \
          summary \
          errors total_tests passed_tests failed_tests \
          badge_label badge_color badge_message \
        > sitrep.json

        # The prefix assignments are only visible to this to_json invocation.
        schemaVersion=1 \
        label="${badge_label}" \
        message="Passed $passed_tests out of $total_tests." \
        color=$badge_color \
        to_json schemaVersion label message color \
        > badge-axlearn-test.json

    - name: Upload artifacts
      if: ${{ !cancelled() }}
      uses: actions/upload-artifact@v4
      with:
        name: "artifact-axlearn-test"
        path: |
          sitrep.json
          badge-axlearn-test.json
          axlearn-output/*
747+
748+
# The fuji test runs for only 20 minutes; as of 2025-02-24 it
749+
# is not possible to set the `max_steps` value.
750+
# This will be done with custom Python code.
751+
test-axlearn-fuji-models-eks:
  # Submits the Kubernetes job described by axlearn-fuji-model.yml from the
  # EKS runner, using the final axlearn image. Only runs for amd64 builds.
  runs-on: eks
  needs: [build-axlearn]
  if: ${{ inputs.ARCHITECTURE == 'amd64' }}
  env:
    JOB_NAME: axlearn-fuji-3b-${{ github.run_id }}
    AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
  steps:
    - name: Check out the repository
      uses: actions/checkout@v4

    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}

    # The token name this action outputs is used as the image pull secret
    # when the job manifest is configured below.
    - name: K8s GHCR store and delete token
      id: store-token
      uses: ./.github/actions/store-delete-k8s-ghcr

    # Patch the job name, container image and image pull secret into the
    # first document of the manifest, then print the resulting diff.
    - name: Configure axlearn test job
      run: |
        yq -i ea '
          select(di == 0).metadata.name = strenv(JOB_NAME)
          | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
          | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
        .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
        git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml

    - name: Submit & delete axlearn test
      uses: ./.github/actions/submit-delete-k8s-job
      with:
        job-name: ${{ env.JOB_NAME }}
        job-config-file: .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
784+

0 commit comments

Comments
 (0)