Merge main

aybchan · aybchan · commit a5d15146bd39 · 2025-03-12T14:04:36.000Z
diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml
@@ -37,7 +37,6 @@ permissions:
   packages: write # to upload container
 
 jobs:
-
   build-base:
     uses: ./.github/workflows/_build_base.yaml
     with:
@@ -66,23 +65,6 @@ jobs:
         URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }}
     secrets: inherit
 
-  build-triton:
-    needs: build-jax
-    if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64
-    uses: ./.github/workflows/_build.yaml
-    with:
-      ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
-      ARTIFACT_NAME: artifact-triton-build
-      BADGE_FILENAME: badge-triton-build
-      BUILD_DATE: ${{ inputs.BUILD_DATE }}
-      BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
-      CONTAINER_NAME: triton
-      DOCKERFILE: .github/container/Dockerfile.triton
-      RUNNER_SIZE: large
-      EXTRA_BUILD_ARGS: |
-        URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }}
-    secrets: inherit
-
   build-equinox:
     needs: build-jax
     uses: ./.github/workflows/_build.yaml
@@ -176,22 +158,35 @@ jobs:
         URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }}
     secrets: inherit
 
+  build-axlearn:  # AXLearn container; BASE_IMAGE is the mealkit tag output by build-jax
+    needs: build-jax
+    uses: ./.github/workflows/_build.yaml
+    with:
+      CONTAINER_NAME: axlearn
+      DOCKERFILE: .github/container/Dockerfile.axlearn
+      BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
+      ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
+      BUILD_DATE: ${{ inputs.BUILD_DATE }}
+      RUNNER_SIZE: large
+      ARTIFACT_NAME: artifact-axlearn-build
+      BADGE_FILENAME: badge-axlearn-build
+    secrets: inherit
+
   collect-docker-tags:
     runs-on: ubuntu-22.04
-    if: "!cancelled()"
+    if: ${{ !cancelled() }}
     needs:
       - build-base
       - build-jax
-      - build-triton
       - build-equinox
       - build-maxtext
       - build-levanter
       - build-upstream-t5x
       - build-rosetta-t5x
       - build-gemma
+      - build-axlearn
     outputs:
       TAGS: ${{ steps.collect-tags.outputs.TAGS }}
-
     steps:
       - name: Save docker tags as a JSON object
         id: collect-tags
@@ -200,7 +195,21 @@ jobs:
           [\
             {"flavor": "base",         "stage": "final",   "priority": 800,  "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\
             {"flavor": "jax",          "stage": "final",   "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
+            {"flavor": "equinox",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\
+            {"flavor": "maxtext",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\
+            {"flavor": "levanter",     "stage": "final",   "priority": 900,  "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\
+            {"flavor": "upstream-t5x", "stage": "final",   "priority": 900,  "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\
+            {"flavor": "t5x",          "stage": "final",   "priority": 900,  "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\
+            {"flavor": "gemma",        "stage": "final",   "priority": 900,  "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\
+            {"flavor": "axlearn",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\
             {"flavor": "jax",          "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
+            {"flavor": "equinox",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\
+            {"flavor": "maxtext",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\
+            {"flavor": "levanter",     "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\
+            {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
+            {"flavor": "t5x",          "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\
+            {"flavor": "gemma",        "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\
+            {"flavor": "axlearn",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\
             {}\
           ]
           EOF
@@ -399,9 +408,8 @@ jobs:
     runs-on: eks
     env:
       JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
-      JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-jax
-      POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-postprocess
-      TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token
+      JOB_NAME: ${{ github.run_id }}-nsys-jax
+      POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess
     steps:
     - name: Check out the repository
       uses: actions/checkout@v4
@@ -411,59 +419,37 @@ jobs:
         registry: ghcr.io
         username: ${{ github.repository_owner }}
         password: ${{ secrets.GITHUB_TOKEN }}
-    - name: Store GitHub Container Registry token as Kubernetes secret
-      run: |
-        kubectl create secret generic \
-          ${{ github.run_id }}-${{ github.run_attempt }}-token \
-          --from-file=.dockerconfigjson=$HOME/.docker/config.json \
-          --type=kubernetes.io/dockerconfigjson
+    - name: K8s GHCR store and delete token
+      id: store-token
+      uses: ./.github/actions/store-delete-k8s-ghcr 
     - name: Configure Kubernetes job
       run: |
         yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
           | select(di == 1).metadata.name = strenv(JOB_NAME)
-          | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
+          | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
           | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
           | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \
           .github/eks-workflow-files/job.yml
         git diff .github/eks-workflow-files/job.yml
     - name: Submit Kubernetes job
-      run: kubectl apply -f .github/eks-workflow-files/job.yml
-    - name: Wait for Kubernetes job to start
-      run: |
-        while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-jax --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
-          sleep 2
-        done
-    - name: Stream Kubernetes job output
-      run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-jax
-    # Clean up in case of errors as well as success
-    - name: Delete Kubernetes job
-      if: always()
-      run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-jax
+      uses: ./.github/actions/submit-delete-k8s-job
+      with: 
+        job-config-file: .github/eks-workflow-files/job.yml
+        job-name: ${{ env.JOB_NAME }}
     - name: Configure post-processing job
       run: |
         export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip"
         yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME)
           | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE)
-          | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
+          | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
           | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \
           .github/eks-workflow-files/post-process-job.yml
         git diff .github/eks-workflow-files/post-process-job.yml
-    - name: Submit post-processing Kubernetes job
-      run: kubectl apply -f .github/eks-workflow-files/post-process-job.yml
-    - name: Wait for post-processing Kubernetes job to start
-      run: |
-        while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
-          sleep 2
-        done
-    - name: Stream post-processing Kubernetes job output
-      run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-postprocess
-    # Clean up in case of errors as well as success
-    - name: Delete post-processing Kubernetes job
-      if: always()
-      run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-postprocess
-    - name: Delete GitHub Container Registry token
-      if: always()
-      run: kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token
+    - name: Submit post process Kubernetes job
+      uses: ./.github/actions/submit-delete-k8s-job
+      with: 
+        job-config-file: .github/eks-workflow-files/post-process-job.yml
+        job-name: ${{ env.POSTPROCESS_JOB_NAME }}
 
   # test-equinox:
   #   needs: build-equinox
@@ -663,3 +649,136 @@ jobs:
     with:
       MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
     secrets: inherit
+
+  test-axlearn-eks:  # AXLearn unit tests on EKS; publishes sitrep.json and a badge
+    needs: build-axlearn
+    if: inputs.ARCHITECTURE == 'amd64'
+    runs-on: eks
+    env:
+      AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
+      JOB_NAME: axlearn-${{ github.run_id }}
+    steps:
+    - name: Check out the repository
+      uses: actions/checkout@v4
+    - name: Login to GitHub Container Registry
+      uses: docker/login-action@v3
+      with:
+        registry: ghcr.io
+        username: ${{ github.repository_owner }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+    - name: K8s GHCR store and delete token
+      id: store-token
+      uses: ./.github/actions/store-delete-k8s-ghcr
+    - name: Configure axlearn test job
+      env:  # read by yq through strenv(), like JOB_NAME and AXLEARN_DOCKER_IMAGE
+        TOKEN_NAME: ${{ steps.store-token.outputs.token-name }}
+      run: |
+        # Fill in the job name, image, run id and pull secret in axlearn-job.yml
+        yq -i ea 'select(di == 0).metadata.name = strenv(JOB_NAME)
+          | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
+          | select(di == 0).spec.template.spec.containers[1].env[0].value = "${{ github.run_id }}"
+          | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \
+          .github/eks-workflow-files/axlearn/axlearn-job.yml
+        git diff .github/eks-workflow-files/axlearn/axlearn-job.yml
+    - name: Submit & delete axlearn test
+      uses: ./.github/actions/submit-delete-k8s-job
+      with:
+        job-config-file: ".github/eks-workflow-files/axlearn/axlearn-job.yml"
+        job-name: ${{ env.JOB_NAME }}
+    - name: Download logs from S3
+      id: log-s3
+      if: ${{ !cancelled() }}  # still collect results when the Kubernetes job step failed
+      run: |
+        mkdir -p axlearn-output
+        aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt axlearn-output/
+        aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/ axlearn-output/ --recursive --exclude "*" --include "*.log"
+
+        # grep -c exits non-zero on a zero count; "|| true" keeps the step alive
+        passed_tests=$(grep -c ": PASSED" axlearn-output/summary.txt || true)
+        failed_tests=$(grep -c ": FAILED" axlearn-output/summary.txt || true)
+        total_tests=$((failed_tests + passed_tests))
+        echo "Passed tests: $passed_tests"
+        echo "Failed tests: $failed_tests"
+        echo "Total tests: $total_tests"
+        echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT
+        echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT
+        echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT
+    - name: Generate sitrep
+      id: sitrep
+      if: ${{ !cancelled() }}
+      shell: bash -x -e {0}
+      run: |
+        # bring in utility functions
+        source .github/workflows/scripts/to_json.sh
+        badge_label='Axlearn EKS Unit'
+        # Counts default to 0 when the download step produced no outputs.
+        total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS || 0 }}
+        failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS || 0 }}
+        passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS || 0 }}
+        errors="0"
+        summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests."
+        badge_message="Passed $passed_tests out of $total_tests."
+        badge_color="brightgreen"
+        # Red on any failure, and also when no result was recorded at all.
+        if [ "$failed_tests" -gt 0 ] || [ "$total_tests" -eq 0 ]; then
+          badge_color="red"
+        fi
+
+        to_json \
+          summary \
+          errors total_tests passed_tests failed_tests \
+          badge_label badge_color badge_message \
+        > sitrep.json
+        schemaVersion=1 \
+        label="${badge_label}" \
+        message="${badge_message}" \
+        color=$badge_color \
+        to_json schemaVersion label message color \
+        > badge-axlearn-test.json
+    - name: Upload artifacts
+      if: ${{ !cancelled() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: "artifact-axlearn-test"
+        path: |
+          sitrep.json
+          badge-axlearn-test.json
+          axlearn-output/*
+
+  # The fuji test runs for 20 minutes only: as of 2025-02-24 it is
+  # not possible to set the `max_steps` value.
+  # This will be done with custom Python code.
+  test-axlearn-fuji-models-eks:  # fuji model job on the EKS runner
+    needs: build-axlearn
+    if: inputs.ARCHITECTURE == 'amd64'
+    runs-on: eks
+    env:
+      AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
+      JOB_NAME: axlearn-fuji-3b-${{ github.run_id }}
+    steps:
+    - name: Check out the repository
+      uses: actions/checkout@v4
+    - name: Login to GitHub Container Registry
+      uses: docker/login-action@v3
+      with:
+        registry: ghcr.io
+        username: ${{ github.repository_owner }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+    - name: K8s GHCR store and delete token
+      id: store-token
+      uses: ./.github/actions/store-delete-k8s-ghcr
+    - name: Configure axlearn test job
+      env:  # read by yq through strenv(), like JOB_NAME and AXLEARN_DOCKER_IMAGE
+        TOKEN_NAME: ${{ steps.store-token.outputs.token-name }}
+      run: |
+        yq -i ea 'select(di == 0).metadata.name = strenv(JOB_NAME)
+          | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
+          | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \
+          .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
+        git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
+    - name: Submit & delete axlearn test
+      uses: ./.github/actions/submit-delete-k8s-job
+      with:
+        job-config-file: ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml"
+        job-name: ${{ env.JOB_NAME }}
+