From e5ee4726468d8973f9c75cab3505d4a81c989d24 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 7 Mar 2025 08:54:37 +0000 Subject: [PATCH 1/7] Improve error handling, s3 mounting, distributed tests for --- .../actions/submit-delete-k8s-job/action.yml | 36 +++++++++++++++++-- .github/workflows/_ci.yaml | 1 - README.md | 8 ++--- 3 files changed, 38 insertions(+), 7 deletions(-) diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index dbeabe668..ffaa35575 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -14,9 +14,10 @@ runs: steps: - name: Submit and Delete Kubernetes job uses: ./.github/actions/with-post-step + shell: bash -eo pipefail with: main: | - echo "Submit K8s job" + echo "Submit K8s job ${{ inputs.job-config-file }}" kubectl apply -f "${{ inputs.job-config-file }}" # Wait for job to be craeted @@ -32,6 +33,37 @@ runs: # Stream logs kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }} - + + # Check whether the job succeeded or failed + while readarray -d : -t status < <(kubectl get job/${{ inputs.job-name }} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do + failures="${status[0]:-0}" + successes="${status[1]:-0}" + total=$((failures + successes)) + + if [[ $total -lt 2 ]]; then + # neither "failed" nor "succeeded" is 2, so wait + sleep 1 + elif [[ $total -eq 2 ]]; then + # we have total=2 => either 2 successes or 2 failures + # (or 1 failed + 1 succeeded). + # In any case, the job is done – break. 
+ break + else + # Just in case we get an unexpected number + exit 255 + fi + done + + # If job indicates a failure try to print out the info + if [[ $failures -gt 0 ]]; then + echo "Job ${{ inputs.job-name }} has $failures failures" + # this is for batch jobs only + pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} -o name) + if [[ -n "${pods}" ]]; then + kubectl describe ${pods} + fi + exit 1 + fi post: | + echo "Deleting K8s job: ${{ input.job-name }}" kubectl delete -f "${{ inputs.job-config-file }}" diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 8ed17d9d6..20084befa 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -769,4 +769,3 @@ jobs: with: job-config-file: ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml" job-name: ${{ env.JOB_NAME }} - diff --git a/README.md b/README.md index 83053215e..f6778d128 100644 --- a/README.md +++ b/README.md @@ -10,12 +10,12 @@ We support and test the following JAX frameworks and model architectures. 
More d
| Framework | Models | Use cases | Container |
| :--- | :---: | :---: | :---: |
-| [maxtext](./rosetta/rosetta/projects/maxtext)| GPT, LLaMA, Gemma, Mistral, Mixtral | pretraining | `ghcr.io/nvidia/jax:maxtext` |
+| [maxtext](./rosetta/rosetta/projects/maxtext)| GPT, LLaMA, Gemma, Mistral, Mixtral | pre-training | `ghcr.io/nvidia/jax:maxtext` |
| [t5x](./rosetta/rosetta/projects/t5x) | T5, ViT | pre-training, fine-tuning | `ghcr.io/nvidia/jax:t5x` |
| [t5x](./rosetta/rosetta/projects/imagen) | Imagen | pre-training | `ghcr.io/nvidia/t5x:imagen-2023-10-02.v3` |
| [big vision](./rosetta/rosetta/projects/paligemma) | PaliGemma | fine-tuning, evaluation | `ghcr.io/nvidia/jax:gemma` |
-| levanter | GPT, LLaMA, MPT, Backpacks | pretraining, fine-tuning | `ghcr.io/nvidia/jax:levanter` |
-| axlearn | Fuji | pretraining | `gchr.io/nvidia/jax:axlearn` |
+| levanter | GPT, LLaMA, MPT, Backpacks | pre-training, fine-tuning | `ghcr.io/nvidia/jax:levanter` |
+| axlearn | Fuji | pre-training | `ghcr.io/nvidia/jax:axlearn` |

# Build Pipeline Status

@@ -269,7 +269,7 @@ We support and test the following JAX frameworks and model architectures. 
More d From 9de29cd1a487f60c8f0e904a04234c54de68b5f7 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 10 Mar 2025 10:37:14 +0100 Subject: [PATCH 2/7] test mounted s3 bucket --- .../actions/submit-delete-k8s-job/action.yml | 26 +- .github/container/symlnk-cudnn.sh | 2 +- .github/container/symlnk-nccl.sh | 2 +- .../axlearn/axlearn-job.yml | 31 +- .github/workflows/_ci.yaml | 976 +++++++++--------- README.md | 2 +- 6 files changed, 518 insertions(+), 521 deletions(-) diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index ffaa35575..6788ebe3b 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -34,22 +34,30 @@ runs: # Stream logs kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }} - # Check whether the job succeeded or failed + # Detect job parallelism + parallelism=$(kubectl get job/"${{ inputs.job-name }}" -o jsonpath='{.spec.parallelism}') + # if parallelism is not set, use default value of 1 + if [[ -z "${parallelism}" ]]; then + echo "No parallelism specified, defaulting to 1" + parallelism=1 + fi + + # Check whether the job succeeded or failed while readarray -d : -t status < <(kubectl get job/${{ inputs.job-name }} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do failures="${status[0]:-0}" successes="${status[1]:-0}" total=$((failures + successes)) - if [[ $total -lt 2 ]]; then - # neither "failed" nor "succeeded" is 2, so wait + if [[ $total -lt $parallelism ]]; then + # neither "failed" nor "succeeded", so wait sleep 1 - elif [[ $total -eq 2 ]]; then - # we have total=2 => either 2 successes or 2 failures - # (or 1 failed + 1 succeeded). - # In any case, the job is done – break. 
+ elif [[ $total -eq $parallelism ]]; then + # we have total=parallelism => either X successes or X failures + # In any case, the job is done break else - # Just in case we get an unexpected number + # Log here + echo "Unexpected number of completed pods ${total} with parallelism ${parallelism}" exit 255 fi done @@ -64,6 +72,6 @@ runs: fi exit 1 fi - post: | + post: | echo "Deleting K8s job: ${{ input.job-name }}" kubectl delete -f "${{ inputs.job-config-file }}" diff --git a/.github/container/symlnk-cudnn.sh b/.github/container/symlnk-cudnn.sh index 5db2c411f..824ab1e7d 100755 --- a/.github/container/symlnk-cudnn.sh +++ b/.github/container/symlnk-cudnn.sh @@ -10,7 +10,7 @@ CUDNN_MAJOR_VERSION=9 prefix=/opt/nvidia/cudnn if [[ -d "${prefix}" ]]; then echo "Skipping link farm creation" - exit 1 + exit 0 fi arch=$(uname -m)-linux-gnu diff --git a/.github/container/symlnk-nccl.sh b/.github/container/symlnk-nccl.sh index 33b4ebaa9..8e3d335e0 100755 --- a/.github/container/symlnk-nccl.sh +++ b/.github/container/symlnk-nccl.sh @@ -8,7 +8,7 @@ set -ex -o pipefail prefix=/opt/nvidia/nccl if [[ -d "${prefix}" ]]; then echo "Skipping link farm creation" - exit 1 + exit 0 fi arch=$(uname -m)-linux-gnu nccl_packages=$(dpkg -l 'libnccl*' | awk '/^ii/ {print $2}') diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index 8f70908da..1c0a57e4f 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -26,33 +26,22 @@ spec: sync wait - # after execution flag the results have been produced - touch /opt/output/done - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: output - mountPath: /opt/output - - name: upload - image: amazon/aws-cli + # copy results to the mounted s3 bucket + mkdir -p /opt/jax-toolbox-eks-output/axlearn/${RUN_ID} + cp /opt/output/summary.txt /opt/jax-toolbox-eks-output/axlearn/${RUN_ID}/summary.txt + # copy all the 
log files + cp /opt/output/*.log /opt/jax-toolbox-eks-output/axlearn/${RUN_ID}/. env: - name: RUN_ID value: PLACEHOLDER - command: - - sh - - -c - - | - while [ ! -f /opt/output/done ]; do - sleep 5 - done - # Upload to S3 bucket - aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/summary.txt - # Upload logs to S3 bucket - aws s3 cp /opt/output/ s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/ --recursive --exclude "*" --include "*.log" + resources: + limits: + nvidia.com/gpu: 8 volumeMounts: - name: output mountPath: /opt/output + - name: s3-storage + mountPath: /opt/jax-toolbox-eks-output imagePullSecrets: - name: PLACEHOLDER volumes: diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 20084befa..d2ea0b7c5 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -66,115 +66,115 @@ jobs: URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} secrets: inherit - build-triton: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-triton-build - BADGE_FILENAME: badge-triton-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: triton - DOCKERFILE: .github/container/Dockerfile.triton - RUNNER_SIZE: large - EXTRA_BUILD_ARGS: | - URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} - secrets: inherit + # build-triton: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-triton-build + # BADGE_FILENAME: badge-triton-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: triton + # 
DOCKERFILE: .github/container/Dockerfile.triton + # RUNNER_SIZE: large + # EXTRA_BUILD_ARGS: | + # URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} + # secrets: inherit - build-equinox: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-equinox-build - BADGE_FILENAME: badge-equinox-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: equinox - DOCKERFILE: .github/container/Dockerfile.equinox - EXTRA_BUILD_ARGS: | - URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} - secrets: inherit + # build-equinox: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-equinox-build + # BADGE_FILENAME: badge-equinox-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: equinox + # DOCKERFILE: .github/container/Dockerfile.equinox + # EXTRA_BUILD_ARGS: | + # URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} + # secrets: inherit - build-maxtext: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-maxtext-build - BADGE_FILENAME: badge-maxtext-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: maxtext - DOCKERFILE: .github/container/Dockerfile.maxtext - EXTRA_BUILD_ARGS: | - URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - secrets: inherit + # build-maxtext: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-maxtext-build + # BADGE_FILENAME: badge-maxtext-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ 
needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: maxtext + # DOCKERFILE: .github/container/Dockerfile.maxtext + # EXTRA_BUILD_ARGS: | + # URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + # secrets: inherit - build-levanter: - needs: [build-jax] - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-levanter-build" - BADGE_FILENAME: "badge-levanter-build" - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: levanter - DOCKERFILE: .github/container/Dockerfile.levanter - EXTRA_BUILD_ARGS: | - URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} - URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} - secrets: inherit + # build-levanter: + # needs: [build-jax] + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: "artifact-levanter-build" + # BADGE_FILENAME: "badge-levanter-build" + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: levanter + # DOCKERFILE: .github/container/Dockerfile.levanter + # EXTRA_BUILD_ARGS: | + # URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} + # URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} + # secrets: inherit - build-upstream-t5x: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-t5x-build" - BADGE_FILENAME: "badge-t5x-build" - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: upstream-t5x - DOCKERFILE: .github/container/Dockerfile.t5x - EXTRA_BUILD_ARGS: | - URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} - URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} - secrets: inherit + # build-upstream-t5x: + # needs: build-jax + # uses: 
./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: "artifact-t5x-build" + # BADGE_FILENAME: "badge-t5x-build" + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: upstream-t5x + # DOCKERFILE: .github/container/Dockerfile.t5x + # EXTRA_BUILD_ARGS: | + # URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} + # URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} + # secrets: inherit - build-rosetta-t5x: - needs: build-upstream-t5x - uses: ./.github/workflows/_build_rosetta.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} - BASE_LIBRARY: t5x - secrets: inherit + # build-rosetta-t5x: + # needs: build-upstream-t5x + # uses: ./.github/workflows/_build_rosetta.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} + # BASE_LIBRARY: t5x + # secrets: inherit - build-gemma: - needs: build-jax - uses: ./.github/workflows/_build.yaml - if: inputs.ARCHITECTURE == 'amd64' # build only amd64 - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-gemma-build - BADGE_FILENAME: badge-gemma-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: gemma - DOCKERFILE: rosetta/Dockerfile.gemma - DOCKER_CONTEXT: . 
- EXTRA_BUILD_ARGS: | - URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} - URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} - URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} - URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} - URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} - secrets: inherit + # build-gemma: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # if: inputs.ARCHITECTURE == 'amd64' # build only amd64 + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-gemma-build + # BADGE_FILENAME: badge-gemma-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: gemma + # DOCKERFILE: rosetta/Dockerfile.gemma + # DOCKER_CONTEXT: . + # EXTRA_BUILD_ARGS: | + # URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} + # URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} + # URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} + # URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} + # URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} + # secrets: inherit build-axlearn: needs: build-jax @@ -196,13 +196,13 @@ jobs: needs: - build-base - build-jax - - build-triton - - build-equinox - - build-maxtext - - build-levanter - - build-upstream-t5x - - build-rosetta-t5x - - build-gemma + # - build-triton + # - build-equinox + # - build-maxtext + # - build-levanter + # - build-upstream-t5x + # - build-rosetta-t5x + # - build-gemma - build-axlearn outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} @@ -214,22 +214,22 @@ jobs: [\ {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\ {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "triton", 
"stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "triton", "stage": 
"mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\ {}\ @@ -239,404 +239,404 @@ jobs: echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT - test-distribution: - 
runs-on: ubuntu-22.04 - strategy: - matrix: - TEST_SCRIPT: - - extra-only-distribution.sh - - mirror-only-distribution.sh - - upstream-only-distribution.sh - - local-patch-distribution.sh - fail-fast: false - steps: - - name: Print environment variables - run: env - - name: Set git login for tests - run: | - git config --global user.email "jax@nvidia.com" - git config --global user.name "JAX-Toolbox CI" - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v4 - - name: Run integration test ${{ matrix.TEST_SCRIPT }} - run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} - - test-jax: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: jax - EXECUTE: | - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-backend-independent.log - test-jax.sh -b backend-independent - EOF - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee tee test-gpu.log - nvidia-cuda-mps-control -d - test-jax.sh -b gpu - EOF - STATISTICS_SCRIPT: | - errors=$(cat test-*.log | grep -c 'ERROR:' || true) - failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) - passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-backend-independent.log - test-gpu.log - secrets: inherit - - test-nsys-jax: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: nsys-jax - EXECUTE: | - set -o pipefail - num_tests=0 - num_failures=0 - # Run the pytest-driven tests; failure is explicitly handled below so 
set +e to - # avoid an early abort here. - set +e - docker run -i --shm-size=1g --gpus all \ - -v $PWD:/opt/output \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-nsys-jax.log - # nsys-jax is already installed, this is just adding the test dependencies - pip install pytest-reportlog nsys-jax[test] - # abuse knowledge that nsys-jax is installed editable, so the tests exist - test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') - pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" - EOF - set -e - GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') - for mode in 1-process 2-process process-per-gpu; do - DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" - if [[ "${mode}" == "1-process" ]]; then - PROCESS_COUNT=1 - ARGS="" - elif [[ "${mode}" == "2-process" ]]; then - # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that - # this will flush out more bugs than process-per-node or process-per-GPU. 
- PROCESS_COUNT=2 - ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" - else - PROCESS_COUNT=${GPUS_PER_NODE} - ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" - fi - for collection in full partial; do - NSYS_JAX="nsys-jax" - if [[ "${mode}" == "1-process" ]]; then - # We will not run nsys-jax-combine, so run analyses eagerly - NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" - fi - NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" - if [[ "${collection}" == "partial" ]]; then - NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" - # nvbug/4801401 - NSYS_JAX+=" --sample=none" - fi - set +e - ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ - -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log - num_failures=$((num_failures + ($? 
!= 0))) - set -e - num_tests=$((num_tests + 1)) - done - if [[ "${mode}" != "1-process" ]]; then - # Run nsys-jax-combine - NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" - for (( i=0; i> $GITHUB_ENV - echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV - exit $num_failures - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-nsys-jax.log) - num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) - num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) - num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT - ARTIFACTS: | - # pytest-driven part - test-nsys-jax.log - pytest-report.jsonl - # nsys-jax logfiles - *process-*-execution.log - # nsys-jax output for the case that doesn't use nsys-jax-combine - 1-process-*-execution-0.zip - # nsys-jax-combine output/logfiles - *process*-*-execution.zip - *-execution-combine.log - secrets: inherit - - #test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test - #runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does - #not already have nsys-jax installed - test-nsys-jax-archive: - needs: test-nsys-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - strategy: - 
matrix: - os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] - runs-on: ${{ matrix.os }} - steps: - - name: Download nsys-jax output .zip files - uses: actions/download-artifact@v4 - with: - name: nsys-jax-unit-test-A100 - - name: Extract archives and execute install scripts - run: | - pip install virtualenv # for install.sh - for zip in $(ls *.zip); do - ZIP="${PWD}/${zip}" - pushd $(mktemp -d) - unzip "${ZIP}" - ls -l - # TODO: verify this isn't needed, or make sure it isn't needed - chmod 755 install.sh - # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout - # Skip executing Jupyter lab - NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh - popd - done - - test-nsys-jax-eks: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - runs-on: eks - env: - JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: ${{ github.run_id }}-nsys-jax - POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess - steps: - - name: Check out the repository - uses: actions/checkout@v4 - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: K8s GHCR store and delete token - id: store-token - uses: ./.github/actions/store-delete-k8s-ghcr - - name: Configure Kubernetes job - run: | - yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) - | select(di == 1).metadata.name = strenv(JOB_NAME) - | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" - | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) - | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ - .github/eks-workflow-files/job.yml - git diff .github/eks-workflow-files/job.yml - - name: Submit Kubernetes job - uses: ./.github/actions/submit-delete-k8s-job - with: - 
job-config-file: .github/eks-workflow-files/job.yml - job-name: ${{ env.JOB_NAME }} - - name: Configure post-processing job - run: | - export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" - yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) - | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) - | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" - | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ - .github/eks-workflow-files/post-process-job.yml - git diff .github/eks-workflow-files/post-process-job.yml - - name: Submit post process Kubernetes job - uses: ./.github/actions/submit-delete-k8s-job - with: - job-config-file: .github/eks-workflow-files/post-process-job.yml - job-name: ${{ env.POSTPROCESS_JOB_NAME }} - - # test-equinox: - # needs: build-equinox + # test-distribution: + # runs-on: ubuntu-22.04 + # strategy: + # matrix: + # TEST_SCRIPT: + # - extra-only-distribution.sh + # - mirror-only-distribution.sh + # - upstream-only-distribution.sh + # - local-patch-distribution.sh + # fail-fast: false + # steps: + # - name: Print environment variables + # run: env + # - name: Set git login for tests + # run: | + # git config --global user.email "jax@nvidia.com" + # git config --global user.name "JAX-Toolbox CI" + # - name: Check out the repository under ${GITHUB_WORKSPACE} + # uses: actions/checkout@v4 + # - name: Run integration test ${{ matrix.TEST_SCRIPT }} + # run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} + + # test-jax: + # needs: build-jax # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a # uses: ./.github/workflows/_test_unit.yaml # with: - # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} - # TEST_NAME: equinox + # TEST_NAME: jax # EXECUTE: | - # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ - # bash -exc -o pipefail \ - # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-backend-independent.log + # test-jax.sh -b backend-independent + # EOF + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee tee test-gpu.log + # nvidia-cuda-mps-control -d + # test-jax.sh -b gpu + # EOF # STATISTICS_SCRIPT: | - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # errors=$(cat test-*.log | grep -c 'ERROR:' || true) + # failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) + # passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) # total_tests=$((failed_tests + passed_tests)) # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT # ARTIFACTS: | - # test-equinox.log + # test-backend-independent.log + # test-gpu.log # secrets: inherit - # test-te-multigpu: - # needs: build-upstream-pax + # test-nsys-jax: + # needs: build-jax # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_te.yaml + # uses: ./.github/workflows/_test_unit.yaml # with: - # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + # TEST_NAME: nsys-jax + # EXECUTE: | + # set -o pipefail + # num_tests=0 + # num_failures=0 + # # Run the pytest-driven tests; failure is explicitly handled below so set +e to + # # avoid an early abort here. 
+ # set +e + # docker run -i --shm-size=1g --gpus all \ + # -v $PWD:/opt/output \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-nsys-jax.log + # # nsys-jax is already installed, this is just adding the test dependencies + # pip install pytest-reportlog nsys-jax[test] + # # abuse knowledge that nsys-jax is installed editable, so the tests exist + # test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') + # pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" + # EOF + # set -e + # GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') + # for mode in 1-process 2-process process-per-gpu; do + # DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" + # if [[ "${mode}" == "1-process" ]]; then + # PROCESS_COUNT=1 + # ARGS="" + # elif [[ "${mode}" == "2-process" ]]; then + # # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that + # # this will flush out more bugs than process-per-node or process-per-GPU. 
+ # PROCESS_COUNT=2 + # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" + # else + # PROCESS_COUNT=${GPUS_PER_NODE} + # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" + # fi + # for collection in full partial; do + # NSYS_JAX="nsys-jax" + # if [[ "${mode}" == "1-process" ]]; then + # # We will not run nsys-jax-combine, so run analyses eagerly + # NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" + # fi + # NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" + # if [[ "${collection}" == "partial" ]]; then + # NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" + # # nvbug/4801401 + # NSYS_JAX+=" --sample=none" + # fi + # set +e + # ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ + # -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log + # num_failures=$((num_failures + ($? 
!= 0))) + # set -e + # num_tests=$((num_tests + 1)) + # done + # if [[ "${mode}" != "1-process" ]]; then + # # Run nsys-jax-combine + # NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" + # for (( i=0; i> $GITHUB_ENV + # echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV + # exit $num_failures + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-nsys-jax.log) + # num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) + # num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) + # num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # # pytest-driven part + # test-nsys-jax.log + # pytest-report.jsonl + # # nsys-jax logfiles + # *process-*-execution.log + # # nsys-jax output for the case that doesn't use nsys-jax-combine + # 1-process-*-execution-0.zip + # # nsys-jax-combine output/logfiles + # *process*-*-execution.zip + # *-execution-combine.log # secrets: inherit - test-upstream-t5x: - needs: build-upstream-t5x - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_upstream_t5x.yaml - with: - T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + #test-nsys-jax generates several fresh .zip archive outputs by running 
nsys-jax with real GPU hardware; this test + #runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does + #not already have nsys-jax installed + # test-nsys-jax-archive: + # needs: test-nsys-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # strategy: + # matrix: + # os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] + # runs-on: ${{ matrix.os }} + # steps: + # - name: Download nsys-jax output .zip files + # uses: actions/download-artifact@v4 + # with: + # name: nsys-jax-unit-test-A100 + # - name: Extract archives and execute install scripts + # run: | + # pip install virtualenv # for install.sh + # for zip in $(ls *.zip); do + # ZIP="${PWD}/${zip}" + # pushd $(mktemp -d) + # unzip "${ZIP}" + # ls -l + # # TODO: verify this isn't needed, or make sure it isn't needed + # chmod 755 install.sh + # # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout + # # Skip executing Jupyter lab + # NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh + # popd + # done + + # test-nsys-jax-eks: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # runs-on: eks + # env: + # JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} + # JOB_NAME: ${{ github.run_id }}-nsys-jax + # POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess + # steps: + # - name: Check out the repository + # uses: actions/checkout@v4 + # - name: Login to GitHub Container Registry + # uses: docker/login-action@v3 + # with: + # registry: ghcr.io + # username: ${{ github.repository_owner }} + # password: ${{ secrets.GITHUB_TOKEN }} + # - name: K8s GHCR store and delete token + # id: store-token + # uses: ./.github/actions/store-delete-k8s-ghcr + # - name: Configure Kubernetes job + # run: | + # yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) + # | select(di == 1).metadata.name = strenv(JOB_NAME) + # | select(di == 
1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" + # | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) + # | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ + # .github/eks-workflow-files/job.yml + # git diff .github/eks-workflow-files/job.yml + # - name: Submit Kubernetes job + # uses: ./.github/actions/submit-delete-k8s-job + # with: + # job-config-file: .github/eks-workflow-files/job.yml + # job-name: ${{ env.JOB_NAME }} + # - name: Configure post-processing job + # run: | + # export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" + # yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) + # | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) + # | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" + # | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ + # .github/eks-workflow-files/post-process-job.yml + # git diff .github/eks-workflow-files/post-process-job.yml + # - name: Submit post process Kubernetes job + # uses: ./.github/actions/submit-delete-k8s-job + # with: + # job-config-file: .github/eks-workflow-files/post-process-job.yml + # job-name: ${{ env.POSTPROCESS_JOB_NAME }} + + # # test-equinox: + # # needs: build-equinox + # # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # # uses: ./.github/workflows/_test_unit.yaml + # # with: + # # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} + # # TEST_NAME: equinox + # # EXECUTE: | + # # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ + # # bash -exc -o pipefail \ + # # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log + # # STATISTICS_SCRIPT: | + # # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # # total_tests=$((failed_tests + passed_tests)) + # # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # # ARTIFACTS: | + # # test-equinox.log + # # secrets: inherit + + # # test-te-multigpu: + # # needs: build-upstream-pax + # # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # # uses: ./.github/workflows/_test_te.yaml + # # with: + # # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + # # secrets: inherit + + # test-upstream-t5x: + # needs: build-upstream-t5x + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_upstream_t5x.yaml + # with: + # T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-rosetta-t5x: - needs: build-rosetta-t5x - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_t5x_rosetta.yaml - with: - T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-rosetta-t5x: + # needs: build-rosetta-t5x + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_t5x_rosetta.yaml + # with: + # T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-triton: - needs: build-triton - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: triton - EXECUTE: | - docker run -i --shm-size=1g --gpus 
all --volume $PWD:/output \ - ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-triton.log - # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on - # actually having a CUDA backend for pytoch - pip install --no-deps torch - python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml - EOF - STATISTICS_SCRIPT: | - curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; - total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) - errors=$(./yq '.testsuites."+@errors"' triton_test.xml) - failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) - passed_tests=$((total_tests - errors - failed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-triton.log - secrets: inherit + # test-triton: + # needs: build-triton + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: triton + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ + # ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-triton.log + # # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on + # # actually having a CUDA backend for pytoch + # pip install --no-deps torch + # python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml + # EOF + # STATISTICS_SCRIPT: | + # curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; + # total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) + # errors=$(./yq '.testsuites."+@errors"' triton_test.xml) + 
# failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) + # passed_tests=$((total_tests - errors - failed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-triton.log + # secrets: inherit - test-levanter: - needs: build-levanter - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: levanter - EXECUTE: | - docker run -i --gpus all --shm-size=1g \ - ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-levanter.log - pip install flake8 pytest soundfile librosa - PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" - EOF - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-levanter.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-levanter.log - secrets: inherit - - # test-te: - # needs: build-upstream-pax + # test-levanter: + # needs: build-levanter # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a # uses: ./.github/workflows/_test_unit.yaml # with: - # TEST_NAME: te + # TEST_NAME: levanter # EXECUTE: | - # docker run -i --gpus all --shm-size=1g -v $PWD:/log \ - # ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-te.log - # 
pip install pytest-reportlog - # pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax + # docker run -i --gpus all --shm-size=1g \ + # ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-levanter.log + # pip install flake8 pytest soundfile librosa + # PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" # EOF # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-te.log) + # summary_line=$(tail -n1 test-levanter.log) # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') # total_tests=$((failed_tests + passed_tests)) # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # TIMEOUT_MINUTES: 120 # ARTIFACTS: | - # test-te.log - # pytest-report.jsonl + # test-levanter.log + # secrets: inherit + + # # test-te: + # # needs: build-upstream-pax + # # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # # uses: ./.github/workflows/_test_unit.yaml + # # with: + # # TEST_NAME: te + # # EXECUTE: | + # # docker run -i --gpus all --shm-size=1g -v $PWD:/log \ + # # ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ + # # bash <<"EOF" |& tee test-te.log + # # pip install pytest-reportlog + # # pytest 
--report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax + # # EOF + # # STATISTICS_SCRIPT: | + # # summary_line=$(tail -n1 test-te.log) + # # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + # # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # # total_tests=$((failed_tests + passed_tests)) + # # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # # TIMEOUT_MINUTES: 120 + # # ARTIFACTS: | + # # test-te.log + # # pytest-report.jsonl + # # secrets: inherit + + # test-gemma: + # needs: build-gemma + # uses: ./.github/workflows/_test_unit.yaml + # if: inputs.ARCHITECTURE == 'amd64' + # with: + # TEST_NAME: gemma + # EXECUTE: | + # docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ + # bash -ec \ + # "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-gemma.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-gemma.log # secrets: inherit - test-gemma: - needs: build-gemma - uses: ./.github/workflows/_test_unit.yaml - if: inputs.ARCHITECTURE == 'amd64' - with: - TEST_NAME: gemma - EXECUTE: | - docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ - bash -ec \ - "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-gemma.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-gemma.log - secrets: inherit - - test-maxtext: - needs: build-maxtext - if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners - uses: ./.github/workflows/_test_maxtext.yaml - with: - MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-maxtext: + # needs: build-maxtext + # if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners + # uses: ./.github/workflows/_test_maxtext.yaml + # with: + # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit test-axlearn-eks: needs: build-axlearn @@ -663,7 +663,7 @@ jobs: yq -i ea ' select(di == 0).metadata.name = strenv(JOB_NAME) | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) - | select(di == 0).spec.template.spec.containers[1].env[0].value = "${{ github.run_id }}" + | select(di == 0).spec.template.spec.containers[0].env[0].value = "${{ github.run_id }}" | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ .github/eks-workflow-files/axlearn/axlearn-job.yml git diff .github/eks-workflow-files/axlearn/axlearn-job.yml diff --git a/README.md b/README.md index f6778d128..bca12d6e5 100644 --- a/README.md +++ b/README.md @@ -269,7 +269,7 @@ We support and test the following JAX frameworks and 
model architectures. More d From 6c6cd345f3075d73d0cbc6ea318504341b1f9f1d Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 10 Mar 2025 11:50:57 +0100 Subject: [PATCH 3/7] Fix action --- .github/actions/submit-delete-k8s-job/action.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index 6788ebe3b..ffff89751 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -14,7 +14,6 @@ runs: steps: - name: Submit and Delete Kubernetes job uses: ./.github/actions/with-post-step - shell: bash -eo pipefail with: main: | echo "Submit K8s job ${{ inputs.job-config-file }}" @@ -73,5 +72,5 @@ runs: exit 1 fi post: | - echo "Deleting K8s job: ${{ input.job-name }}" + echo "Deleting K8s job: ${{ inputs.job-name }}" kubectl delete -f "${{ inputs.job-config-file }}" From aa63ac6e06a8540055f6ba522a9dedc12a1e8ce3 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 10 Mar 2025 12:36:28 +0100 Subject: [PATCH 4/7] fix the bash shell and remember to mount the volume --- .github/actions/submit-delete-k8s-job/action.yml | 1 + .github/eks-workflow-files/axlearn/axlearn-job.yml | 3 +++ 2 files changed, 4 insertions(+) diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index ffff89751..44b630ec4 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -14,6 +14,7 @@ runs: steps: - name: Submit and Delete Kubernetes job uses: ./.github/actions/with-post-step + shell: bash with: main: | echo "Submit K8s job ${{ inputs.job-config-file }}" diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index 1c0a57e4f..5c65975a6 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ 
b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -47,3 +47,6 @@ spec: volumes: - name: output emptyDir: {} + - name: s3-storage + persistentVolumeClaim: + claimName: s3-pvc From 0ae1b83f63f4dbb1cacf88a415f55e17250c70b5 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 10 Mar 2025 14:06:42 +0100 Subject: [PATCH 5/7] start working on the shell of the action --- .github/actions/submit-delete-k8s-job/action.yml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index 44b630ec4..fd76be0d8 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -14,7 +14,6 @@ runs: steps: - name: Submit and Delete Kubernetes job uses: ./.github/actions/with-post-step - shell: bash with: main: | echo "Submit K8s job ${{ inputs.job-config-file }}" @@ -37,7 +36,7 @@ runs: # Detect job parallelism parallelism=$(kubectl get job/"${{ inputs.job-name }}" -o jsonpath='{.spec.parallelism}') # if parallelism is not set, use default value of 1 - if [[ -z "${parallelism}" ]]; then + if [ -z "${parallelism}" ]; then echo "No parallelism specified, defaulting to 1" parallelism=1 fi @@ -48,10 +47,10 @@ runs: successes="${status[1]:-0}" total=$((failures + successes)) - if [[ $total -lt $parallelism ]]; then + if [ $total -lt $parallelism ]; then # neither "failed" nor "succeeded", so wait sleep 1 - elif [[ $total -eq $parallelism ]]; then + elif [ $total -eq $parallelism ]; then # we have total=parallelism => either X successes or X failures # In any case, the job is done break @@ -63,11 +62,11 @@ runs: done # If job indicates a failure try to print out the info - if [[ $failures -gt 0 ]]; then + if [ $failures -gt 0 ]; then echo "Job ${{ inputs.job-name }} has $failures failures" # this is for batch jobs only pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} -o name) - 
if [[ -n "${pods}" ]]; then + if [ -n "${pods}" ]; then kubectl describe ${pods} fi exit 1 From 8f65cd4a2705c6d21f5637e5a6e87027b88ce2a5 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 10 Mar 2025 15:42:16 +0100 Subject: [PATCH 6/7] try to fix using posix-sh-compatible --- .github/actions/submit-delete-k8s-job/action.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index fd76be0d8..c98da4b98 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -42,7 +42,7 @@ runs: fi # Check whether the job succeeded or failed - while readarray -d : -t status < <(kubectl get job/${{ inputs.job-name }} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do + while IFS=: read -r failures successes; do failures="${status[0]:-0}" successes="${status[1]:-0}" total=$((failures + successes)) @@ -59,7 +59,9 @@ runs: echo "Unexpected number of completed pods ${total} with parallelism ${parallelism}" exit 255 fi - done + done < Date: Mon, 10 Mar 2025 16:38:22 +0100 Subject: [PATCH 7/7] test on name of the volume and location --- .github/eks-workflow-files/axlearn/axlearn-job.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index 5c65975a6..090ad19f3 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -27,10 +27,10 @@ spec: sync wait # copy results to the mounted s3 bucket - mkdir -p /opt/jax-toolbox-eks-output/axlearn/${RUN_ID} - cp /opt/output/summary.txt /opt/jax-toolbox-eks-output/axlearn/${RUN_ID}/summary.txt + mkdir -p /jax-toolbox-eks-output/axlearn/${RUN_ID} + cp /opt/output/summary.txt /jax-toolbox-eks-output/axlearn/${RUN_ID}/summary.txt # copy all the log files - cp /opt/output/*.log 
/opt/jax-toolbox-eks-output/axlearn/${RUN_ID}/. + cp /opt/output/*.log /jax-toolbox-eks-output/axlearn/${RUN_ID}/. env: - name: RUN_ID value: PLACEHOLDER @@ -41,7 +41,7 @@ spec: - name: output mountPath: /opt/output - name: s3-storage - mountPath: /opt/jax-toolbox-eks-output + mountPath: /jax-toolbox-eks-output imagePullSecrets: - name: PLACEHOLDER volumes:
- +
- +