NVIDIA · Steboss · Mar 7, 2025 · Mar 10, 2025 · Mar 10, 2025 · Mar 10, 2025
diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml
@@ -14,9 +14,10 @@ runs:
   steps:
     - name: Submit and Delete Kubernetes job
       uses: ./.github/actions/with-post-step 
+      shell: bash -eo pipefail
       with: 
         main: |
-          echo "Submit K8s job" 
+          echo "Submit K8s job ${{ inputs.job-config-file }}" 
           kubectl apply -f "${{ inputs.job-config-file }}"
 
           # Wait for job to be craeted
@@ -32,6 +33,37 @@ runs:
 
           # Stream logs
           kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }}
-
+
+          # Poll the job's failed/succeeded counts until the job has finished
+          while readarray -d : -t status < <(kubectl get job/${{ inputs.job-name }} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
+            failures="${status[0]:-0}"
+            successes="${status[1]:-0}"
+            total=$((failures + successes))
+
+            if [[ $total -lt 2 ]]; then
+              # failures + successes is still below 2, so keep waiting
+              sleep 1
+            elif [[ $total -eq 2 ]]; then
+              # total=2 => 2 successes, 2 failures, or 1 of each: stop polling.
+              # NOTE(review): the 2 is hard-coded; presumably it matches the
+              # number of completions the job config requests - confirm.
+              break
+            else
+              # total is above 2, which is unexpected: abort with a distinct code
+              exit 255
+            fi
+          done
+
+          # If any failure was counted, describe the job's pods and exit with status 1
+          if [[ $failures -gt 0 ]]; then
+            echo "Job ${{ inputs.job-name }} has $failures failures"
+            # look up the pods via the batch.kubernetes.io/job-name label (batch jobs only)
+            pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} -o name)
+            if [[ -n "${pods}" ]]; then
+              kubectl describe ${pods}
+            fi
+            exit 1
+          fi 
         post: | 
+          echo "Deleting K8s job: ${{ inputs.job-name }}"
           kubectl delete -f "${{ inputs.job-config-file }}"
diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml
@@ -769,4 +769,3 @@ jobs:
       with:
         job-config-file:  ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml"
         job-name: ${{ env.JOB_NAME }}
-
diff --git a/README.md b/README.md
@@ -10,12 +10,12 @@ We support and test the following JAX frameworks and model architectures. More d
 
 | Framework | Models | Use cases | Container |
 | :--- | :---: | :---: | :---: |
-| [maxtext](./rosetta/rosetta/projects/maxtext)| GPT, LLaMA, Gemma, Mistral, Mixtral | pretraining | `ghcr.io/nvidia/jax:maxtext` |
+| [maxtext](./rosetta/rosetta/projects/maxtext)| GPT, LLaMA, Gemma, Mistral, Mixtral | pre-training | `ghcr.io/nvidia/jax:maxtext` |
 | [t5x](./rosetta/rosetta/projects/t5x) | T5, ViT | pre-training, fine-tuning | `ghcr.io/nvidia/jax:t5x` |
 | [t5x](./rosetta/rosetta/projects/imagen) | Imagen | pre-training | `ghcr.io/nvidia/t5x:imagen-2023-10-02.v3` |
 | [big vision](./rosetta/rosetta/projects/paligemma) | PaliGemma | fine-tuning, evaluation | `ghcr.io/nvidia/jax:gemma` |
-| levanter | GPT, LLaMA, MPT, Backpacks | pretraining, fine-tuning | `ghcr.io/nvidia/jax:levanter` |
-| axlearn | Fuji | pretraining | `gchr.io/nvidia/jax:axlearn` | 
+| levanter | GPT, LLaMA, MPT, Backpacks | pre-training, fine-tuning | `ghcr.io/nvidia/jax:levanter` |
+| axlearn | Fuji | pre-training | `ghcr.io/nvidia/jax:axlearn` | 
 
 # Build Pipeline Status
 <table>
@@ -269,7 +269,7 @@ We support and test the following JAX frameworks and model architectures. More d
       </td>
       <td>
         <a href="https://gist.github.com/nvjax/913c2af68649fe568e9711c2dabb23ae#file-badge-maxtext-test-json">
-          <img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-axleran-test.json&logo=nvidia&label=A100%20distributed">
+          <img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-axlearn-test.json&logo=nvidia&label=A100%20distributed">
         </a>
       </td>
     </tr>
Original file line number	Diff line number	Diff line change
Expand Up		@@ -769,4 +769,3 @@ jobs:
		with:
		job-config-file: ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml"
		job-name: ${{ env.JOB_NAME }}