From e5ee4726468d8973f9c75cab3505d4a81c989d24 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 7 Mar 2025 08:54:37 +0000 Subject: [PATCH 1/7] Improve error handling, s3 mounting, distributed tests for --- .../actions/submit-delete-k8s-job/action.yml | 36 +++++++++++++++++-- .github/workflows/_ci.yaml | 1 - README.md | 8 ++--- 3 files changed, 38 insertions(+), 7 deletions(-) diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index dbeabe668..ffaa35575 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -14,9 +14,10 @@ runs: steps: - name: Submit and Delete Kubernetes job uses: ./.github/actions/with-post-step + shell: bash -eo pipefail with: main: | - echo "Submit K8s job" + echo "Submit K8s job ${{ inputs.job-config-file }}" kubectl apply -f "${{ inputs.job-config-file }}" # Wait for job to be craeted @@ -32,6 +33,37 @@ runs: # Stream logs kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }} - + + # Check whether the job succeeded or failed + while readarray -d : -t status < <(kubectl get job/${{ inputs.job-name }} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do + failures="${status[0]:-0}" + successes="${status[1]:-0}" + total=$((failures + successes)) + + if [[ $total -lt 2 ]]; then + # neither "failed" nor "succeeded" is 2, so wait + sleep 1 + elif [[ $total -eq 2 ]]; then + # we have total=2 => either 2 successes or 2 failures + # (or 1 failed + 1 succeeded). + # In any case, the job is done – break. 
+ break + else + # Just in case we get an unexpected number + exit 255 + fi + done + + # If job indicates a failure try to print out the info + if [[ $failures -gt 0 ]]; then + echo "Job ${{ inputs.job-name }} has $failures failures" + # this is for batch jobs only + pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} -o name) + if [[ -n "${pods}" ]]; then + kubectl describe ${pods} + fi + exit 1 + fi post: | + echo "Deleting K8s job: ${{ input.job-name }}" kubectl delete -f "${{ inputs.job-config-file }}" diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 8ed17d9d6..20084befa 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -769,4 +769,3 @@ jobs: with: job-config-file: ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml" job-name: ${{ env.JOB_NAME }} - diff --git a/README.md b/README.md index 83053215e..f6778d128 100644 --- a/README.md +++ b/README.md @@ -10,12 +10,12 @@ We support and test the following JAX frameworks and model architectures. 
More d
| Framework | Models | Use cases | Container |
| :--- | :---: | :---: | :---: |
-| [maxtext](./rosetta/rosetta/projects/maxtext)| GPT, LLaMA, Gemma, Mistral, Mixtral | pretraining | `ghcr.io/nvidia/jax:maxtext` |
+| [maxtext](./rosetta/rosetta/projects/maxtext)| GPT, LLaMA, Gemma, Mistral, Mixtral | pre-training | `ghcr.io/nvidia/jax:maxtext` |
| [t5x](./rosetta/rosetta/projects/t5x) | T5, ViT | pre-training, fine-tuning | `ghcr.io/nvidia/jax:t5x` |
| [t5x](./rosetta/rosetta/projects/imagen) | Imagen | pre-training | `ghcr.io/nvidia/t5x:imagen-2023-10-02.v3` |
| [big vision](./rosetta/rosetta/projects/paligemma) | PaliGemma | fine-tuning, evaluation | `ghcr.io/nvidia/jax:gemma` |
-| levanter | GPT, LLaMA, MPT, Backpacks | pretraining, fine-tuning | `ghcr.io/nvidia/jax:levanter` |
-| axlearn | Fuji | pretraining | `gchr.io/nvidia/jax:axlearn` |
+| levanter | GPT, LLaMA, MPT, Backpacks | pre-training, fine-tuning | `ghcr.io/nvidia/jax:levanter` |
+| axlearn | Fuji | pre-training | `ghcr.io/nvidia/jax:axlearn` |

# Build Pipeline Status

@@ -269,7 +269,7 @@ We support and test the following JAX frameworks and model architectures. 
More d From 9de29cd1a487f60c8f0e904a04234c54de68b5f7 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 10 Mar 2025 10:37:14 +0100 Subject: [PATCH 2/7] test mounted s3 bucket --- .../actions/submit-delete-k8s-job/action.yml | 26 +- .github/container/symlnk-cudnn.sh | 2 +- .github/container/symlnk-nccl.sh | 2 +- .../axlearn/axlearn-job.yml | 31 +- .github/workflows/_ci.yaml | 976 +++++++++--------- README.md | 2 +- 6 files changed, 518 insertions(+), 521 deletions(-) diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index ffaa35575..6788ebe3b 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -34,22 +34,30 @@ runs: # Stream logs kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }} - # Check whether the job succeeded or failed + # Detect job parallelism + parallelism=$(kubectl get job/"${{ inputs.job-name }}" -o jsonpath='{.spec.parallelism}') + # if parallelism is not set, use default value of 1 + if [[ -z "${parallelism}" ]]; then + echo "No parallelism specified, defaulting to 1" + parallelism=1 + fi + + # Check whether the job succeeded or failed while readarray -d : -t status < <(kubectl get job/${{ inputs.job-name }} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do failures="${status[0]:-0}" successes="${status[1]:-0}" total=$((failures + successes)) - if [[ $total -lt 2 ]]; then - # neither "failed" nor "succeeded" is 2, so wait + if [[ $total -lt $parallelism ]]; then + # neither "failed" nor "succeeded", so wait sleep 1 - elif [[ $total -eq 2 ]]; then - # we have total=2 => either 2 successes or 2 failures - # (or 1 failed + 1 succeeded). - # In any case, the job is done – break. 
+ elif [[ $total -eq $parallelism ]]; then + # we have total=parallelism => either X successes or X failures + # In any case, the job is done break else - # Just in case we get an unexpected number + # Log here + echo "Unexpected number of completed pods ${total} with parallelism ${parallelism}" exit 255 fi done @@ -64,6 +72,6 @@ runs: fi exit 1 fi - post: | + post: | echo "Deleting K8s job: ${{ input.job-name }}" kubectl delete -f "${{ inputs.job-config-file }}" diff --git a/.github/container/symlnk-cudnn.sh b/.github/container/symlnk-cudnn.sh index 5db2c411f..824ab1e7d 100755 --- a/.github/container/symlnk-cudnn.sh +++ b/.github/container/symlnk-cudnn.sh @@ -10,7 +10,7 @@ CUDNN_MAJOR_VERSION=9 prefix=/opt/nvidia/cudnn if [[ -d "${prefix}" ]]; then echo "Skipping link farm creation" - exit 1 + exit 0 fi arch=$(uname -m)-linux-gnu diff --git a/.github/container/symlnk-nccl.sh b/.github/container/symlnk-nccl.sh index 33b4ebaa9..8e3d335e0 100755 --- a/.github/container/symlnk-nccl.sh +++ b/.github/container/symlnk-nccl.sh @@ -8,7 +8,7 @@ set -ex -o pipefail prefix=/opt/nvidia/nccl if [[ -d "${prefix}" ]]; then echo "Skipping link farm creation" - exit 1 + exit 0 fi arch=$(uname -m)-linux-gnu nccl_packages=$(dpkg -l 'libnccl*' | awk '/^ii/ {print $2}') diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index 8f70908da..1c0a57e4f 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -26,33 +26,22 @@ spec: sync wait - # after execution flag the results have been produced - touch /opt/output/done - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: output - mountPath: /opt/output - - name: upload - image: amazon/aws-cli + # copy results to the mounted s3 bucket + mkdir -p /opt/jax-toolbox-eks-output/axlearn/${RUN_ID} + cp /opt/output/summary.txt /opt/jax-toolbox-eks-output/axlearn/${RUN_ID}/summary.txt + # copy all the 
log files + cp /opt/output/*.log /opt/jax-toolbox-eks-output/axlearn/${RUN_ID}/. env: - name: RUN_ID value: PLACEHOLDER - command: - - sh - - -c - - | - while [ ! -f /opt/output/done ]; do - sleep 5 - done - # Upload to S3 bucket - aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/summary.txt - # Upload logs to S3 bucket - aws s3 cp /opt/output/ s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/ --recursive --exclude "*" --include "*.log" + resources: + limits: + nvidia.com/gpu: 8 volumeMounts: - name: output mountPath: /opt/output + - name: s3-storage + mountPath: /opt/jax-toolbox-eks-output imagePullSecrets: - name: PLACEHOLDER volumes: diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 20084befa..d2ea0b7c5 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -66,115 +66,115 @@ jobs: URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} secrets: inherit - build-triton: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-triton-build - BADGE_FILENAME: badge-triton-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: triton - DOCKERFILE: .github/container/Dockerfile.triton - RUNNER_SIZE: large - EXTRA_BUILD_ARGS: | - URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} - secrets: inherit + # build-triton: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-triton-build + # BADGE_FILENAME: badge-triton-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: triton + # 
DOCKERFILE: .github/container/Dockerfile.triton + # RUNNER_SIZE: large + # EXTRA_BUILD_ARGS: | + # URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} + # secrets: inherit - build-equinox: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-equinox-build - BADGE_FILENAME: badge-equinox-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: equinox - DOCKERFILE: .github/container/Dockerfile.equinox - EXTRA_BUILD_ARGS: | - URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} - secrets: inherit + # build-equinox: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-equinox-build + # BADGE_FILENAME: badge-equinox-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: equinox + # DOCKERFILE: .github/container/Dockerfile.equinox + # EXTRA_BUILD_ARGS: | + # URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} + # secrets: inherit - build-maxtext: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-maxtext-build - BADGE_FILENAME: badge-maxtext-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: maxtext - DOCKERFILE: .github/container/Dockerfile.maxtext - EXTRA_BUILD_ARGS: | - URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - secrets: inherit + # build-maxtext: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-maxtext-build + # BADGE_FILENAME: badge-maxtext-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ 
needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: maxtext + # DOCKERFILE: .github/container/Dockerfile.maxtext + # EXTRA_BUILD_ARGS: | + # URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + # secrets: inherit - build-levanter: - needs: [build-jax] - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-levanter-build" - BADGE_FILENAME: "badge-levanter-build" - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: levanter - DOCKERFILE: .github/container/Dockerfile.levanter - EXTRA_BUILD_ARGS: | - URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} - URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} - secrets: inherit + # build-levanter: + # needs: [build-jax] + # uses: ./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: "artifact-levanter-build" + # BADGE_FILENAME: "badge-levanter-build" + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: levanter + # DOCKERFILE: .github/container/Dockerfile.levanter + # EXTRA_BUILD_ARGS: | + # URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} + # URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} + # secrets: inherit - build-upstream-t5x: - needs: build-jax - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-t5x-build" - BADGE_FILENAME: "badge-t5x-build" - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: upstream-t5x - DOCKERFILE: .github/container/Dockerfile.t5x - EXTRA_BUILD_ARGS: | - URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} - URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} - secrets: inherit + # build-upstream-t5x: + # needs: build-jax + # uses: 
./.github/workflows/_build.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: "artifact-t5x-build" + # BADGE_FILENAME: "badge-t5x-build" + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: upstream-t5x + # DOCKERFILE: .github/container/Dockerfile.t5x + # EXTRA_BUILD_ARGS: | + # URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} + # URLREF_AIRIO=${{ fromJson(inputs.SOURCE_URLREFS).AIRIO }} + # secrets: inherit - build-rosetta-t5x: - needs: build-upstream-t5x - uses: ./.github/workflows/_build_rosetta.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} - BASE_LIBRARY: t5x - secrets: inherit + # build-rosetta-t5x: + # needs: build-upstream-t5x + # uses: ./.github/workflows/_build_rosetta.yaml + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} + # BASE_LIBRARY: t5x + # secrets: inherit - build-gemma: - needs: build-jax - uses: ./.github/workflows/_build.yaml - if: inputs.ARCHITECTURE == 'amd64' # build only amd64 - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-gemma-build - BADGE_FILENAME: badge-gemma-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: gemma - DOCKERFILE: rosetta/Dockerfile.gemma - DOCKER_CONTEXT: . 
- EXTRA_BUILD_ARGS: | - URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} - URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} - URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} - URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} - URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} - secrets: inherit + # build-gemma: + # needs: build-jax + # uses: ./.github/workflows/_build.yaml + # if: inputs.ARCHITECTURE == 'amd64' # build only amd64 + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-gemma-build + # BADGE_FILENAME: badge-gemma-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: gemma + # DOCKERFILE: rosetta/Dockerfile.gemma + # DOCKER_CONTEXT: . + # EXTRA_BUILD_ARGS: | + # URLREF_GEMMA=${{ fromJson(inputs.SOURCE_URLREFS).GEMMA }} + # URLREF_BIG_VISION=${{ fromJson(inputs.SOURCE_URLREFS).BIG_VISION }} + # URLREF_COMMON_LOOP_UTILS=${{ fromJson(inputs.SOURCE_URLREFS).COMMON_LOOP_UTILS }} + # URLREF_FLAXFORMER=${{ fromJson(inputs.SOURCE_URLREFS).FLAXFORMER }} + # URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} + # secrets: inherit build-axlearn: needs: build-jax @@ -196,13 +196,13 @@ jobs: needs: - build-base - build-jax - - build-triton - - build-equinox - - build-maxtext - - build-levanter - - build-upstream-t5x - - build-rosetta-t5x - - build-gemma + # - build-triton + # - build-equinox + # - build-maxtext + # - build-levanter + # - build-upstream-t5x + # - build-rosetta-t5x + # - build-gemma - build-axlearn outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} @@ -214,22 +214,22 @@ jobs: [\ {"flavor": "base", "stage": "final", "priority": 800, "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\ {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "triton", 
"stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ + # {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "triton", "stage": 
"mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ + # {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\ {}\ @@ -239,404 +239,404 @@ jobs: echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT - test-distribution: - 
runs-on: ubuntu-22.04 - strategy: - matrix: - TEST_SCRIPT: - - extra-only-distribution.sh - - mirror-only-distribution.sh - - upstream-only-distribution.sh - - local-patch-distribution.sh - fail-fast: false - steps: - - name: Print environment variables - run: env - - name: Set git login for tests - run: | - git config --global user.email "jax@nvidia.com" - git config --global user.name "JAX-Toolbox CI" - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v4 - - name: Run integration test ${{ matrix.TEST_SCRIPT }} - run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} - - test-jax: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: jax - EXECUTE: | - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-backend-independent.log - test-jax.sh -b backend-independent - EOF - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee tee test-gpu.log - nvidia-cuda-mps-control -d - test-jax.sh -b gpu - EOF - STATISTICS_SCRIPT: | - errors=$(cat test-*.log | grep -c 'ERROR:' || true) - failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) - passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-backend-independent.log - test-gpu.log - secrets: inherit - - test-nsys-jax: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: nsys-jax - EXECUTE: | - set -o pipefail - num_tests=0 - num_failures=0 - # Run the pytest-driven tests; failure is explicitly handled below so 
set +e to - # avoid an early abort here. - set +e - docker run -i --shm-size=1g --gpus all \ - -v $PWD:/opt/output \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-nsys-jax.log - # nsys-jax is already installed, this is just adding the test dependencies - pip install pytest-reportlog nsys-jax[test] - # abuse knowledge that nsys-jax is installed editable, so the tests exist - test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') - pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" - EOF - set -e - GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') - for mode in 1-process 2-process process-per-gpu; do - DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" - if [[ "${mode}" == "1-process" ]]; then - PROCESS_COUNT=1 - ARGS="" - elif [[ "${mode}" == "2-process" ]]; then - # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that - # this will flush out more bugs than process-per-node or process-per-GPU. 
- PROCESS_COUNT=2 - ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" - else - PROCESS_COUNT=${GPUS_PER_NODE} - ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" - fi - for collection in full partial; do - NSYS_JAX="nsys-jax" - if [[ "${mode}" == "1-process" ]]; then - # We will not run nsys-jax-combine, so run analyses eagerly - NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" - fi - NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" - if [[ "${collection}" == "partial" ]]; then - NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" - # nvbug/4801401 - NSYS_JAX+=" --sample=none" - fi - set +e - ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ - -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log - num_failures=$((num_failures + ($? 
!= 0))) - set -e - num_tests=$((num_tests + 1)) - done - if [[ "${mode}" != "1-process" ]]; then - # Run nsys-jax-combine - NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" - for (( i=0; i> $GITHUB_ENV - echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV - exit $num_failures - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-nsys-jax.log) - num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) - num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) - num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT - ARTIFACTS: | - # pytest-driven part - test-nsys-jax.log - pytest-report.jsonl - # nsys-jax logfiles - *process-*-execution.log - # nsys-jax output for the case that doesn't use nsys-jax-combine - 1-process-*-execution-0.zip - # nsys-jax-combine output/logfiles - *process*-*-execution.zip - *-execution-combine.log - secrets: inherit - - #test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test - #runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does - #not already have nsys-jax installed - test-nsys-jax-archive: - needs: test-nsys-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - strategy: - 
matrix: - os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] - runs-on: ${{ matrix.os }} - steps: - - name: Download nsys-jax output .zip files - uses: actions/download-artifact@v4 - with: - name: nsys-jax-unit-test-A100 - - name: Extract archives and execute install scripts - run: | - pip install virtualenv # for install.sh - for zip in $(ls *.zip); do - ZIP="${PWD}/${zip}" - pushd $(mktemp -d) - unzip "${ZIP}" - ls -l - # TODO: verify this isn't needed, or make sure it isn't needed - chmod 755 install.sh - # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout - # Skip executing Jupyter lab - NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh - popd - done - - test-nsys-jax-eks: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - runs-on: eks - env: - JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: ${{ github.run_id }}-nsys-jax - POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess - steps: - - name: Check out the repository - uses: actions/checkout@v4 - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: K8s GHCR store and delete token - id: store-token - uses: ./.github/actions/store-delete-k8s-ghcr - - name: Configure Kubernetes job - run: | - yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) - | select(di == 1).metadata.name = strenv(JOB_NAME) - | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" - | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) - | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ - .github/eks-workflow-files/job.yml - git diff .github/eks-workflow-files/job.yml - - name: Submit Kubernetes job - uses: ./.github/actions/submit-delete-k8s-job - with: - 
job-config-file: .github/eks-workflow-files/job.yml - job-name: ${{ env.JOB_NAME }} - - name: Configure post-processing job - run: | - export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" - yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) - | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) - | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" - | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ - .github/eks-workflow-files/post-process-job.yml - git diff .github/eks-workflow-files/post-process-job.yml - - name: Submit post process Kubernetes job - uses: ./.github/actions/submit-delete-k8s-job - with: - job-config-file: .github/eks-workflow-files/post-process-job.yml - job-name: ${{ env.POSTPROCESS_JOB_NAME }} - - # test-equinox: - # needs: build-equinox + # test-distribution: + # runs-on: ubuntu-22.04 + # strategy: + # matrix: + # TEST_SCRIPT: + # - extra-only-distribution.sh + # - mirror-only-distribution.sh + # - upstream-only-distribution.sh + # - local-patch-distribution.sh + # fail-fast: false + # steps: + # - name: Print environment variables + # run: env + # - name: Set git login for tests + # run: | + # git config --global user.email "jax@nvidia.com" + # git config --global user.name "JAX-Toolbox CI" + # - name: Check out the repository under ${GITHUB_WORKSPACE} + # uses: actions/checkout@v4 + # - name: Run integration test ${{ matrix.TEST_SCRIPT }} + # run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} + + # test-jax: + # needs: build-jax # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a # uses: ./.github/workflows/_test_unit.yaml # with: - # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} - # TEST_NAME: equinox + # TEST_NAME: jax # EXECUTE: | - # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ - # bash -exc -o pipefail \ - # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-backend-independent.log + # test-jax.sh -b backend-independent + # EOF + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee tee test-gpu.log + # nvidia-cuda-mps-control -d + # test-jax.sh -b gpu + # EOF # STATISTICS_SCRIPT: | - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # errors=$(cat test-*.log | grep -c 'ERROR:' || true) + # failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) + # passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) # total_tests=$((failed_tests + passed_tests)) # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT # ARTIFACTS: | - # test-equinox.log + # test-backend-independent.log + # test-gpu.log # secrets: inherit - # test-te-multigpu: - # needs: build-upstream-pax + # test-nsys-jax: + # needs: build-jax # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - # uses: ./.github/workflows/_test_te.yaml + # uses: ./.github/workflows/_test_unit.yaml # with: - # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + # TEST_NAME: nsys-jax + # EXECUTE: | + # set -o pipefail + # num_tests=0 + # num_failures=0 + # # Run the pytest-driven tests; failure is explicitly handled below so set +e to + # # avoid an early abort here. 
+ # set +e + # docker run -i --shm-size=1g --gpus all \ + # -v $PWD:/opt/output \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-nsys-jax.log + # # nsys-jax is already installed, this is just adding the test dependencies + # pip install pytest-reportlog nsys-jax[test] + # # abuse knowledge that nsys-jax is installed editable, so the tests exist + # test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') + # pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}" + # EOF + # set -e + # GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU') + # for mode in 1-process 2-process process-per-gpu; do + # DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}" + # if [[ "${mode}" == "1-process" ]]; then + # PROCESS_COUNT=1 + # ARGS="" + # elif [[ "${mode}" == "2-process" ]]; then + # # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that + # # this will flush out more bugs than process-per-node or process-per-GPU. 
+ # PROCESS_COUNT=2 + # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed" + # else + # PROCESS_COUNT=${GPUS_PER_NODE} + # ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed" + # fi + # for collection in full partial; do + # NSYS_JAX="nsys-jax" + # if [[ "${mode}" == "1-process" ]]; then + # # We will not run nsys-jax-combine, so run analyses eagerly + # NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary" + # fi + # NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}" + # if [[ "${collection}" == "partial" ]]; then + # NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop" + # # nvbug/4801401 + # NSYS_JAX+=" --sample=none" + # fi + # set +e + # ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \ + # -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log + # num_failures=$((num_failures + ($? 
!= 0))) + # set -e + # num_tests=$((num_tests + 1)) + # done + # if [[ "${mode}" != "1-process" ]]; then + # # Run nsys-jax-combine + # NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip" + # for (( i=0; i> $GITHUB_ENV + # echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV + # exit $num_failures + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-nsys-jax.log) + # num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests )) + # num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT )) + # num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT )) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # # pytest-driven part + # test-nsys-jax.log + # pytest-report.jsonl + # # nsys-jax logfiles + # *process-*-execution.log + # # nsys-jax output for the case that doesn't use nsys-jax-combine + # 1-process-*-execution-0.zip + # # nsys-jax-combine output/logfiles + # *process*-*-execution.zip + # *-execution-combine.log # secrets: inherit - test-upstream-t5x: - needs: build-upstream-t5x - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_upstream_t5x.yaml - with: - T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + #test-nsys-jax generates several fresh .zip archive outputs by running 
nsys-jax with real GPU hardware; this test + #runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does + #not already have nsys-jax installed + # test-nsys-jax-archive: + # needs: test-nsys-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # strategy: + # matrix: + # os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] + # runs-on: ${{ matrix.os }} + # steps: + # - name: Download nsys-jax output .zip files + # uses: actions/download-artifact@v4 + # with: + # name: nsys-jax-unit-test-A100 + # - name: Extract archives and execute install scripts + # run: | + # pip install virtualenv # for install.sh + # for zip in $(ls *.zip); do + # ZIP="${PWD}/${zip}" + # pushd $(mktemp -d) + # unzip "${ZIP}" + # ls -l + # # TODO: verify this isn't needed, or make sure it isn't needed + # chmod 755 install.sh + # # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout + # # Skip executing Jupyter lab + # NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh + # popd + # done + + # test-nsys-jax-eks: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # runs-on: eks + # env: + # JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} + # JOB_NAME: ${{ github.run_id }}-nsys-jax + # POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess + # steps: + # - name: Check out the repository + # uses: actions/checkout@v4 + # - name: Login to GitHub Container Registry + # uses: docker/login-action@v3 + # with: + # registry: ghcr.io + # username: ${{ github.repository_owner }} + # password: ${{ secrets.GITHUB_TOKEN }} + # - name: K8s GHCR store and delete token + # id: store-token + # uses: ./.github/actions/store-delete-k8s-ghcr + # - name: Configure Kubernetes job + # run: | + # yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) + # | select(di == 1).metadata.name = strenv(JOB_NAME) + # | select(di == 
1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" + # | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) + # | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ + # .github/eks-workflow-files/job.yml + # git diff .github/eks-workflow-files/job.yml + # - name: Submit Kubernetes job + # uses: ./.github/actions/submit-delete-k8s-job + # with: + # job-config-file: .github/eks-workflow-files/job.yml + # job-name: ${{ env.JOB_NAME }} + # - name: Configure post-processing job + # run: | + # export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" + # yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) + # | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) + # | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" + # | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ + # .github/eks-workflow-files/post-process-job.yml + # git diff .github/eks-workflow-files/post-process-job.yml + # - name: Submit post process Kubernetes job + # uses: ./.github/actions/submit-delete-k8s-job + # with: + # job-config-file: .github/eks-workflow-files/post-process-job.yml + # job-name: ${{ env.POSTPROCESS_JOB_NAME }} + + # # test-equinox: + # # needs: build-equinox + # # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # # uses: ./.github/workflows/_test_unit.yaml + # # with: + # # IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} + # # TEST_NAME: equinox + # # EXECUTE: | + # # docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \ + # # bash -exc -o pipefail \ + # # 'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' 
| tee test-equinox.log + # # STATISTICS_SCRIPT: | + # # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # # total_tests=$((failed_tests + passed_tests)) + # # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # # ARTIFACTS: | + # # test-equinox.log + # # secrets: inherit + + # # test-te-multigpu: + # # needs: build-upstream-pax + # # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # # uses: ./.github/workflows/_test_te.yaml + # # with: + # # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + # # secrets: inherit + + # test-upstream-t5x: + # needs: build-upstream-t5x + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_upstream_t5x.yaml + # with: + # T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-rosetta-t5x: - needs: build-rosetta-t5x - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_t5x_rosetta.yaml - with: - T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-rosetta-t5x: + # needs: build-rosetta-t5x + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_t5x_rosetta.yaml + # with: + # T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-triton: - needs: build-triton - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: triton - EXECUTE: | - docker run -i --shm-size=1g --gpus 
all --volume $PWD:/output \ - ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-triton.log - # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on - # actually having a CUDA backend for pytoch - pip install --no-deps torch - python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml - EOF - STATISTICS_SCRIPT: | - curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; - total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) - errors=$(./yq '.testsuites."+@errors"' triton_test.xml) - failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) - passed_tests=$((total_tests - errors - failed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-triton.log - secrets: inherit + # test-triton: + # needs: build-triton + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: triton + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ + # ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-triton.log + # # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on + # # actually having a CUDA backend for pytoch + # pip install --no-deps torch + # python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml + # EOF + # STATISTICS_SCRIPT: | + # curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; + # total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) + # errors=$(./yq '.testsuites."+@errors"' triton_test.xml) + 
# failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) + # passed_tests=$((total_tests - errors - failed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-triton.log + # secrets: inherit - test-levanter: - needs: build-levanter - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: levanter - EXECUTE: | - docker run -i --gpus all --shm-size=1g \ - ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-levanter.log - pip install flake8 pytest soundfile librosa - PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" - EOF - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-levanter.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-levanter.log - secrets: inherit - - # test-te: - # needs: build-upstream-pax + # test-levanter: + # needs: build-levanter # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a # uses: ./.github/workflows/_test_unit.yaml # with: - # TEST_NAME: te + # TEST_NAME: levanter # EXECUTE: | - # docker run -i --gpus all --shm-size=1g -v $PWD:/log \ - # ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-te.log - # 
pip install pytest-reportlog - # pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax + # docker run -i --gpus all --shm-size=1g \ + # ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-levanter.log + # pip install flake8 pytest soundfile librosa + # PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" # EOF # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-te.log) + # summary_line=$(tail -n1 test-levanter.log) # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') # total_tests=$((failed_tests + passed_tests)) # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # TIMEOUT_MINUTES: 120 # ARTIFACTS: | - # test-te.log - # pytest-report.jsonl + # test-levanter.log + # secrets: inherit + + # # test-te: + # # needs: build-upstream-pax + # # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # # uses: ./.github/workflows/_test_unit.yaml + # # with: + # # TEST_NAME: te + # # EXECUTE: | + # # docker run -i --gpus all --shm-size=1g -v $PWD:/log \ + # # ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ + # # bash <<"EOF" |& tee test-te.log + # # pip install pytest-reportlog + # # pytest 
--report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax + # # EOF + # # STATISTICS_SCRIPT: | + # # summary_line=$(tail -n1 test-te.log) + # # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + # # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # # total_tests=$((failed_tests + passed_tests)) + # # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # # TIMEOUT_MINUTES: 120 + # # ARTIFACTS: | + # # test-te.log + # # pytest-report.jsonl + # # secrets: inherit + + # test-gemma: + # needs: build-gemma + # uses: ./.github/workflows/_test_unit.yaml + # if: inputs.ARCHITECTURE == 'amd64' + # with: + # TEST_NAME: gemma + # EXECUTE: | + # docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ + # bash -ec \ + # "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-gemma.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-gemma.log # secrets: inherit - test-gemma: - needs: build-gemma - uses: ./.github/workflows/_test_unit.yaml - if: inputs.ARCHITECTURE == 'amd64' - with: - TEST_NAME: gemma - EXECUTE: | - docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \ - bash -ec \ - "cd /opt/gemma && pip install -e .[dev] && pytest ." 
| tee test-gemma.log - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-gemma.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-gemma.log - secrets: inherit - - test-maxtext: - needs: build-maxtext - if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners - uses: ./.github/workflows/_test_maxtext.yaml - with: - MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-maxtext: + # needs: build-maxtext + # if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners + # uses: ./.github/workflows/_test_maxtext.yaml + # with: + # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit test-axlearn-eks: needs: build-axlearn @@ -663,7 +663,7 @@ jobs: yq -i ea ' select(di == 0).metadata.name = strenv(JOB_NAME) | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) - | select(di == 0).spec.template.spec.containers[1].env[0].value = "${{ github.run_id }}" + | select(di == 0).spec.template.spec.containers[0].env[0].value = "${{ github.run_id }}" | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ .github/eks-workflow-files/axlearn/axlearn-job.yml git diff .github/eks-workflow-files/axlearn/axlearn-job.yml diff --git a/README.md b/README.md index f6778d128..bca12d6e5 100644 --- a/README.md +++ b/README.md @@ -269,7 +269,7 @@ We support and test the following JAX frameworks and 
model architectures. More d From 6c6cd345f3075d73d0cbc6ea318504341b1f9f1d Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 10 Mar 2025 11:50:57 +0100 Subject: [PATCH 3/7] Fix action --- .github/actions/submit-delete-k8s-job/action.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index 6788ebe3b..ffff89751 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -14,7 +14,6 @@ runs: steps: - name: Submit and Delete Kubernetes job uses: ./.github/actions/with-post-step - shell: bash -eo pipefail with: main: | echo "Submit K8s job ${{ inputs.job-config-file }}" @@ -73,5 +72,5 @@ runs: exit 1 fi post: | - echo "Deleting K8s job: ${{ input.job-name }}" + echo "Deleting K8s job: ${{ inputs.job-name }}" kubectl delete -f "${{ inputs.job-config-file }}" From aa63ac6e06a8540055f6ba522a9dedc12a1e8ce3 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 10 Mar 2025 12:36:28 +0100 Subject: [PATCH 4/7] fix the bash shell and remember to mount the volume --- .github/actions/submit-delete-k8s-job/action.yml | 1 + .github/eks-workflow-files/axlearn/axlearn-job.yml | 3 +++ 2 files changed, 4 insertions(+) diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index ffff89751..44b630ec4 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -14,6 +14,7 @@ runs: steps: - name: Submit and Delete Kubernetes job uses: ./.github/actions/with-post-step + shell: bash with: main: | echo "Submit K8s job ${{ inputs.job-config-file }}" diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index 1c0a57e4f..5c65975a6 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ 
b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -47,3 +47,6 @@ spec: volumes: - name: output emptyDir: {} + - name: s3-storage + persistentVolumeClaim: + claimName: s3-pvc From 0ae1b83f63f4dbb1cacf88a415f55e17250c70b5 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 10 Mar 2025 14:06:42 +0100 Subject: [PATCH 5/7] start working on the shell of the action --- .github/actions/submit-delete-k8s-job/action.yml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index 44b630ec4..fd76be0d8 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -14,7 +14,6 @@ runs: steps: - name: Submit and Delete Kubernetes job uses: ./.github/actions/with-post-step - shell: bash with: main: | echo "Submit K8s job ${{ inputs.job-config-file }}" @@ -37,7 +36,7 @@ runs: # Detect job parallelism parallelism=$(kubectl get job/"${{ inputs.job-name }}" -o jsonpath='{.spec.parallelism}') # if parallelism is not set, use default value of 1 - if [[ -z "${parallelism}" ]]; then + if [ -z "${parallelism}" ]; then echo "No parallelism specified, defaulting to 1" parallelism=1 fi @@ -48,10 +47,10 @@ runs: successes="${status[1]:-0}" total=$((failures + successes)) - if [[ $total -lt $parallelism ]]; then + if [ $total -lt $parallelism ]; then # neither "failed" nor "succeeded", so wait sleep 1 - elif [[ $total -eq $parallelism ]]; then + elif [ $total -eq $parallelism ]; then # we have total=parallelism => either X successes or X failures # In any case, the job is done break @@ -63,11 +62,11 @@ runs: done # If job indicates a failure try to print out the info - if [[ $failures -gt 0 ]]; then + if [ $failures -gt 0 ]; then echo "Job ${{ inputs.job-name }} has $failures failures" # this is for batch jobs only pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} -o name) - 
if [[ -n "${pods}" ]]; then + if [ -n "${pods}" ]; then kubectl describe ${pods} fi exit 1 From 8f65cd4a2705c6d21f5637e5a6e87027b88ce2a5 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 10 Mar 2025 15:42:16 +0100 Subject: [PATCH 6/7] try to fix using posix-sh-compatible --- .github/actions/submit-delete-k8s-job/action.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml index fd76be0d8..c98da4b98 100644 --- a/.github/actions/submit-delete-k8s-job/action.yml +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -42,7 +42,7 @@ runs: fi # Check whether the job succeeded or failed - while readarray -d : -t status < <(kubectl get job/${{ inputs.job-name }} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do + while IFS=: read -r failures successes; do failures="${status[0]:-0}" successes="${status[1]:-0}" total=$((failures + successes)) @@ -59,7 +59,9 @@ runs: echo "Unexpected number of completed pods ${total} with parallelism ${parallelism}" exit 255 fi - done + done < Date: Mon, 10 Mar 2025 16:38:22 +0100 Subject: [PATCH 7/7] test on name of the volume and location --- .github/eks-workflow-files/axlearn/axlearn-job.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index 5c65975a6..090ad19f3 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -27,10 +27,10 @@ spec: sync wait # copy results to the mounted s3 bucket - mkdir -p /opt/jax-toolbox-eks-output/axlearn/${RUN_ID} - cp /opt/output/summary.txt /opt/jax-toolbox-eks-output/axlearn/${RUN_ID}/summary.txt + mkdir -p /jax-toolbox-eks-output/axlearn/${RUN_ID} + cp /opt/output/summary.txt /jax-toolbox-eks-output/axlearn/${RUN_ID}/summary.txt # copy all the log files - cp /opt/output/*.log 
/opt/jax-toolbox-eks-output/axlearn/${RUN_ID}/. + cp /opt/output/*.log /jax-toolbox-eks-output/axlearn/${RUN_ID}/. env: - name: RUN_ID value: PLACEHOLDER @@ -41,7 +41,7 @@ spec: - name: output mountPath: /opt/output - name: s3-storage - mountPath: /opt/jax-toolbox-eks-output + mountPath: /jax-toolbox-eks-output imagePullSecrets: - name: PLACEHOLDER volumes:
- +
- +