diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml
index dbeabe668..c98da4b98 100644
--- a/.github/actions/submit-delete-k8s-job/action.yml
+++ b/.github/actions/submit-delete-k8s-job/action.yml
@@ -16,7 +16,7 @@ runs:
     uses: ./.github/actions/with-post-step
     with:
       main: |
-        echo "Submit K8s job"
+        echo "Submit K8s job ${{ inputs.job-config-file }}"
         kubectl apply -f "${{ inputs.job-config-file }}"

         # Wait for job to be created
@@ -32,6 +32,47 @@ runs:
         # Stream logs
         kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }}
-
-      post: |
+
+        # Detect job parallelism
+        parallelism=$(kubectl get job/"${{ inputs.job-name }}" -o jsonpath='{.spec.parallelism}')
+        # if parallelism is not set, use default value of 1
+        if [ -z "${parallelism}" ]; then
+          echo "No parallelism specified, defaulting to 1"
+          parallelism=1
+        fi
+
+        # Check whether the job succeeded or failed
+        while IFS=: read -r failures successes; do
+          failures="${failures:-0}"
+          successes="${successes:-0}"
+          total=$((failures + successes))
+
+          if [ $total -lt $parallelism ]; then
+            # neither "failed" nor "succeeded", so wait
+            sleep 1
+          elif [ $total -eq $parallelism ]; then
+            # we have total=parallelism => either X successes or X failures
+            # In any case, the job is done
+            break
+          else
+            # Log here
+            echo "Unexpected number of completed pods ${total} with parallelism ${parallelism}"
+            exit 255
+          fi
+        done <> $GITHUB_OUTPUT
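
The completion check added above hinges on the `IFS=: read` split and the `${var:-0}` defaults: kubectl's jsonpath prints an empty string for a missing `.status.failed` or `.status.succeeded`, so both fields must fall back to 0. The loop logic can be exercised locally with no cluster at all; in this sketch a here-document stands in for the kubectl status stream (whose exact command is elided above), and the sample input lines are invented for illustration:

    # Hypothetical harness for the loop above; the here-document replaces the
    # kubectl-driven "failures:successes" feed, so no cluster is needed.
    parallelism=2
    while IFS=: read -r failures successes; do
      # Empty fields (job status not yet set) default to 0, as in the action.
      failures="${failures:-0}"
      successes="${successes:-0}"
      total=$((failures + successes))
      if [ "$total" -lt "$parallelism" ]; then
        echo "waiting: ${total}/${parallelism} pods finished"  # the action sleeps here
      elif [ "$total" -eq "$parallelism" ]; then
        echo "done: ${failures} failed, ${successes} succeeded"
        break
      else
        echo "Unexpected number of completed pods ${total} with parallelism ${parallelism}"
        exit 255
      fi
    done <<'EOF'
    :
    1:
    1:1
    EOF

Fed those three lines, the sketch prints two "waiting" messages and then "done: 1 failed, 1 succeeded".
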
-  test-distribution:
-    runs-on: ubuntu-22.04
-    strategy:
-      matrix:
-        TEST_SCRIPT:
-          - extra-only-distribution.sh
-          - mirror-only-distribution.sh
-          - upstream-only-distribution.sh
-          - local-patch-distribution.sh
-      fail-fast: false
-    steps:
-      - name: Print environment variables
-        run: env
-      - name: Set git login for tests
-        run: |
-          git config --global user.email "jax@nvidia.com"
-          git config --global user.name "JAX-Toolbox CI"
-      - name: Check out the repository under ${GITHUB_WORKSPACE}
-        uses: actions/checkout@v4
-      - name: Run integration test ${{ matrix.TEST_SCRIPT }}
-        run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }}
-
-  test-jax:
-    needs: build-jax
-    if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
-    uses: ./.github/workflows/_test_unit.yaml
-    with:
-      TEST_NAME: jax
-      EXECUTE: |
-        docker run -i --shm-size=1g --gpus all \
-          ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
-          bash <<"EOF" |& tee test-backend-independent.log
-            test-jax.sh -b backend-independent
-        EOF
-        docker run -i --shm-size=1g --gpus all \
-          ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
-          bash <<"EOF" |& tee tee test-gpu.log
-            nvidia-cuda-mps-control -d
-            test-jax.sh -b gpu
-        EOF
-      STATISTICS_SCRIPT: |
-        errors=$(cat test-*.log | grep -c 'ERROR:' || true)
-        failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true)
-        passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true)
-        total_tests=$((failed_tests + passed_tests))
-        echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
-        echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
-        echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
-        echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
-      ARTIFACTS: |
-        test-backend-independent.log
-        test-gpu.log
-    secrets: inherit
-
-  test-nsys-jax:
-    needs: build-jax
-    if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
-    uses: ./.github/workflows/_test_unit.yaml
-    with:
-      TEST_NAME: nsys-jax
-      EXECUTE: |
-        set -o pipefail
-        num_tests=0
-        num_failures=0
-        # Run the pytest-driven tests; failure is explicitly handled below so set +e to
-        # avoid an early abort here.
-        set +e
-        docker run -i --shm-size=1g --gpus all \
-          -v $PWD:/opt/output \
-          ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
-          bash <<"EOF" |& tee test-nsys-jax.log
-            # nsys-jax is already installed, this is just adding the test dependencies
-            pip install pytest-reportlog nsys-jax[test]
-            # abuse knowledge that nsys-jax is installed editable, so the tests exist
-            test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())')
-            pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}"
-        EOF
-        set -e
-        GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU')
-        for mode in 1-process 2-process process-per-gpu; do
-          DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"
-          if [[ "${mode}" == "1-process" ]]; then
-            PROCESS_COUNT=1
-            ARGS=""
-          elif [[ "${mode}" == "2-process" ]]; then
-            # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that
-            # this will flush out more bugs than process-per-node or process-per-GPU.
-            PROCESS_COUNT=2
-            ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed"
-          else
-            PROCESS_COUNT=${GPUS_PER_NODE}
-            ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed"
-          fi
-          for collection in full partial; do
-            NSYS_JAX="nsys-jax"
-            if [[ "${mode}" == "1-process" ]]; then
-              # We will not run nsys-jax-combine, so run analyses eagerly
-              NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary"
-            fi
-            NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}"
-            if [[ "${collection}" == "partial" ]]; then
-              NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop"
-              # nvbug/4801401
-              NSYS_JAX+=" --sample=none"
-            fi
-            set +e
-            ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \
-              -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log
-            num_failures=$((num_failures + ($? != 0)))
-            set -e
-            num_tests=$((num_tests + 1))
-          done
-          if [[ "${mode}" != "1-process" ]]; then
-            # Run nsys-jax-combine
-            NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip"
-            for (( i=0; i> $GITHUB_ENV
-        echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV
-        exit $num_failures
-      STATISTICS_SCRIPT: |
-        summary_line=$(tail -n1 test-nsys-jax.log)
-        num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
-        passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
-        failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
-        total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests ))
-        num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT ))
-        num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT ))
-        echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
-        echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT
-        echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT
-        echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT
-      ARTIFACTS: |
-        # pytest-driven part
-        test-nsys-jax.log
-        pytest-report.jsonl
-        # nsys-jax logfiles
-        *process-*-execution.log
-        # nsys-jax output for the case that doesn't use nsys-jax-combine
-        1-process-*-execution-0.zip
-        # nsys-jax-combine output/logfiles
-        *process*-*-execution.zip
-        *-execution-combine.log
-    secrets: inherit
-
-  #test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test
-  #runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does
-  #not already have nsys-jax installed
-  test-nsys-jax-archive:
-    needs: test-nsys-jax
-    if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
-    strategy:
-      matrix:
-        os: [ubuntu-22.04, ubuntu-24.04, macOS-latest]
-    runs-on: ${{ matrix.os }}
-    steps:
-      - name: Download nsys-jax output .zip files
-        uses: actions/download-artifact@v4
-        with:
-          name: nsys-jax-unit-test-A100
-      - name: Extract archives and execute install scripts
-        run: |
-          pip install virtualenv # for install.sh
-          for zip in $(ls *.zip); do
-            ZIP="${PWD}/${zip}"
-            pushd $(mktemp -d)
-            unzip "${ZIP}"
-            ls -l
-            # TODO: verify this isn't needed, or make sure it isn't needed
-            chmod 755 install.sh
-            # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout
-            # Skip executing Jupyter lab
-            NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh
-            popd
-          done
-
-  test-nsys-jax-eks:
-    needs: build-jax
-    if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
-    runs-on: eks
-    env:
-      JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
-      JOB_NAME: ${{ github.run_id }}-nsys-jax
-      POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess
-    steps:
-      - name: Check out the repository
-        uses: actions/checkout@v4
-      - name: Login to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-      - name: K8s GHCR store and delete token
-        id: store-token
-        uses: ./.github/actions/store-delete-k8s-ghcr
-      - name: Configure Kubernetes job
-        run: |
-          yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
-            | select(di == 1).metadata.name = strenv(JOB_NAME)
-            | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
-            | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
-            | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \
-            .github/eks-workflow-files/job.yml
-          git diff .github/eks-workflow-files/job.yml
-      - name: Submit Kubernetes job
-        uses: ./.github/actions/submit-delete-k8s-job
-        with:
-          job-config-file: .github/eks-workflow-files/job.yml
-          job-name: ${{ env.JOB_NAME }}
-      - name: Configure post-processing job
-        run: |
-          export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip"
-          yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME)
-            | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE)
-            | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
-            | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \
-            .github/eks-workflow-files/post-process-job.yml
-          git diff .github/eks-workflow-files/post-process-job.yml
-      - name: Submit post process Kubernetes job
-        uses: ./.github/actions/submit-delete-k8s-job
-        with:
-          job-config-file: .github/eks-workflow-files/post-process-job.yml
-          job-name: ${{ env.POSTPROCESS_JOB_NAME }}
-
-  # test-equinox:
-  #   needs: build-equinox
+  # test-distribution:
+  #   runs-on: ubuntu-22.04
+  #   strategy:
+  #     matrix:
+  #       TEST_SCRIPT:
+  #         - extra-only-distribution.sh
+  #         - mirror-only-distribution.sh
+  #         - upstream-only-distribution.sh
+  #         - local-patch-distribution.sh
+  #     fail-fast: false
+  #   steps:
+  #     - name: Print environment variables
+  #       run: env
+  #     - name: Set git login for tests
+  #       run: |
+  #         git config --global user.email "jax@nvidia.com"
+  #         git config --global user.name "JAX-Toolbox CI"
+  #     - name: Check out the repository under ${GITHUB_WORKSPACE}
+  #       uses: actions/checkout@v4
+  #     - name: Run integration test ${{ matrix.TEST_SCRIPT }}
+  #       run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }}
+
+  # test-jax:
+  #   needs: build-jax
   #   if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
   #   uses: ./.github/workflows/_test_unit.yaml
   #   with:
-  #     IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}
-  #     TEST_NAME: equinox
+  #     TEST_NAME: jax
   #     EXECUTE: |
-  #       docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \
-  #         bash -exc -o pipefail \
-  #         'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' | tee test-equinox.log
+  #       docker run -i --shm-size=1g --gpus all \
+  #         ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
+  #         bash <<"EOF" |& tee test-backend-independent.log
+  #           test-jax.sh -b backend-independent
+  #       EOF
+  #       docker run -i --shm-size=1g --gpus all \
+  #         ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
+  #         bash <<"EOF" |& tee test-gpu.log
+  #           nvidia-cuda-mps-control -d
+  #           test-jax.sh -b gpu
+  #       EOF
   #     STATISTICS_SCRIPT: |
-  #       errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
-  #       failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}')
-  #       passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}')
+  #       errors=$(cat test-*.log | grep -c 'ERROR:' || true)
+  #       failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true)
+  #       passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true)
   #       total_tests=$((failed_tests + passed_tests))
   #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
   #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
   #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
   #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
   #     ARTIFACTS: |
-  #       test-equinox.log
+  #       test-backend-independent.log
+  #       test-gpu.log
   #   secrets: inherit

-  # test-te-multigpu:
-  #   needs: build-upstream-pax
+  # test-nsys-jax:
+  #   needs: build-jax
   #   if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
-  #   uses: ./.github/workflows/_test_te.yaml
+  #   uses: ./.github/workflows/_test_unit.yaml
   #   with:
-  #     TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}
+  #     TEST_NAME: nsys-jax
+  #     EXECUTE: |
+  #       set -o pipefail
+  #       num_tests=0
+  #       num_failures=0
+  #       # Run the pytest-driven tests; failure is explicitly handled below so set +e to
+  #       # avoid an early abort here.
+  #       set +e
+  #       docker run -i --shm-size=1g --gpus all \
+  #         -v $PWD:/opt/output \
+  #         ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
+  #         bash <<"EOF" |& tee test-nsys-jax.log
+  #           # nsys-jax is already installed, this is just adding the test dependencies
+  #           pip install pytest-reportlog nsys-jax[test]
+  #           # abuse knowledge that nsys-jax is installed editable, so the tests exist
+  #           test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())')
+  #           pytest --report-log=/opt/output/pytest-report.jsonl "${test_path}"
+  #       EOF
+  #       set -e
+  #       GPUS_PER_NODE=$(nvidia-smi -L | grep -c '^GPU')
+  #       for mode in 1-process 2-process process-per-gpu; do
+  #         DOCKER="docker run --shm-size=1g --gpus all --env XLA_FLAGS=--xla_gpu_enable_command_buffer= --env XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 -v ${PWD}:/opt/output ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"
+  #         if [[ "${mode}" == "1-process" ]]; then
+  #           PROCESS_COUNT=1
+  #           ARGS=""
+  #         elif [[ "${mode}" == "2-process" ]]; then
+  #           # Use two processes with GPUS_PER_NODE/2 GPUs per process in the hope that
+  #           # this will flush out more bugs than process-per-node or process-per-GPU.
+  #           PROCESS_COUNT=2
+  #           ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process $((GPUS_PER_NODE/2)) --distributed"
+  #         else
+  #           PROCESS_COUNT=${GPUS_PER_NODE}
+  #           ARGS="--process-id RANK --process-count ${PROCESS_COUNT} --coordinator-address 127.0.0.1:12345 --gpus-per-process 1 --distributed"
+  #         fi
+  #         for collection in full partial; do
+  #           NSYS_JAX="nsys-jax"
+  #           if [[ "${mode}" == "1-process" ]]; then
+  #             # We will not run nsys-jax-combine, so run analyses eagerly
+  #             NSYS_JAX+=" --nsys-jax-analysis communication --nsys-jax-analysis summary"
+  #           fi
+  #           NSYS_JAX+=" --output=/opt/output/${mode}-${collection}-execution-%q{RANK}"
+  #           if [[ "${collection}" == "partial" ]]; then
+  #             NSYS_JAX+=" --capture-range=cudaProfilerApi --capture-range-end=stop"
+  #             # nvbug/4801401
+  #             NSYS_JAX+=" --sample=none"
+  #           fi
+  #           set +e
+  #           ${DOCKER} parallel-launch RANK ${PROCESS_COUNT} ${NSYS_JAX} \
+  #             -- jax-nccl-test ${ARGS} |& tee ${mode}-${collection}-execution.log
+  #           num_failures=$((num_failures + ($? != 0)))
+  #           set -e
+  #           num_tests=$((num_tests + 1))
+  #         done
+  #         if [[ "${mode}" != "1-process" ]]; then
+  #           # Run nsys-jax-combine
+  #           NSYS_JAX_COMBINE="nsys-jax-combine --analysis communication --analysis summary --output=/opt/output/${mode}-${collection}-execution.zip"
+  #           for (( i=0; i> $GITHUB_ENV
+  #       echo "NSYS_JAX_FAIL_COUNT=${num_failures}" >> $GITHUB_ENV
+  #       exit $num_failures
+  #     STATISTICS_SCRIPT: |
+  #       summary_line=$(tail -n1 test-nsys-jax.log)
+  #       num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
+  #       passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
+  #       failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
+  #       total_tests=$(( NSYS_JAX_TEST_COUNT + passed_tests + failed_tests ))
+  #       num_passed=$(( passed_tests + NSYS_JAX_TEST_COUNT - NSYS_JAX_FAIL_COUNT ))
+  #       num_failed=$(( failed_tests + NSYS_JAX_FAIL_COUNT ))
+  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
+  #       echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT
+  #       echo "PASSED_TESTS=${num_passed}" >> $GITHUB_OUTPUT
+  #       echo "FAILED_TESTS=${num_failed}" >> $GITHUB_OUTPUT
+  #     ARTIFACTS: |
+  #       # pytest-driven part
+  #       test-nsys-jax.log
+  #       pytest-report.jsonl
+  #       # nsys-jax logfiles
+  #       *process-*-execution.log
+  #       # nsys-jax output for the case that doesn't use nsys-jax-combine
+  #       1-process-*-execution-0.zip
+  #       # nsys-jax-combine output/logfiles
+  #       *process*-*-execution.zip
+  #       *-execution-combine.log
   #   secrets: inherit

-  test-upstream-t5x:
-    needs: build-upstream-t5x
-    if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
-    uses: ./.github/workflows/_test_upstream_t5x.yaml
-    with:
-      T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}
-    secrets: inherit
+  #test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test
+  #runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does
+  #not already have nsys-jax installed
+  # test-nsys-jax-archive:
+  #   needs: test-nsys-jax
+  #   if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
+  #   strategy:
+  #     matrix:
+  #       os: [ubuntu-22.04, ubuntu-24.04, macOS-latest]
+  #   runs-on: ${{ matrix.os }}
+  #   steps:
+  #     - name: Download nsys-jax output .zip files
+  #       uses: actions/download-artifact@v4
+  #       with:
+  #         name: nsys-jax-unit-test-A100
+  #     - name: Extract archives and execute install scripts
+  #       run: |
+  #         pip install virtualenv # for install.sh
+  #         for zip in $(ls *.zip); do
+  #           ZIP="${PWD}/${zip}"
+  #           pushd $(mktemp -d)
+  #           unzip "${ZIP}"
+  #           ls -l
+  #           # TODO: verify this isn't needed, or make sure it isn't needed
+  #           chmod 755 install.sh
+  #           # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout
+  #           # Skip executing Jupyter lab
+  #           NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh
+  #           popd
+  #         done
+
+  # test-nsys-jax-eks:
+  #   needs: build-jax
+  #   if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
+  #   runs-on: eks
+  #   env:
+  #     JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
+  #     JOB_NAME: ${{ github.run_id }}-nsys-jax
+  #     POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess
+  #   steps:
+  #     - name: Check out the repository
+  #       uses: actions/checkout@v4
+  #     - name: Login to GitHub Container Registry
+  #       uses: docker/login-action@v3
+  #       with:
+  #         registry: ghcr.io
+  #         username: ${{ github.repository_owner }}
+  #         password: ${{ secrets.GITHUB_TOKEN }}
+  #     - name: K8s GHCR store and delete token
+  #       id: store-token
+  #       uses: ./.github/actions/store-delete-k8s-ghcr
+  #     - name: Configure Kubernetes job
+  #       run: |
+  #         yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
+  #           | select(di == 1).metadata.name = strenv(JOB_NAME)
+  #           | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
+  #           | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
+  #           | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \
+  #           .github/eks-workflow-files/job.yml
+  #         git diff .github/eks-workflow-files/job.yml
+  #     - name: Submit Kubernetes job
+  #       uses: ./.github/actions/submit-delete-k8s-job
+  #       with:
+  #         job-config-file: .github/eks-workflow-files/job.yml
+  #         job-name: ${{ env.JOB_NAME }}
+  #     - name: Configure post-processing job
+  #       run: |
+  #         export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip"
+  #         yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME)
+  #           | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE)
+  #           | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
+  #           | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \
+  #           .github/eks-workflow-files/post-process-job.yml
+  #         git diff .github/eks-workflow-files/post-process-job.yml
+  #     - name: Submit post process Kubernetes job
+  #       uses: ./.github/actions/submit-delete-k8s-job
+  #       with:
+  #         job-config-file: .github/eks-workflow-files/post-process-job.yml
+  #         job-name: ${{ env.POSTPROCESS_JOB_NAME }}
+
+  # # test-equinox:
+  # #   needs: build-equinox
+  # #   if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
+  # #   uses: ./.github/workflows/_test_unit.yaml
+  # #   with:
+  # #     IMAGE: ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}
+  # #     TEST_NAME: equinox
+  # #     EXECUTE: |
+  # #       docker run --shm-size=1g --gpus all ${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }} \
+  # #         bash -exc -o pipefail \
+  # #         'pushd /opt/equinox/tests && pip install -r requirements.txt && pytest .' | tee test-equinox.log
+  # #     STATISTICS_SCRIPT: |
+  # #       errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
+  # #       failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}')
+  # #       passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}')
+  # #       total_tests=$((failed_tests + passed_tests))
+  # #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
+  # #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
+  # #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
+  # #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
+  # #     ARTIFACTS: |
+  # #       test-equinox.log
+  # #     secrets: inherit
+
+  # # test-te-multigpu:
+  # #   needs: build-upstream-pax
+  # #   if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
+  # #   uses: ./.github/workflows/_test_te.yaml
+  # #   with:
+  # #     TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}
+  # #   secrets: inherit
+
+  # test-upstream-t5x:
+  #   needs: build-upstream-t5x
+  #   if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
+  #   uses: ./.github/workflows/_test_upstream_t5x.yaml
+  #   with:
+  #     T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}
+  #   secrets: inherit

-  test-rosetta-t5x:
-    needs: build-rosetta-t5x
-    if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
-    uses: ./.github/workflows/_test_t5x_rosetta.yaml
-    with:
-      T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}
-    secrets: inherit
+  # test-rosetta-t5x:
+  #   needs: build-rosetta-t5x
+  #   if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
+  #   uses: ./.github/workflows/_test_t5x_rosetta.yaml
+  #   with:
+  #     T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}
+  #   secrets: inherit

-  test-triton:
-    needs: build-triton
-    if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
-    uses: ./.github/workflows/_test_unit.yaml
-    with:
-      TEST_NAME: triton
-      EXECUTE: |
-        docker run -i --shm-size=1g --gpus all --volume $PWD:/output \
-          ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \
-          bash <<"EOF" |& tee test-triton.log
-            # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on
-            # actually having a CUDA backend for pytoch
-            pip install --no-deps torch
-            python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml
-        EOF
-      STATISTICS_SCRIPT: |
-        curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq;
-        total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml)
-        errors=$(./yq '.testsuites."+@errors"' triton_test.xml)
-        failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml)
-        passed_tests=$((total_tests - errors - failed_tests))
-        echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
-        echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
-        echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
-        echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
-      ARTIFACTS: |
-        test-triton.log
-    secrets: inherit
+  # test-triton:
+  #   needs: build-triton
+  #   if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
+  #   uses: ./.github/workflows/_test_unit.yaml
+  #   with:
+  #     TEST_NAME: triton
+  #     EXECUTE: |
+  #       docker run -i --shm-size=1g --gpus all --volume $PWD:/output \
+  #         ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \
+  #         bash <<"EOF" |& tee test-triton.log
+  #           # autotuner tests from jax-triton now hit a triton code path that uses utilities from pytorch; this relies on
+  #           # actually having a CUDA backend for pytorch
+  #           pip install --no-deps torch
+  #           python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml
+  #       EOF
+  #     STATISTICS_SCRIPT: |
+  #       curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq;
+  #       total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml)
+  #       errors=$(./yq '.testsuites."+@errors"' triton_test.xml)
+  #       failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml)
+  #       passed_tests=$((total_tests - errors - failed_tests))
+  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
+  #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
+  #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
+  #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
+  #     ARTIFACTS: |
+  #       test-triton.log
+  #   secrets: inherit

-  test-levanter:
-    needs: build-levanter
-    if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
-    uses: ./.github/workflows/_test_unit.yaml
-    with:
-      TEST_NAME: levanter
-      EXECUTE: |
-        docker run -i --gpus all --shm-size=1g \
-          ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \
-          bash <<"EOF" |& tee test-levanter.log
-            pip install flake8 pytest soundfile librosa
-            PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray"
-        EOF
-      STATISTICS_SCRIPT: |
-        summary_line=$(tail -n1 test-levanter.log)
-        errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
-        failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}')
-        passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}')
-        total_tests=$((failed_tests + passed_tests))
-        echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
-        echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
-        echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
-        echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
-      ARTIFACTS: |
-        test-levanter.log
-    secrets: inherit
-
-  # test-te:
-  #   needs: build-upstream-pax
+  # test-levanter:
+  #   needs: build-levanter
   #   if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
   #   uses: ./.github/workflows/_test_unit.yaml
   #   with:
-  #     TEST_NAME: te
+  #     TEST_NAME: levanter
   #     EXECUTE: |
-  #       docker run -i --gpus all --shm-size=1g -v $PWD:/log \
-  #         ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \
-  #         bash <<"EOF" |& tee test-te.log
-  #           pip install pytest-reportlog
-  #           pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax
+  #       docker run -i --gpus all --shm-size=1g \
+  #         ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \
+  #         bash <<"EOF" |& tee test-levanter.log
+  #           pip install flake8 pytest soundfile librosa
+  #           PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray"
   #       EOF
   #     STATISTICS_SCRIPT: |
-  #       summary_line=$(tail -n1 test-te.log)
+  #       summary_line=$(tail -n1 test-levanter.log)
   #       errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
-  #       passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
-  #       failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
+  #       failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}')
+  #       passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}')
   #       total_tests=$((failed_tests + passed_tests))
   #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
   #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
   #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
   #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
-  #     TIMEOUT_MINUTES: 120
   #     ARTIFACTS: |
-  #       test-te.log
-  #       pytest-report.jsonl
+  #       test-levanter.log
+  #   secrets: inherit
+
+  # # test-te:
+  # #   needs: build-upstream-pax
+  # #   if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
+  # #   uses: ./.github/workflows/_test_unit.yaml
+  # #   with:
+  # #     TEST_NAME: te
+  # #     EXECUTE: |
+  # #       docker run -i --gpus all --shm-size=1g -v $PWD:/log \
+  # #         ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \
+  # #         bash <<"EOF" |& tee test-te.log
+  # #           pip install pytest-reportlog
+  # #           pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TRANSFORMER_ENGINE}/tests/jax
+  # #       EOF
+  # #     STATISTICS_SCRIPT: |
+  # #       summary_line=$(tail -n1 test-te.log)
+  # #       errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
+  # #       passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
+  # #       failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
+  # #       total_tests=$((failed_tests + passed_tests))
+  # #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
+  # #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
+  # #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
+  # #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
+  # #     TIMEOUT_MINUTES: 120
+  # #     ARTIFACTS: |
+  # #       test-te.log
+  # #       pytest-report.jsonl
+  # #     secrets: inherit
+
+  # test-gemma:
+  #   needs: build-gemma
+  #   uses: ./.github/workflows/_test_unit.yaml
+  #   if: inputs.ARCHITECTURE == 'amd64'
+  #   with:
+  #     TEST_NAME: gemma
+  #     EXECUTE: |
+  #       docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \
+  #         bash -ec \
+  #         "cd /opt/gemma && pip install -e .[dev] && pytest ." | tee test-gemma.log
+  #     STATISTICS_SCRIPT: |
+  #       summary_line=$(tail -n1 test-gemma.log)
+  #       errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
+  #       failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}')
+  #       passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}')
+  #       total_tests=$((failed_tests + passed_tests))
+  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
+  #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
+  #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
+  #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
+  #     ARTIFACTS: |
+  #       test-gemma.log
+  #   secrets: inherit

-  test-gemma:
-    needs: build-gemma
-    uses: ./.github/workflows/_test_unit.yaml
-    if: inputs.ARCHITECTURE == 'amd64'
-    with:
-      TEST_NAME: gemma
-      EXECUTE: |
-        docker run --shm-size=1g --gpus all ${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }} \
-          bash -ec \
-          "cd /opt/gemma && pip install -e .[dev] && pytest ." | tee test-gemma.log
-      STATISTICS_SCRIPT: |
-        summary_line=$(tail -n1 test-gemma.log)
-        errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
-        failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}')
-        passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}')
-        total_tests=$((failed_tests + passed_tests))
-        echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
-        echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
-        echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
-        echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
-      ARTIFACTS: |
-        test-gemma.log
-    secrets: inherit
-
-  test-maxtext:
-    needs: build-maxtext
-    if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners
-    uses: ./.github/workflows/_test_maxtext.yaml
-    with:
-      MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
-    secrets: inherit
+  # test-maxtext:
+  #   needs: build-maxtext
+  #   if: inputs.ARCHITECTURE == 'amd64' # no arm64 gpu runners
+  #   uses: ./.github/workflows/_test_maxtext.yaml
+  #   with:
+  #     MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
+  #   secrets: inherit

   test-axlearn-eks:
     needs: build-axlearn
@@ -663,7 +663,7 @@ jobs:
           yq -i ea '
            select(di == 0).metadata.name = strenv(JOB_NAME)
            | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
-           | select(di == 0).spec.template.spec.containers[1].env[0].value = "${{ github.run_id }}"
+           | select(di == 0).spec.template.spec.containers[0].env[0].value = "${{ github.run_id }}"
            | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
            .github/eks-workflow-files/axlearn/axlearn-job.yml
          git diff .github/eks-workflow-files/axlearn/axlearn-job.yml
@@ -769,4 +769,3 @@ jobs:
       with:
         job-config-file: ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml"
         job-name: ${{ env.JOB_NAME }}
-
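
The K8s-driven jobs above lean on the same yq v4 idiom: `eval-all` (`ea`) loads every document of a multi-document manifest so `di` (the document index) can address one of them, and `strenv()` pulls values from exported environment variables. A minimal self-contained sketch of that technique, assuming mikefarah's yq v4 (the same tool the triton statistics step downloads) and an invented `demo-job.yml` with placeholder fields:

    # Two YAML documents in one file; file name and fields are hypothetical.
    cat > demo-job.yml <<'EOF'
    kind: Service
    metadata:
      name: placeholder
    ---
    kind: Job
    metadata:
      name: placeholder
    EOF
    export JOB_NAME=demo-123
    # Edit in place: only document 1 (the Job) is touched; document 0 passes through.
    yq -i ea 'select(di == 1).metadata.name = strenv(JOB_NAME)' demo-job.yml
    yq ea 'select(di == 1).metadata.name' demo-job.yml   # prints: demo-123

This mirrors the `select(di == 0)`/`select(di == 1)` updates applied to `job.yml` and `axlearn-job.yml` above.
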
diff --git a/README.md b/README.md
index 83053215e..bca12d6e5 100644
--- a/README.md
+++ b/README.md
@@ -10,12 +10,12 @@ We support and test the following JAX frameworks and model architectures. More d
 | Framework | Models | Use cases | Container |
 | :--- | :---: | :---: | :---: |
-| [maxtext](./rosetta/rosetta/projects/maxtext)| GPT, LLaMA, Gemma, Mistral, Mixtral | pretraining | `ghcr.io/nvidia/jax:maxtext` |
+| [maxtext](./rosetta/rosetta/projects/maxtext)| GPT, LLaMA, Gemma, Mistral, Mixtral | pre-training | `ghcr.io/nvidia/jax:maxtext` |
 | [t5x](./rosetta/rosetta/projects/t5x) | T5, ViT | pre-training, fine-tuning | `ghcr.io/nvidia/jax:t5x` |
 | [t5x](./rosetta/rosetta/projects/imagen) | Imagen | pre-training | `ghcr.io/nvidia/t5x:imagen-2023-10-02.v3` |
 | [big vision](./rosetta/rosetta/projects/paligemma) | PaliGemma | fine-tuning, evaluation | `ghcr.io/nvidia/jax:gemma` |
-| levanter | GPT, LLaMA, MPT, Backpacks | pretraining, fine-tuning | `ghcr.io/nvidia/jax:levanter` |
-| axlearn | Fuji | pretraining | `gchr.io/nvidia/jax:axlearn` |
+| levanter | GPT, LLaMA, MPT, Backpacks | pre-training, fine-tuning | `ghcr.io/nvidia/jax:levanter` |
+| axlearn | Fuji | pre-training | `ghcr.io/nvidia/jax:axlearn` |

 # Build Pipeline Status
@@ -269,7 +269,7 @@ We support and test the following JAX frameworks and model architectures. More d
-
+