
Commit b2579cb

@olupton comments work

1 parent 465264b commit b2579cb

File tree

8 files changed (+238 −12 lines)


.github/actions/submit-delete-k8s-job/action.yml

+1 −1

@@ -34,4 +34,4 @@ runs:
      kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }}

    post: |
-     kubectl delete job ${{ inputs.job-name }}
+     kubectl delete -f "${{ inputs.job-config-file }}"

.github/eks-workflow-files/axlearn/axlearn-job.yml

+6 −2

@@ -37,7 +37,7 @@ spec:
      - name: upload
        image: amazon/aws-cli
        env:
-         - name: TEST_DATE
+         - name: RUN_ID
            value: PLACEHOLDER
        command:
          - sh
@@ -47,7 +47,11 @@ spec:
            sleep 5
          done
          # Upload to S3 bucket
-         aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${TEST_DATE}/summary.txt
+         aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/summary.txt
+         # Archive the results of all the tests
+         tar -czf test_logs.tar.gz /opt/output
+         # Upload the logs archive to the S3 bucket
+         aws s3 cp test_logs.tar.gz s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/test_logs.tar.gz
        volumeMounts:
          - name: output
            mountPath: /opt/output

.github/workflows/_ci.yaml

+9 −8

@@ -688,11 +688,12 @@ jobs:
      - name: Download logs from S3
        id: log-s3
        run: |
-         mkdir -p /tmp/axlearn-output
-         aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt /tmp/axlearn-output/
+         mkdir -p axlearn-output
+         aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt axlearn-output/
+         aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/test_logs.tar.gz axlearn-output/

-         passed_tests=$(grep -c ": PASSED" /tmp/axlearn-output/summary.txt || true)
-         failed_tests=$(grep -c ": FAILED" /tmp/axlearn-output/summary.txt || true)
+         passed_tests=$(grep -c ": PASSED" axlearn-output/summary.txt || true)
+         failed_tests=$(grep -c ": FAILED" axlearn-output/summary.txt || true)
          total_tests=$((failed_tests + passed_tests))

          echo "Passed tests: $passed_tests"
@@ -733,7 +734,7 @@ jobs:
          message="Passed $passed_tests out of $total_tests." \
          color=$badge_color \
          to_json schemaVersion label message color \
-           > "badge-axlearn-test"
+           > badge-axlearn-test.json

      - name: Upload artifacts
        if: ${{ !cancelled() }}
@@ -742,8 +743,8 @@ jobs:
          name: "artifact-axlearn-test"
          path: |
            sitrep.json
-           "badge-axlearn-test"
-           summary.txt
+           badge-axlearn-test.json
+           axlearn-output/*

      # the fuji test will run for 20 minutes only, as per 2025-02-24
      # is not possible to set the `max_steps` value
@@ -779,5 +780,5 @@ jobs:
        uses: ./.github/actions/submit-delete-k8s-job
        with:
          job-config-file: ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml"
-         job-name: ${{ env.JOB_NAME }}
+         job-name: ${{ env.JOB_NAME }}

.github/workflows/_test_nccl.yaml

+1 −1

@@ -124,4 +124,4 @@ jobs:
      # Clean up in case of errors as well as success
      - name: Delete Kubernetes job
        if: always()
-       run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml
+       run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml

README.md

+25

@@ -15,6 +15,7 @@ We support and test the following JAX frameworks and model architectures. More d
 | [t5x](./rosetta/rosetta/projects/imagen) | Imagen | pre-training | `ghcr.io/nvidia/t5x:imagen-2023-10-02.v3` |
 | [big vision](./rosetta/rosetta/projects/paligemma) | PaliGemma | fine-tuning, evaluation | `ghcr.io/nvidia/jax:gemma` |
 | levanter | GPT, LLaMA, MPT, Backpacks | pretraining, fine-tuning | `ghcr.io/nvidia/jax:levanter` |
+| axlearn | Fuji | pretraining | `ghcr.io/nvidia/jax:axlearn` |

 # Build Pipeline Status
 <table>
@@ -248,6 +249,30 @@ We support and test the following JAX frameworks and model architectures. More d
         </a>
       </td>
     </tr>
+    <tr>
+      <td>
+        <a href="https://github.com/NVIDIA/JAX-Toolbox/blob/main/.github/container/Dockerfile.axlearn">
+          <img style="height:1em;" src="https://img.shields.io/static/v1?label=&color=gray&logo=docker&message=AXLearn%3D%7Bcore%2CAXLearn%7D">
+        </a>
+      </td>
+      <td>
+        <code>ghcr.io/nvidia/jax:axlearn</code>
+      </td>
+      <td>
+        <a href="https://gist.github.com/nvjax/913c2af68649fe568e9711c2dabb23ae/#file-final-axlearn-md">
+          <img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-axlearn-build-amd64.json&logo=docker&label=amd64">
+        </a>
+        <br>
+        <a href="https://gist.github.com/nvjax/913c2af68649fe568e9711c2dabb23ae/#file-final-axlearn-md">
+          <img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-axlearn-build-arm64.json&logo=docker&label=arm64">
+        </a>
+      </td>
+      <td>
+        <a href="https://gist.github.com/nvjax/913c2af68649fe568e9711c2dabb23ae#file-badge-axlearn-test-json">
+          <img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-axlearn-test.json&logo=nvidia&label=A100%20distributed">
+        </a>
+      </td>
+    </tr>
   </tbody>
 </table>

+59
@@ -0,0 +1,59 @@
# AXLearn

[AXLearn](https://github.com/apple/axlearn) is a deep learning design framework, built on top of JAX and XLA, that supports the development of large-scale models.

## Hardware and Software Specifications

Functionality has been validated on an AWS p5.48xlarge EKS cluster (8x H100 80 GB); please refer to the [Configs](#configs) section below for some initial configs and performance numbers. We will continue to populate it with more models and configs. We provide both single-node and multi-node pre-training support. If running on GPUs with less than 80 GB of memory, some of the default configurations may run out of memory; if you run out of memory and have more GPUs available, increase your GPU count and decrease your batch size per GPU.

## Containers
We provide a fully built and ready-to-use multi-arch, bleeding-edge container: `ghcr.io/nvidia/jax:axlearn`. We also provide nightly dated images with the naming pattern `ghcr.io/nvidia/jax:axlearn-YYYY-MM-DD`, but we encourage you to use the latest ones for the best performance.
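For example, the bleeding-edge image can be pulled directly, or a dated tag can be substituted for a pinned, reproducible environment (the date below is only a placeholder):

```bash
# Latest multi-arch AXLearn container
docker pull ghcr.io/nvidia/jax:axlearn
# ...or a dated nightly (placeholder date)
docker pull ghcr.io/nvidia/jax:axlearn-2025-02-24
```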

*Note*: All paths mentioned in subsequent sections are relative to the top-level directory of the AXLearn repository. When working interactively with containers, make sure you navigate to `/opt/axlearn` before running any commands.

## Launching a container
Use the following command to launch a container:
```
docker run -ti --gpus=all --net=host --ipc=host -v <WORKSPACE_PATH>:/opt/axlearn/workspace -w /opt/axlearn <CONTAINER> /bin/bash
```
where `WORKSPACE_PATH` is the path to the directory where you would like to store any persistent files and `CONTAINER` is the name of the AXLearn container. You can additionally mount dataset and vocab paths with the `-v` flag.
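As a concrete sketch, assuming a local workspace at `~/axlearn-workspace` (an arbitrary, illustrative path), the bleeding-edge container can be started with:

```bash
docker run -ti --gpus=all --net=host --ipc=host \
  -v ~/axlearn-workspace:/opt/axlearn/workspace \
  -w /opt/axlearn \
  ghcr.io/nvidia/jax:axlearn /bin/bash
```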

## Running a Fuji model
### Quick Runs

#### EKS Single node: `fuji-3B-v3-flash-single-host`
Fuji models are defined with 1B, 3B, 7B or 70B parameters. In this example, we deploy the training of a Fuji-3B model that uses flash attention and runs on a single host. [Here](scripts/eks-fuji.yaml) we provide an example deployment file. The core of the deployment is:
```bash
python3 -m axlearn.common.launch_trainer_main \
    --module=text.gpt.c4_trainer \
    --config=${CONFIG} \
    --trainer_dir=${TRAINER_DIR} \
    --data_dir=gs://axlearn-public/tensorflow_datasets \
    --jax_backend=gpu
```
where `CONFIG="fuji-3B-v3-flash-single-host"`. The input dataset is the public TensorFlow [C4 dataset](https://www.tensorflow.org/datasets/catalog/c4).
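To submit this example on EKS, something along the following lines should work, assuming the manifest linked above keeps the job name `axlearn-fuji` used in the example job definition:

```bash
# Submit the single-host Fuji-3B job and follow its logs
kubectl apply -f scripts/eks-fuji.yaml
kubectl logs --all-containers=true --follow job/axlearn-fuji

# Clean up once the job has finished
kubectl delete -f scripts/eks-fuji.yaml
```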

#### Running a multinode job for `fuji-XB-v2-flash`

For running a multinode job we provide a [custom example](scripts/multinode.py). The code accesses AXLearn directly and lets you specify a custom dataset, the number of GPUs to use, the global batch size, and the `max_sequence_length`; a launch sketch follows below.
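A minimal way to run the example from inside the container is sketched here. The script path is illustrative (it depends on where you copied the file into your mounted workspace), and the script as written hard-codes `NUM_PROCESSES`, `DISTRIBUTED_COORDINATOR` and `PROCESS_ID`, so those values need to be adapted per node for a real multi-node launch:

```bash
cd /opt/axlearn
# Illustrative location of the example script inside the mounted workspace
python3 workspace/multinode.py
```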

## XLA Flags
The [GPU Performance document](../../../docs/GPU_performance.md) provides a detailed description of the XLA flags that can be set to optimize performance. These are the recommended XLA flags to get good performance for AXLearn.

```
XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
           --xla_gpu_enable_triton_gemm=false
           --xla_gpu_enable_command_buffer=
           --xla_gpu_all_reduce_combine_threshold_bytes=1073741824
           --xla_gpu_all_gather_combine_threshold_bytes=1073741824
           --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824
           --xla_gpu_enable_pipelined_all_gather=true
           --xla_gpu_enable_pipelined_reduce_scatter=true
           --xla_gpu_enable_pipelined_all_reduce=true
           --xla_gpu_enable_while_loop_double_buffering=true
           --xla_gpu_enable_all_gather_combine_by_dim=false
           --xla_gpu_enable_reduce_scatter_combine_by_dim=false
           --xla_disable_hlo_passes=rematerialization"
```
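As a usage sketch, these flags can simply be exported before the single-host launch command shown earlier; the trainer directory below is an arbitrary, illustrative path:

```bash
export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true \
  --xla_gpu_enable_triton_gemm=false \
  --xla_gpu_enable_command_buffer= \
  --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 \
  --xla_gpu_all_gather_combine_threshold_bytes=1073741824 \
  --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 \
  --xla_gpu_enable_pipelined_all_gather=true \
  --xla_gpu_enable_pipelined_reduce_scatter=true \
  --xla_gpu_enable_pipelined_all_reduce=true \
  --xla_gpu_enable_while_loop_double_buffering=true \
  --xla_gpu_enable_all_gather_combine_by_dim=false \
  --xla_gpu_enable_reduce_scatter_combine_by_dim=false \
  --xla_disable_hlo_passes=rematerialization"

# Illustrative output location; any writable directory works
TRAINER_DIR=/opt/axlearn/logs/fuji-3B-v3-flash-single-host/trainer-dir
mkdir -p "${TRAINER_DIR}"

python3 -m axlearn.common.launch_trainer_main \
  --module=text.gpt.c4_trainer \
  --config=fuji-3B-v3-flash-single-host \
  --trainer_dir="${TRAINER_DIR}" \
  --data_dir=gs://axlearn-public/tensorflow_datasets \
  --jax_backend=gpu
```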
@@ -0,0 +1,66 @@
apiVersion: batch/v1
kind: Job
metadata:
  name: axlearn-fuji
  # Specify any labels for running on a dedicated queue
spec:
  completions: 1
  parallelism: 1
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: axlearn-fuji-model
          image: ghcr.io/nvidia/jax:axlearn
          command:
            - bash
            - -xo
            - pipefail
            - -c
            - |
              BASEDIR="/opt/axlearn"
              CONFIG="fuji-3B-v3-flash-single-host"
              HLO_DUMP=0
              POSTFIX=""

              AR_THRESHOLD=1073741824
              AG_THRESHOLD=8589934592
              RS_THRESHOLD=8589934592
              BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true
                --xla_gpu_enable_highest_priority_async_stream=true
                --xla_gpu_all_reduce_combine_threshold_bytes=1073741824
                --xla_gpu_all_gather_combine_threshold_bytes=1073741824
                --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824
                --xla_gpu_enable_pipelined_all_gather=true
                --xla_gpu_enable_pipelined_reduce_scatter=true
                --xla_gpu_enable_pipelined_all_reduce=true
                --xla_gpu_enable_while_loop_double_buffering=true
                --xla_gpu_enable_triton_gemm=false
                --xla_gpu_enable_all_gather_combine_by_dim=false
                --xla_gpu_enable_reduce_scatter_combine_by_dim=false
                --xla_disable_hlo_passes=rematerialization}

              export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}"
              export TF_GPU_ALLOCATOR=cuda_malloc_async

              LOG_DIR=${BASEDIR}/logs
              TRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir
              mkdir -p ${TRAINER_DIR}

              python3 -m axlearn.common.launch_trainer_main \
                --module=text.gpt.c4_trainer \
                --config=${CONFIG} \
                --trainer_dir=${TRAINER_DIR} \
                --data_dir=gs://axlearn-public/tensorflow_datasets \
                --jax_backend=gpu
          resources:
            limits:
              nvidia.com/gpu: 8
          volumeMounts:
            - name: output
              mountPath: /opt/output
      # specify any image secret if needed
      volumes:
        - name: output
          emptyDir: {}
@@ -0,0 +1,71 @@
import os

from absl import app, flags
from axlearn.common.launch_trainer import run_trainer
from axlearn.common.config import config_for_function
from axlearn.experiments.text.gpt import c4_trainer
from axlearn.common.trainer import SpmdTrainer

FLAGS = flags.FLAGS
FLAGS.set_default("module", "text.gpt.c4_trainer")
FLAGS.set_default("config", "fuji-7B-v2-flash")  # Set the model
FLAGS.set_default("trainer_dir", "/opt/host/axlearn-checkpoints")  # Set the trainer directory


def main(_):
    axlearn_path = "/opt/axlearn"
    os.environ["PYTHONPATH"] = f"{axlearn_path}:{os.environ.get('PYTHONPATH', '')}"

    n_gpus = 16  # This can also be an env variable
    # Base XLA flags
    base_flags = [
        "--xla_gpu_enable_latency_hiding_scheduler=true",
        "--xla_gpu_enable_command_buffer=",
        "--xla_gpu_enable_highest_priority_async_stream=true",
        "--xla_gpu_all_reduce_combine_threshold_bytes=1073741824",
        "--xla_gpu_all_gather_combine_threshold_bytes=1073741824",
        "--xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824",
        "--xla_gpu_enable_pipelined_all_gather=true",
        "--xla_gpu_enable_pipelined_reduce_scatter=true",
        "--xla_gpu_enable_pipelined_all_reduce=true",
        "--xla_gpu_enable_while_loop_double_buffering=true",
        "--xla_gpu_enable_triton_gemm=false",
        "--xla_gpu_enable_all_gather_combine_by_dim=false",
        "--xla_gpu_enable_reduce_scatter_combine_by_dim=false",
        "--xla_disable_hlo_passes=rematerialization",
    ]
    # Get existing flags from the environment with a proper fallback.
    existing_xla_flags = os.environ.get("XLA_FLAGS", "").split()
    # XLA flags
    os.environ.update({
        "XLA_FLAGS": " ".join([
            *base_flags,
            *existing_xla_flags,
        ])})

    os.environ.update({
        "DATA_DIR": "gs://axlearn-public/tensorflow_datasets",  # Set up your input dataset
        "NUM_PROCESSES": f"{n_gpus}",
        "DISTRIBUTED_COORDINATOR": "127.0.0.1:8080",
        "PROCESS_ID": "0",
    })

    # Raw config
    config_fn = c4_trainer.named_trainer_configs()[FLAGS.config]
    trainer_config: SpmdTrainer.Config = config_for_function(config_fn).fn()

    trainer_config.max_step = 100  # Set the max number of steps to run
    trainer_config.dir = "/opt/host/axlearn-checkpoints"  # Use 'dir' instead of 'model_dir'
    trainer_config.input.input_dispatcher.global_logical_batch_size = 8  # Tune the batch size for training
    # trainer_config.input.source.max_sequence_length = 2048  # Tune the max sequence length if running out of memory
    trainer_config.checkpointer.save_policy.n = 500  # Save every 500 steps
    trainer_config.checkpointer.keep_every_n_steps = 500  # Keep checkpoints
    trainer_config.summary_writer.write_every_n_steps = 100  # Log every 100 steps

    run_trainer(
        trainer_config=trainer_config,
    )


if __name__ == "__main__":
    app.run(main)
