fix the 3B model run on k8s

Steboss · Steboss · commit 0b1a61f8728c · 2025-02-25T11:29:51.000Z
diff --git a/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml b/.github/eks-workflow-files/axlearn/axlearn-3B-model.yml
@@ -13,7 +13,7 @@ spec:
     spec:
       restartPolicy: Never
       containers:
-      - name: axlearn-fuji-1B
+      - name: axlearn-fuji-3B
         image: PLACEHOLDER
         command:
           - bash
@@ -23,31 +23,23 @@ spec:
           - |        
 
             BASEDIR="/opt/axlearn"
-            CONFIG="fuji-1B-v3-flash-single-host"
-            HLO_DUMP=0
-            POSTFIX=""
-
-            AR_THRESHOLD=1073741824
-            AG_THRESHOLD=8589934592
-            RS_THRESHOLD=8589934592
-            XLA_BASE_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
-                            --xla_gpu_enable_triton_gemm=false
-                            --xla_gpu_enable_highest_priority_async_stream=true
-                            --xla_gpu_all_gather_combine_threshold_bytes=${AG_THRESHOLD}
-                            --xla_gpu_reduce_scatter_combine_threshold_bytes=${RS_THRESHOLD}
-                            --xla_gpu_enable_pipelined_all_gather=true
-                            --xla_gpu_enable_pipelined_reduce_scatter=true
-                            --xla_gpu_enable_nccl_comm_splitting=false"
-
-            export XLA_PYTHON_CLIENT_PREALLOCATE=false
-            export TF_GPU_ALLOCATOR=cuda_malloc_async
-            export XLA_FLAGS="${XLA_BASE_FLAGS}"
-
-            export NCCL_BUFFSIZE=8388608 
-            export NCCL_P2P_NET_CHUNKSIZE=524288
-            export NCCL_LAUNCH_MODE=GROUP
-            export NCCL_DEBUG=INFO
+            CONFIG="fuji-3B-v3-flash-single-host"  # config name; also embedded in the TRAINER_DIR path below
+            BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true
+                 --xla_gpu_enable_highest_priority_async_stream=true
+                 --xla_gpu_all_reduce_combine_threshold_bytes=1073741824
+                 --xla_gpu_all_gather_combine_threshold_bytes=1073741824
+                 --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824
+                 --xla_gpu_enable_pipelined_all_gather=true
+                 --xla_gpu_enable_pipelined_reduce_scatter=true
+                 --xla_gpu_enable_pipelined_all_reduce=true
+                 --xla_gpu_enable_while_loop_double_buffering=true
+                 --xla_gpu_enable_triton_gemm=false
+                 --xla_gpu_enable_all_gather_combine_by_dim=false
+                 --xla_gpu_enable_reduce_scatter_combine_by_dim=false
+                 --xla_disable_hlo_passes=rematerialization}
 
+            export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}" # BASE_XLA_FLAGS (list above unless already set non-empty) first, then any pre-existing XLA_FLAGS
+            # NOTE(review): this change drops the POSTFIX assignment, but ${POSTFIX} is still referenced in TRAINER_DIR below - confirm it is set elsewhere or meant to be empty
             LOG_DIR=${BASEDIR}/logs
             TRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir
             mkdir -p ${TRAINER_DIR}
diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml
@@ -749,14 +749,14 @@ jobs:
 
   # the fuji test runs for 20 minutes only because, as of 2025-02-24,
   # it is not possible to set the `max_steps` value
-  test-axlearn-fuji-1B-eks:
+  test-axlearn-fuji-3B-eks:
     needs: build-axlearn
     if: inputs.ARCHITECTURE == 'amd64'
     runs-on: eks
     env:
       AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
-      JOB_NAME: axlearn-fuji-1b-${{ github.run_id }}
-      TOKEN_NAME: axlearn-fuji-1b-${{ github.run_id }}-token
+      JOB_NAME: axlearn-fuji-3b-${{ github.run_id }}
+      TOKEN_NAME: axlearn-fuji-3b-${{ github.run_id }}-token
     steps:
     - name: Check out the repository
       uses: actions/checkout@v4
@@ -776,12 +776,12 @@ jobs:
            select(di == 0).metadata.name = strenv(JOB_NAME)
           | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
           | select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \
-        .github/eks-workflow-files/axlearn/axlearn-1B-model.yml
-        git diff .github/eks-workflow-files/axlearn/axlearn-1B-model.yml
+        .github/eks-workflow-files/axlearn/axlearn-3B-model.yml
+        git diff .github/eks-workflow-files/axlearn/axlearn-3B-model.yml
 
     - name: Submit & delete axlearn test 
       uses: ./.github/actions/submit-delete-k8s-job 
       with:
-        job-config-file:  ".github/eks-workflow-files/axlearn/axlearn-1B-model.yml"
+        job-config-file:  ".github/eks-workflow-files/axlearn/axlearn-3B-model.yml"
         job-name: ${{ env.JOB_NAME }}