Commit e3a9e4e

fix the fuji eks model

1 parent 43f75a6 commit e3a9e4e

1 file changed: +38 -2 lines changed

.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml

@@ -13,14 +13,50 @@ spec:
     spec:
       restartPolicy: Never
       containers:
-        - name: axlearn-fuji
+        - name: axlearn-fuji-model
           image: PLACEHOLDER
           command:
             - bash
             - -xo
             - pipefail
             - -c
-            - "\nBASEDIR=\"/opt/axlearn\"\nCONFIG=\"fuji-3B-v3-flash-single-host\"\nBASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true\n --xla_gpu_enable_highest_priority_async_stream=true\n --xla_gpu_all_reduce_combine_threshold_bytes=1073741824\n --xla_gpu_all_gather_combine_threshold_bytes=1073741824\n --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824\n --xla_gpu_enable_pipelined_all_gather=true\n --xla_gpu_enable_pipelined_reduce_scatter=true\n --xla_gpu_enable_pipelined_all_reduce=true\n --xla_gpu_enable_while_loop_double_buffering=true\n --xla_gpu_enable_triton_gemm=false\n --xla_gpu_enable_all_gather_combine_by_dim=false\n --xla_gpu_enable_reduce_scatter_combine_by_dim=false\n --xla_disable_hlo_passes=rematerialization}\n\nexport XLA_FLAGS=\"$BASE_XLA_FLAGS ${XLA_FLAGS:-}\" \n\nLOG_DIR=${BASEDIR}/logs\nTRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir\nmkdir -p ${TRAINER_DIR}\n\npython3 -m axlearn.common.launch_trainer_main \\\n --module=text.gpt.c4_trainer \\\n --config=${CONFIG} \\\n --trainer_dir=${TRAINER_DIR} \\\n --data_dir=gs://axlearn-public/tensorflow_datasets \\\n --jax_backend=gpu \n"
+            - |
+              BASEDIR="/opt/axlearn"
+              CONFIG="fuji-3B-v3-flash-single-host"
+              HLO_DUMP=0
+              POSTFIX=""
+
+              AR_THRESHOLD=1073741824
+              AG_THRESHOLD=8589934592
+              RS_THRESHOLD=8589934592
+              BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true
+                  --xla_gpu_enable_highest_priority_async_stream=true
+                  --xla_gpu_all_reduce_combine_threshold_bytes=1073741824
+                  --xla_gpu_all_gather_combine_threshold_bytes=1073741824
+                  --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824
+                  --xla_gpu_enable_pipelined_all_gather=true
+                  --xla_gpu_enable_pipelined_reduce_scatter=true
+                  --xla_gpu_enable_pipelined_all_reduce=true
+                  --xla_gpu_enable_while_loop_double_buffering=true
+                  --xla_gpu_enable_triton_gemm=false
+                  --xla_gpu_enable_all_gather_combine_by_dim=false
+                  --xla_gpu_enable_reduce_scatter_combine_by_dim=false
+                  --xla_disable_hlo_passes=rematerialization}
+
+              export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}"
+              export TF_GPU_ALLOCATOR=cuda_malloc_async
+
+              LOG_DIR=${BASEDIR}/logs
+              TRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir
+              mkdir -p ${TRAINER_DIR}
+
+
+              python3 -m axlearn.common.launch_trainer_main \
+                --module=text.gpt.c4_trainer \
+                --config=${CONFIG} \
+                --trainer_dir=${TRAINER_DIR} \
+                --data_dir=gs://axlearn-public/tensorflow_datasets \
+                --jax_backend=gpu
           resources:
             limits:
               nvidia.com/gpu: 8
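
Besides swapping the single escaped string for a `|` literal block scalar (which preserves the script's newlines and keeps it readable in the manifest), the rewritten script leaves its tuning knobs overridable from the pod environment via Bash default expansion: ${BASE_XLA_FLAGS:-...} expands to the caller's value when it is set and non-empty, and to the inline multi-line default otherwise, with any caller-supplied XLA_FLAGS appended after the defaults. A minimal standalone sketch of that pattern (the single flag here is illustrative, not the job's full set):

#!/usr/bin/env bash
set -euo pipefail

# ${VAR:-default} expands to $VAR when it is set and non-empty,
# otherwise to the literal text after ":-".
BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true}

# Append any caller-supplied XLA_FLAGS after the defaults; under `set -u`,
# ${XLA_FLAGS:-} expands to an empty string instead of raising an error.
export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}"

echo "XLA_FLAGS=$XLA_FLAGS"

Run bare, this prints the default flag; run with BASE_XLA_FLAGS already exported, the caller's value replaces the inline default wholesale rather than merging with it.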
