Skip to content

Commit 0b1a61f

Browse files
committed
fix the 3B model run on k8s
1 parent c200dea commit 0b1a61f

File tree

2 files changed

+23
-31
lines changed

2 files changed

+23
-31
lines changed

.github/eks-workflow-files/axlearn/axlearn-1B-model.yml → .github/eks-workflow-files/axlearn/axlearn-3B-model.yml

+17-25
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ spec:
1313
spec:
1414
restartPolicy: Never
1515
containers:
16-
- name: axlearn-fuji-1B
16+
- name: axlearn-fuji-3B
1717
image: PLACEHOLDER
1818
command:
1919
- bash
@@ -23,31 +23,23 @@ spec:
2323
- |
2424
2525
BASEDIR="/opt/axlearn"
26-
CONFIG="fuji-1B-v3-flash-single-host"
27-
HLO_DUMP=0
28-
POSTFIX=""
29-
30-
AR_THRESHOLD=1073741824
31-
AG_THRESHOLD=8589934592
32-
RS_THRESHOLD=8589934592
33-
XLA_BASE_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
34-
--xla_gpu_enable_triton_gemm=false
35-
--xla_gpu_enable_highest_priority_async_stream=true
36-
--xla_gpu_all_gather_combine_threshold_bytes=${AG_THRESHOLD}
37-
--xla_gpu_reduce_scatter_combine_threshold_bytes=${RS_THRESHOLD}
38-
--xla_gpu_enable_pipelined_all_gather=true
39-
--xla_gpu_enable_pipelined_reduce_scatter=true
40-
--xla_gpu_enable_nccl_comm_splitting=false"
41-
42-
export XLA_PYTHON_CLIENT_PREALLOCATE=false
43-
export TF_GPU_ALLOCATOR=cuda_malloc_async
44-
export XLA_FLAGS="${XLA_BASE_FLAGS}"
45-
46-
export NCCL_BUFFSIZE=8388608
47-
export NCCL_P2P_NET_CHUNKSIZE=524288
48-
export NCCL_LAUNCH_MODE=GROUP
49-
export NCCL_DEBUG=INFO
26+
CONFIG="fuji-3B-v3-flash-single-host"
27+
BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true
28+
--xla_gpu_enable_highest_priority_async_stream=true
29+
--xla_gpu_all_reduce_combine_threshold_bytes=1073741824
30+
--xla_gpu_all_gather_combine_threshold_bytes=1073741824
31+
--xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824
32+
--xla_gpu_enable_pipelined_all_gather=true
33+
--xla_gpu_enable_pipelined_reduce_scatter=true
34+
--xla_gpu_enable_pipelined_all_reduce=true
35+
--xla_gpu_enable_while_loop_double_buffering=true
36+
--xla_gpu_enable_triton_gemm=false
37+
--xla_gpu_enable_all_gather_combine_by_dim=false
38+
--xla_gpu_enable_reduce_scatter_combine_by_dim=false
39+
--xla_disable_hlo_passes=rematerialization}
5040
41+
export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}"
42+
5143
LOG_DIR=${BASEDIR}/logs
5244
TRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir
5345
mkdir -p ${TRAINER_DIR}

.github/workflows/_ci.yaml

+6-6
Original file line numberDiff line numberDiff line change
@@ -749,14 +749,14 @@ jobs:
749749
750750
# the fuji test will run for 20 minutes only; as of 2025-02-24
751751
# it is not possible to set the `max_steps` value
752-
test-axlearn-fuji-1B-eks:
752+
test-axlearn-fuji-3B-eks:
753753
needs: build-axlearn
754754
if: inputs.ARCHITECTURE == 'amd64'
755755
runs-on: eks
756756
env:
757757
AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
758-
JOB_NAME: axlearn-fuji-1b-${{ github.run_id }}
759-
TOKEN_NAME: axlearn-fuji-1b-${{ github.run_id }}-token
758+
JOB_NAME: axlearn-fuji-3b-${{ github.run_id }}
759+
TOKEN_NAME: axlearn-fuji-3b-${{ github.run_id }}-token
760760
steps:
761761
- name: Check out the repository
762762
uses: actions/checkout@v4
@@ -776,12 +776,12 @@ jobs:
776776
select(di == 0).metadata.name = strenv(JOB_NAME)
777777
| select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
778778
| select(di == 0).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \
779-
.github/eks-workflow-files/axlearn/axlearn-1B-model.yml
780-
git diff .github/eks-workflow-files/axlearn/axlearn-1B-model.yml
779+
.github/eks-workflow-files/axlearn/axlearn-3B-model.yml
780+
git diff .github/eks-workflow-files/axlearn/axlearn-3B-model.yml
781781
782782
- name: Submit & delete axlearn test
783783
uses: ./.github/actions/submit-delete-k8s-job
784784
with:
785-
job-config-file: ".github/eks-workflow-files/axlearn/axlearn-1B-model.yml"
785+
job-config-file: ".github/eks-workflow-files/axlearn/axlearn-3B-model.yml"
786786
job-name: ${{ env.JOB_NAME }}
787787

0 commit comments

Comments
 (0)