13
13
spec :
14
14
restartPolicy : Never
15
15
containers :
16
- - name : axlearn-fuji-1B
16
+ - name : axlearn-fuji-3B
17
17
image : PLACEHOLDER
18
18
command :
19
19
- bash
@@ -23,31 +23,23 @@ spec:
23
23
- |
24
24
25
25
BASEDIR="/opt/axlearn"
26
- CONFIG="fuji-1B-v3-flash-single-host"
27
- HLO_DUMP=0
28
- POSTFIX=""
29
-
30
- AR_THRESHOLD=1073741824
31
- AG_THRESHOLD=8589934592
32
- RS_THRESHOLD=8589934592
33
- XLA_BASE_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
34
- --xla_gpu_enable_triton_gemm=false
35
- --xla_gpu_enable_highest_priority_async_stream=true
36
- --xla_gpu_all_gather_combine_threshold_bytes=${AG_THRESHOLD}
37
- --xla_gpu_reduce_scatter_combine_threshold_bytes=${RS_THRESHOLD}
38
- --xla_gpu_enable_pipelined_all_gather=true
39
- --xla_gpu_enable_pipelined_reduce_scatter=true
40
- --xla_gpu_enable_nccl_comm_splitting=false"
41
-
42
- export XLA_PYTHON_CLIENT_PREALLOCATE=false
43
- export TF_GPU_ALLOCATOR=cuda_malloc_async
44
- export XLA_FLAGS="${XLA_BASE_FLAGS}"
45
-
46
- export NCCL_BUFFSIZE=8388608
47
- export NCCL_P2P_NET_CHUNKSIZE=524288
48
- export NCCL_LAUNCH_MODE=GROUP
49
- export NCCL_DEBUG=INFO
26
+ CONFIG="fuji-3B-v3-flash-single-host"
27
+ BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true
28
+ --xla_gpu_enable_highest_priority_async_stream=true
29
+ --xla_gpu_all_reduce_combine_threshold_bytes=1073741824
30
+ --xla_gpu_all_gather_combine_threshold_bytes=1073741824
31
+ --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824
32
+ --xla_gpu_enable_pipelined_all_gather=true
33
+ --xla_gpu_enable_pipelined_reduce_scatter=true
34
+ --xla_gpu_enable_pipelined_all_reduce=true
35
+ --xla_gpu_enable_while_loop_double_buffering=true
36
+ --xla_gpu_enable_triton_gemm=false
37
+ --xla_gpu_enable_all_gather_combine_by_dim=false
38
+ --xla_gpu_enable_reduce_scatter_combine_by_dim=false
39
+ --xla_disable_hlo_passes=rematerialization}
50
40
41
+ export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}"
42
+
51
43
LOG_DIR=${BASEDIR}/logs
52
44
TRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir
53
45
mkdir -p ${TRAINER_DIR}
0 commit comments