Commit d1a36e5

Replace redundant preprocess_data.py with preprocess_data_partitions.py

1 parent: 05fb948

12 files changed, +793 −450 lines

.gitlab-ci.yml (+8 −8)

(The removed and added lines in each hunk below render identically, so these changes appear to be whitespace-only.)

@@ -40,7 +40,7 @@ unit_tests:
    - export BUILD_DIR=`pwd`
    - export RUN_NAME=resume_${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes
    - echo "In case of error check ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs."
-   - export TP_SIZE PP_SIZE NUM_NODES MAX_STEPS
+   - export TP_SIZE PP_SIZE NUM_NODES MAX_STEPS
    - export DATA_DIR=$DATA_DIR
    - echo "Run name is $RUN_NAME"
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints

@@ -65,7 +65,7 @@ unit_tests:
    # Gitlab logs collapsible section markers
    - echo -e "\e[0Ksection_end:`date +%s`:slurm_setup\r\e[0K"
    # Follow output of the job
-   - echo "Finished job"
+   - echo "Finished job"
    - export SLURM_STATE=$(sacct -j "${SLURM_JOBID}" --format State --parsable2 --noheader |& head -n 1)
    - echo "Slurm job state $SLURM_STATE"
    - if [[ "$SLURM_STATE" != "COMPLETED" ]]; then echo "Slurm job did not complete. See ${SELENE_ADLR_CI_PATH}/${CI_PIPELINE_ID}/${RUN_NAME}/results directory for result logs. Skipping pytest."; exit 1; fi

@@ -79,7 +79,7 @@ unit_tests:
    - if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING'
      when: always
    - if: $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
-     when: always
+     when: always
  allow_failure: false

 .selene_test_launcher: &selene-test-launcher

@@ -146,7 +146,7 @@ unit_tests:
    - if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING'
      when: always
    - if: $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
-     when: always
+     when: always
  allow_failure: false

 train.te_gpt3.345m_tp2_pp2_1node_50steps:

@@ -199,7 +199,7 @@ train.gpt3.345m_tp1_pp2_1node_50steps:
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
-   TEST_LEVEL: L0
+   TEST_LEVEL: L0

 train.gpt3.345m_tp1_pp4_1node_50steps:
   <<: *selene-test-launcher

@@ -224,7 +224,7 @@ resume.checkpoint.gpt3.345m_tp1_pp2_1node:
    PP_SIZE: 2
    NUM_NODES: 1
    TIME_LIMIT: "30:00"
-   TEST_LEVEL: L0
+   TEST_LEVEL: L0

 train.bert.345m_tp4_pp1_1node_50steps:
   <<: *selene-test-launcher

@@ -260,7 +260,7 @@ train.bert.345m_tp1_pp2_1node_50steps:
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
-   TEST_LEVEL: L0
+   TEST_LEVEL: L0

 train.bert.345m_tp1_pp4_1node_50steps:
   <<: *selene-test-launcher

@@ -284,7 +284,7 @@ resume.checkpoint.bert.345m_tp1_pp2_1node:
    PP_SIZE: 2
    NUM_NODES: 1
    TIME_LIMIT: "30:00"
-   TEST_LEVEL: L0
+   TEST_LEVEL: L0

 cleanup.selene:
   tags:

README.md (+3 −4)

@@ -102,13 +102,12 @@ The training data requires preprocessing. First, place your training data in a l…

 The name of the `text` field of the json can be changed by using the `--json-key` flag in [`preprocess_data.py`](./tools/preprocess_data.py). The other metadata are optional and are not used in training.

-The loose json is then processed into a binary format for training. To convert the json into mmap, cached index file, or the lazy loader format use `preprocess_data.py`. Set the `--dataset-impl` flag to `mmap`, `cached`, or `lazy`, respectively (default is `mmap`). An example script to prepare data for BERT training is:
+The loose json is then processed into a binary format for training. To convert the json into mmap format use `preprocess_data.py`. An example script to prepare data for BERT training is:
 <pre>
 python tools/preprocess_data.py \
        --input my-corpus.json \
        --output-prefix my-bert \
-       --vocab bert-vocab.txt \
-       --dataset-impl mmap \
+       --vocab-file bert-vocab.txt \
        --tokenizer-type BertWordPieceLowerCase \
        --split-sentences
 </pre>

@@ -125,7 +124,7 @@ Some minor modifications are required for GPT data preprocessing, namely, the ad…
 python tools/preprocess_data.py \
        --input my-corpus.json \
        --output-prefix my-gpt2 \
-       --vocab gpt2-vocab.json \
+       --vocab-file gpt2-vocab.json \
        --dataset-impl mmap \
        --tokenizer-type GPT2BPETokenizer \
        --merge-file gpt2-merges.txt \
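
For readers following along, the "loose json" input that `preprocess_data.py` consumes is one JSON object per line, with the sample text under the `text` key (or whatever key `--json-key` names). A minimal sketch, assuming the vocab file from the README example is available; the extra fields are illustrative only:

# Create a tiny one-line "loose JSON" corpus; only the "text" field is required
# (its name can be changed via --json-key), the other fields are illustrative.
cat > my-corpus.json <<'EOF'
{"id": "0", "title": "Sample", "text": "Megatron-LM preprocesses raw text into an indexed binary dataset."}
EOF

# Run the BERT preprocessing command from the updated README.
python tools/preprocess_data.py \
       --input my-corpus.json \
       --output-prefix my-bert \
       --vocab-file bert-vocab.txt \
       --tokenizer-type BertWordPieceLowerCase \
       --split-sentences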

cluster_scripts/debug_gpt3.sh (new file, +69)

#! /bin/bash


NAME=gpt3_126m_2_2_debug
BASE_DIR=/lustre/fsw/adlr/adlr-nlp/jbarker/next-llm/source
SCRIPTS=${BASE_DIR}/scripts
MEGATRON=${BASE_DIR}/megatron-lm
OUTPUT_DIR=${BASE_DIR}/output/debug
LOGDIR=${OUTPUT_DIR}/logs/${NAME}
CHECKPOINT_DIR=/lustre/fsw/adlr/adlr-nlp/jbarker/checkpoints/${NAME}
TENSORBOARD_DIR=${OUTPUT_DIR}/tensorboard/${NAME}

WORLD_SIZE=8

# Get the data blend
. /lustre/fsw/adlr/adlr-nlp-large/data/gpt3/gpt3_blend.sh

TRAIN_COMMAND=(
    ${MEGATRON}/pretrain_gpt.py
    --exit-duration-in-mins 230
    --tensor-model-parallel-size 1
    --pipeline-model-parallel-size 8
    --num-layers 24
    --hidden-size 768
    --num-attention-heads 12
    --seq-length 2048
    --max-position-embeddings 2048
    --micro-batch-size 1
    --global-batch-size 8
    --train-samples 192000000
    --lr-decay-samples 166400000
    --lr-warmup-samples 162761
    --lr 6.0e-4
    --min-lr 6.0e-5
    --lr-decay-style cosine
    --log-interval 10
    --exit-interval 1000
    --log-num-zeros-in-grad
    --eval-iters 200
    --eval-interval 2000
    --data-path ${DATA_BLEND}
    --vocab-file /lustre/fsw/adlr/adlr-nlp-large/data/bpe/gpt2-vocab.json
    --merge-file /lustre/fsw/adlr/adlr-nlp-large/data/bpe/gpt2-merges.txt
    --split 98,2,0
    --clip-grad 1.0
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.023
    --log-params-norm
    --log-num-zeros-in-grad
    --timing-log-level 0
    --bf16
    --DDP-impl local
    --save-interval 1000
    --save ${CHECKPOINT_DIR}
)

# --num-layers-per-virtual-pipeline-stage 1

# --use-flash-attn

# --load ${CHECKPOINT_DIR}

CUDA_DEVICE_MAX_CONNECTIONS=1 \
torchrun --nproc_per_node ${WORLD_SIZE} ${TRAIN_COMMAND[*]}

# --global-batch-size 256
# --rampup-batch-size 32 32 1953125
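
The script sources gpt3_blend.sh to populate DATA_BLEND, which is then passed to --data-path; that file is not part of this commit. A hypothetical stand-in, with illustrative weights and dataset prefixes, might look like:

# Hypothetical stand-in for gpt3_blend.sh (not in this commit): --data-path is
# expected to take either a single dataset prefix or a weighted blend of
# "<weight> <prefix>" pairs, so the sourced file only needs to define DATA_BLEND.
DATA_BLEND="0.7 /path/to/corpus-a_text_document 0.3 /path/to/corpus-b_text_document"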

cluster_scripts/debug_nextllm.sh (new file, +78)

#! /bin/bash

export CUBLAS_WORKSPACE_CONFIG=:16:8

NAME=nextllm_determinism_debug
BASE_DIR=/lustre/fsw/adlr/adlr-nlp/jbarker/next-llm
SCRIPTS=${BASE_DIR}/scripts
MEGATRON=${BASE_DIR}/source/megatron-lm
OUTPUT_DIR=${BASE_DIR}/output/debug
LOGDIR=${OUTPUT_DIR}/logs/${NAME}
CHECKPOINT_DIR=/lustre/fsw/adlr/adlr-nlp/jbarker/checkpoints/${NAME}
TENSORBOARD_DIR=${OUTPUT_DIR}/tensorboard/${NAME}

WORLD_SIZE=8

# Get the data blend
. /lustre/fsw/adlr/adlr-nlp/data/pile-cc1-cc2-shuf/gpt3_blend.sh

BPE_DIR="/lustre/fsw/adlr/adlr-nlp/data/pile-cc1-cc2-shuf/bpe"

TRAIN_COMMAND=(
    ${MEGATRON}/pretrain_gpt.py
    --exit-duration-in-mins 230 \
    --tensor-model-parallel-size 8 \
    --pipeline-model-parallel-size 8 \
    #--num-layers-per-virtual-pipeline-stage 1 \
    --recompute-activations \
    --sequence-parallel \
    --num-layers 24 \
    --hidden-size 768 \
    --num-attention-heads 24 \
    --seq-length 2048 \
    --max-position-embeddings 2048 \
    --micro-batch-size 1 \
    --global-batch-size 8 \
    --train-samples 192000000 \
    --lr-decay-samples 166400000 \
    --lr-warmup-samples 244141 \
    --lr 1.0e-4 \
    --min-lr 1.0e-5 \
    --lr-decay-style cosine \
    --log-interval 1 \
    --eval-iters 50 \
    --eval-interval 2000 \
    --data-path ${DATA_BLEND} \
    --vocab-file ${BPE_DIR}/gpt2-vocab.json \
    --merge-file ${BPE_DIR}/gpt2-merges.txt \
    --save-interval 20000 \
    --save ${CHECKPOINT_DIR} \
    --load ${CHECKPOINT_DIR} \
    --exit-interval 1 \
    --split 98,2,0 \
    --clip-grad 1.0 \
    --weight-decay 0.1 \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --init-method-std 0.01 \
    --log-params-norm \
    --log-num-zeros-in-grad \
    --bf16 \
    --DDP-impl local \
    --tensorboard-dir ${TENSORBOARD_DIR} \
    --timing-log-level 1 \
    --timing-log-option minmax \
)

# --num-layers-per-virtual-pipeline-stage 1

# --use-flash-attn

# --load ${CHECKPOINT_DIR}

CUDA_DEVICE_MAX_CONNECTIONS=1 \
CUBLAS_WORKSPACE_CONFIG=:16:8 \
torchrun --nproc_per_node ${WORLD_SIZE} ${TRAIN_COMMAND[*]}

# --global-batch-size 256
# --rampup-batch-size 32 32 1953125

New file, +93 lines (file name not shown in this view):

#!/bin/bash

#SBATCH -p luna -A adlr -t 04:00:00 --dependency=singleton --nodes=1 --exclusive --mem=0 --overcommit --ntasks-per-node=8 --job-name=adlr-nlp:foundation-model-medium_dp1_adaptve_routing-22.12-noflash-repeat

export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_SL=1

BRANCH=${1}
COMMIT=${2}
CONTAINER=${3}
NUMBER=${4}

NAME="foundation-model-medium_dp1_adaptive_routing-22.12-noflash-${NUMBER}"

SOURCE="/lustre/fsw/adlr/adlr-nlp/jbarker/next-llm/source/megatron-lm"
OUTPUT="/lustre/fsw/adlr/adlr-nlp/jbarker/next-llm/output/pretraining.${BRANCH}.${COMMIT}.${CONTAINER}/${NAME}/"

SCRIPTS_DIR="/lustre/fsw/adlr/adlr-nlp/jbarker/next-llm/source/"

CHECKPOINTS_DIR="${OUTPUT}/checkpoints"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
LOGS_DIR="${OUTPUT}/logs"

mkdir -p ${CHECKPOINTS_DIR}
mkdir -p ${TENSORBOARD_DIR}
mkdir -p ${LOGS_DIR}

# CHECKPOINT_DIR="/lustre/fsw/adlr/adlr-nlp/mshoeybi/checkpoints/foundation_model/speed/${NAME}"

# Get the data blend
. /lustre/fsw/adlr/adlr-nlp/data/pile-cc1-cc2-shuf/gpt3_blend.sh

BPE_DIR="/lustre/fsw/adlr/adlr-nlp/data/pile-cc1-cc2-shuf/bpe"

# --num-layers-per-virtual-pipeline-stage 3 \

options=" \
    --exit-duration-in-mins 230 \
    --exit-interval 100000 \
    --tensor-model-parallel-size 8 \
    --pipeline-model-parallel-size 1 \
    --recompute-activations \
    --sequence-parallel \
    --num-layers 12 \
    --hidden-size 8192 \
    --num-attention-heads 64 \
    --seq-length 2048 \
    --max-position-embeddings 2048 \
    --micro-batch-size 1 \
    --global-batch-size 16 \
    --train-samples 192000000 \
    --lr-decay-samples 166400000 \
    --lr-warmup-samples 244141 \
    --lr 1.0e-4 \
    --min-lr 1.0e-5 \
    --lr-decay-style cosine \
    --log-interval 1 \
    --eval-iters 50 \
    --eval-interval 2000 \
    --data-path ${DATA_BLEND} \
    --vocab-file ${BPE_DIR}/gpt2-vocab.json \
    --merge-file ${BPE_DIR}/gpt2-merges.txt \
    --save-interval 2000 \
    --save ${CHECKPOINTS_DIR} \
    --load ${CHECKPOINTS_DIR} \
    --split 98,2,0 \
    --clip-grad 1.0 \
    --weight-decay 0.1 \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --init-method-std 0.01 \
    --log-params-norm \
    --log-num-zeros-in-grad \
    --bf16 \
    --DDP-impl local \
    --tensorboard-dir ${TENSORBOARD_DIR} \
    --timing-log-level 1 \
    --timing-log-option minmax \
"

run_cmd="${SCRIPTS_DIR}/bind.sh --cpu=${SCRIPTS_DIR}/dgxa100_ccx.sh --mem=${SCRIPTS_DIR}/dgxa100_ccx.sh python -u ${SOURCE}/pretrain_gpt.py ${options}"

DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`

# --container-image "gitlab-master.nvidia.com/adlr/megatron-lm/pytorch_flash_att:22.12-py3" \

srun -l \
     --container-image nvcr.io#nvidia/pytorch:22.09-py3 \
     --container-mounts "/lustre/fsw/adlr:/lustre/fsw/adlr" \
     --output=${LOGS_DIR}/%x_%j_$DATETIME.log sh -c "${run_cmd}"

set +x
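
The SBATCH script above reads its branch, commit, container tag, and run number from positional parameters ${1}–${4}. A hedged usage sketch; the script's file name is not visible in this view, so the path below is a placeholder, and the argument values are illustrative:

# Illustrative only: submit the job, passing the four positional arguments
# (BRANCH, COMMIT, CONTAINER, NUMBER) that the script reads; the path is a placeholder.
sbatch cluster_scripts/<name-of-this-script>.sh my-branch d1a36e5 22.09-py3 1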
