
Commit 94dbfd1

maanug-nv authored and jaredcasper committed
Fix distributed pretraining examples
1 parent 285068c commit 94dbfd1

6 files changed: +333 -212 lines

examples/pretrain_bert_distributed.sh (+52, -32)

@@ -1,5 +1,7 @@
 #!/bin/bash
 
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
 GPUS_PER_NODE=8
 # Change for multinode config
 MASTER_ADDR=localhost
@@ -8,37 +10,55 @@ NNODES=1
 NODE_RANK=0
 WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
 
-DATA_PATH=<Specify path and file prefix>_text_sentence
 CHECKPOINT_PATH=<Specify path>
+VOCAB_FILE=<Specify path to file>/bert-vocab.txt
+DATA_PATH=<Specify path and file prefix>_text_sentence
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+BERT_ARGS="
+    --num-layers 24 \
+    --hidden-size 1024 \
+    --num-attention-heads 16 \
+    --seq-length 512 \
+    --max-position-embeddings 512 \
+    --micro-batch-size 4 \
+    --global-batch-size 32 \
+    --lr 0.0001 \
+    --train-iters 1000000 \
+    --lr-decay-iters 990000 \
+    --lr-decay-style linear \
+    --min-lr 1.0e-5 \
+    --weight-decay 1e-2 \
+    --lr-warmup-fraction .01 \
+    --clip-grad 1.0 \
+    --fp16
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --vocab-file $VOCAB_FILE \
+    --data-impl mmap \
+    --split 949,50,1
+"
+
+OUTPUT_ARGS="
+    --log-interval 100 \
+    --save-interval 10000 \
+    --eval-interval 1000 \
+    --eval-iters 10
+"
 
-DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
-
-python -m torch.distributed.launch $DISTRIBUTED_ARGS \
-       pretrain_bert.py \
-       --num-layers 24 \
-       --hidden-size 1024 \
-       --num-attention-heads 16 \
-       --micro-batch-size 4 \
-       --global-batch-size 32 \
-       --seq-length 512 \
-       --max-position-embeddings 512 \
-       --train-iters 1000000 \
-       --save $CHECKPOINT_PATH \
-       --load $CHECKPOINT_PATH \
-       --data-path $DATA_PATH \
-       --vocab-file bert-vocab.txt \
-       --data-impl mmap \
-       --split 949,50,1 \
-       --distributed-backend nccl \
-       --lr 0.0001 \
-       --lr-decay-style linear \
-       --min-lr 1.0e-5 \
-       --lr-decay-iters 990000 \
-       --weight-decay 1e-2 \
-       --clip-grad 1.0 \
-       --lr-warmup-fraction .01 \
-       --log-interval 100 \
-       --save-interval 10000 \
-       --eval-interval 1000 \
-       --eval-iters 10 \
-       --fp16
+torchrun $DISTRIBUTED_ARGS pretrain_bert.py \
+    $BERT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH
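The launcher flags previously passed inline to python -m torch.distributed.launch are now grouped in DISTRIBUTED_ARGS and handed to torchrun. As a rough illustration of how the refactored script is meant to be adapted for multinode use (this sketch is not part of the commit; the host name and port are placeholders), a two-node run would only change the variables at the top of the script:

export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
MASTER_ADDR=node0.example.com   # placeholder: reachable address of the rank-0 node
MASTER_PORT=6000                # placeholder: any free port, identical on all nodes
NNODES=2
NODE_RANK=0                     # set to 1 when launching on the second node
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))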
examples/pretrain_bert_distributed_with_mp.sh (+54, -35)
@@ -1,5 +1,7 @@
 #!/bin/bash
 
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
 GPUS_PER_NODE=8
 # Change for multinode config
 MASTER_ADDR=localhost
@@ -8,40 +10,57 @@ NNODES=1
 NODE_RANK=0
 WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
 
-DATA_PATH=<Specify path and file prefix>_text_sentence
-VOCAB_FILE=<Specify path to vocab.txt>
 CHECKPOINT_PATH=<Specify path>
+VOCAB_FILE=<Specify path to file>/bert-vocab.txt
+DATA_PATH=<Specify path and file prefix>_text_sentence
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+BERT_ARGS="
+    --tensor-model-parallel-size 2 \
+    --pipeline-model-parallel-size 2 \
+    --num-layers 24 \
+    --hidden-size 1024 \
+    --num-attention-heads 16 \
+    --seq-length 512 \
+    --max-position-embeddings 512 \
+    --micro-batch-size 2 \
+    --global-batch-size 16 \
+    --lr 0.0001 \
+    --train-iters 1000000 \
+    --lr-decay-iters 990000 \
+    --lr-decay-style linear \
+    --min-lr 1.0e-5 \
+    --weight-decay 1e-2 \
+    --lr-warmup-fraction .01 \
+    --clip-grad 1.0 \
+    --fp16
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --vocab-file $VOCAB_FILE \
+    --data-impl mmap \
+    --split 949,50,1
+"
+
+OUTPUT_ARGS="
+    --log-interval 100 \
+    --save-interval 10000 \
+    --eval-interval 1000 \
+    --eval-iters 10
+"
 
-DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
-
-python -m torch.distributed.launch $DISTRIBUTED_ARGS \
-       pretrain_bert.py \
-       --tensor-model-parallel-size 2 \
-       --pipeline-model-parallel-size 2 \
-       --num-layers 24 \
-       --hidden-size 1024 \
-       --num-attention-heads 16 \
-       --micro-batch-size 2 \
-       --global-batch-size 16 \
-       --seq-length 512 \
-       --max-position-embeddings 512 \
-       --train-iters 1000000 \
-       --save $CHECKPOINT_PATH \
-       --load $CHECKPOINT_PATH \
-       --data-path $DATA_PATH \
-       --vocab-file $VOCAB_FILE \
-       --data-impl mmap \
-       --split 949,50,1 \
-       --distributed-backend nccl \
-       --lr 0.0001 \
-       --lr-decay-style linear \
-       --min-lr 1.0e-5 \
-       --lr-decay-iters 990000 \
-       --weight-decay 1e-2 \
-       --clip-grad 1.0 \
-       --lr-warmup-fraction .01 \
-       --log-interval 100 \
-       --save-interval 10000 \
-       --eval-interval 1000 \
-       --eval-iters 10 \
-       --fp16
+torchrun $DISTRIBUTED_ARGS pretrain_bert.py \
+    $BERT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH
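An informal sanity check of how the model-parallel sizes in this script compose (not part of the commit, just arithmetic over the arguments above):

# GPUs per model replica  = tensor-parallel (2) * pipeline-parallel (2)               = 4
# Data-parallel size      = GPUS_PER_NODE (8) / GPUs per replica (4)                  = 2
# Micro-batches per step  = global-batch (16) / (micro-batch (2) * data-parallel (2)) = 4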

examples/pretrain_gpt_distributed.sh (+55, -35)

@@ -1,7 +1,9 @@
-#! /bin/bash
+#!/bin/bash
 
 # Runs the "345M" parameter model
 
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
 GPUS_PER_NODE=8
 # Change for multinode config
 MASTER_ADDR=localhost
@@ -10,39 +12,57 @@ NNODES=1
 NODE_RANK=0
 WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
 
-DATA_PATH=<Specify path and file prefix>_text_document
 CHECKPOINT_PATH=<Specify path>
+VOCAB_FILE=<Specify path to file>/gpt2-vocab.json
+MERGE_FILE=<Specify path to file>/gpt2-merges.txt
+DATA_PATH=<Specify path and file prefix>_text_document
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+GPT_ARGS="
+    --num-layers 24 \
+    --hidden-size 1024 \
+    --num-attention-heads 16 \
+    --seq-length 1024 \
+    --max-position-embeddings 1024 \
+    --micro-batch-size 8 \
+    --global-batch-size 64 \
+    --lr 0.00015 \
+    --train-iters 500000 \
+    --lr-decay-iters 320000 \
+    --lr-decay-style cosine \
+    --min-lr 1.0e-5 \
+    --weight-decay 1e-2 \
+    --lr-warmup-fraction .01 \
+    --clip-grad 1.0 \
+    --fp16
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --vocab-file $VOCAB_FILE \
+    --merge-file $MERGE_FILE \
+    --data-impl mmap \
+    --split 949,50,1
+"
+
+OUTPUT_ARGS="
+    --log-interval 100 \
+    --save-interval 10000 \
+    --eval-interval 1000 \
+    --eval-iters 10
+"
 
-DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
-
-python -m torch.distributed.launch $DISTRIBUTED_ARGS \
-       pretrain_gpt.py \
-       --num-layers 24 \
-       --hidden-size 1024 \
-       --num-attention-heads 16 \
-       --micro-batch-size 8 \
-       --global-batch-size 64 \
-       --seq-length 1024 \
-       --max-position-embeddings 1024 \
-       --train-iters 500000 \
-       --lr-decay-iters 320000 \
-       --save $CHECKPOINT_PATH \
-       --load $CHECKPOINT_PATH \
-       --data-path $DATA_PATH \
-       --vocab-file gpt2-vocab.json \
-       --merge-file gpt2-merges.txt \
-       --data-impl mmap \
-       --split 949,50,1 \
-       --distributed-backend nccl \
-       --lr 0.00015 \
-       --lr-decay-style cosine \
-       --min-lr 1.0e-5 \
-       --weight-decay 1e-2 \
-       --clip-grad 1.0 \
-       --lr-warmup-fraction .01 \
-       --activations-checkpoint-method uniform \
-       --log-interval 100 \
-       --save-interval 10000 \
-       --eval-interval 1000 \
-       --eval-iters 10 \
-       --fp16
+torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH
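For context (a hedged note, not text from the commit): python -m torch.distributed.launch is deprecated in recent PyTorch releases, and torchrun accepts the same per-node rendezvous flags used in DISTRIBUTED_ARGS, so the launch-line change is mechanical:

# old launcher, removed by this commit:
#   python -m torch.distributed.launch $DISTRIBUTED_ARGS pretrain_gpt.py ...
# new launcher used by the updated scripts:
#   torchrun $DISTRIBUTED_ARGS pretrain_gpt.py ...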
