Commit
* update
* update
* update
* fix
* add comment explaining shifted_sigmas
* update
Showing 14 changed files with 840 additions and 10 deletions.
examples/training/sft/cogview4/raider_white_tarot/train.sh (162 additions, 0 deletions)
@@ -0,0 +1,162 @@
#!/bin/bash

set -e -x

# export TORCH_LOGS="+dynamo,recompiles,graph_breaks"
# export TORCHDYNAMO_VERBOSE=1
export WANDB_MODE="offline"
export NCCL_P2P_DISABLE=1
export TORCH_NCCL_ENABLE_MONITORING=0
export FINETRAINERS_LOG_LEVEL="DEBUG"

# Finetrainers supports multiple backends for distributed training. Select your favourite and benchmark the differences!
# BACKEND="accelerate"
BACKEND="ptd"

# In this setting, I'm using 2 GPUs on a 4-GPU node for training
NUM_GPUS=2
CUDA_VISIBLE_DEVICES="2,3"

# Check the JSON files for the expected JSON format
TRAINING_DATASET_CONFIG="examples/training/sft/cogview4/raider_white_tarot/training.json"
VALIDATION_DATASET_FILE="examples/training/sft/cogview4/raider_white_tarot/validation.json"

# Depending on how many GPUs you have available, choose your degree of parallelism and technique!
DDP_1="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 1 --cp_degree 1 --tp_degree 1"
DDP_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 2 --dp_shards 1 --cp_degree 1 --tp_degree 1"
DDP_4="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 4 --dp_shards 1 --cp_degree 1 --tp_degree 1"
FSDP_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 2 --cp_degree 1 --tp_degree 1"
FSDP_4="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 4 --cp_degree 1 --tp_degree 1"
HSDP_2_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 2 --dp_shards 2 --cp_degree 1 --tp_degree 1"

# Parallel arguments
parallel_cmd=(
  $DDP_2
)

# Model arguments
model_cmd=(
  --model_name "cogview4"
  --pretrained_model_name_or_path "THUDM/CogView4-6B"
)

# Dataset arguments
# Here, we know that the dataset size is about 80 images. In `training.json`, we duplicate the same
# dataset 3 times for multi-resolution training. This gives us a total of about 240 images. Since
# we're using 2 GPUs for training, we can split the data into 120 images per GPU and precompute
# all embeddings at once, instead of doing it on-the-fly, which would be slower (the ideal use case
# for not using `--precomputation_once` is when you're training on large datasets).
dataset_cmd=(
  --dataset_config $TRAINING_DATASET_CONFIG
  --dataset_shuffle_buffer_size 32
  --precomputation_items 120
  --precomputation_once
)

# Dataloader arguments
dataloader_cmd=(
  --dataloader_num_workers 0
)

# Diffusion arguments
diffusion_cmd=(
  --flow_weighting_scheme "logit_normal"
)

# Training arguments
# We target just the attention projection layers for LoRA training here.
# You can modify this as you please and target any layer (regex is supported).
training_cmd=(
  --training_type "lora"
  --seed 42
  --batch_size 1
  --train_steps 5000
  --rank 32
  --lora_alpha 32
  --target_modules "transformer_blocks.*(to_q|to_k|to_v|to_out.0)"
  --gradient_accumulation_steps 1
  --gradient_checkpointing
  --checkpointing_steps 1000
  --checkpointing_limit 2
  # --resume_from_checkpoint 3000
  --enable_slicing
  --enable_tiling
)

# Optimizer arguments
optimizer_cmd=(
  --optimizer "adamw"
  --lr 3e-5
  --lr_scheduler "constant_with_warmup"
  --lr_warmup_steps 1000
  --lr_num_cycles 1
  --beta1 0.9
  --beta2 0.99
  --weight_decay 1e-4
  --epsilon 1e-8
  --max_grad_norm 1.0
)

# Validation arguments
validation_cmd=(
  --validation_dataset_file "$VALIDATION_DATASET_FILE"
  --validation_steps 500
)

# Miscellaneous arguments
miscellaneous_cmd=(
  --tracker_name "finetrainers-cogview4"
  --output_dir "/raid/aryan/cogview4"
  --init_timeout 600
  --nccl_timeout 600
  --report_to "wandb"
)

# Execute the training script
if [ "$BACKEND" == "accelerate" ]; then

  ACCELERATE_CONFIG_FILE=""
  if [ "$NUM_GPUS" == 1 ]; then
    ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_1.yaml"
  elif [ "$NUM_GPUS" == 2 ]; then
    ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_2.yaml"
  elif [ "$NUM_GPUS" == 4 ]; then
    ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_4.yaml"
  elif [ "$NUM_GPUS" == 8 ]; then
    ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_8.yaml"
  fi

  accelerate launch --config_file "$ACCELERATE_CONFIG_FILE" --gpu_ids $CUDA_VISIBLE_DEVICES train.py \
    "${parallel_cmd[@]}" \
    "${model_cmd[@]}" \
    "${dataset_cmd[@]}" \
    "${dataloader_cmd[@]}" \
    "${diffusion_cmd[@]}" \
    "${training_cmd[@]}" \
    "${optimizer_cmd[@]}" \
    "${validation_cmd[@]}" \
    "${miscellaneous_cmd[@]}"

elif [ "$BACKEND" == "ptd" ]; then

  export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES

  torchrun \
    --standalone \
    --nnodes=1 \
    --nproc_per_node=$NUM_GPUS \
    --rdzv_backend c10d \
    --rdzv_endpoint="localhost:0" \
    train.py \
    "${parallel_cmd[@]}" \
    "${model_cmd[@]}" \
    "${dataset_cmd[@]}" \
    "${dataloader_cmd[@]}" \
    "${diffusion_cmd[@]}" \
    "${training_cmd[@]}" \
    "${optimizer_cmd[@]}" \
    "${validation_cmd[@]}" \
    "${miscellaneous_cmd[@]}"
fi

echo -ne "-------------------- Finished executing script --------------------\n\n"
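For reference, here is a minimal sketch (not part of the committed script) of where the `--precomputation_items 120` figure comes from and how the script might be launched. The dataset size of roughly 80 images and the repository-root working directory are assumptions taken from the comments in this commit.

# Sketch only: reproduce the per-GPU precomputation arithmetic from the comments in train.sh.
NUM_GPUS=2                  # matches the 2-GPU setting used above
IMAGES_PER_DATASET=80       # approximate dataset size (assumption from the comment)
DATASET_COPIES=3            # training.json lists the same dataset three times
TOTAL_ITEMS=$((IMAGES_PER_DATASET * DATASET_COPIES))          # about 240 images in total
echo "precomputation_items per GPU: $((TOTAL_ITEMS / NUM_GPUS))"   # prints 120

# Assuming you are at the repository root, the script can then be run directly:
bash examples/training/sft/cogview4/raider_white_tarot/train.sh

To try a different parallelism layout, swap `$DDP_2` inside `parallel_cmd` for one of the other predefined options such as `$FSDP_2` or `$HSDP_2_2`.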
examples/training/sft/cogview4/raider_white_tarot/training.json (34 additions, 0 deletions)
@@ -0,0 +1,34 @@
{
  "datasets": [
    {
      "data_root": "multimodalart/1920-raider-waite-tarot-public-domain",
      "dataset_type": "image",
      "id_token": "TRTCRD",
      "image_resolution_buckets": [
        [1280, 720]
      ],
      "reshape_mode": "bicubic",
      "remove_common_llm_caption_prefixes": true
    },
    {
      "data_root": "multimodalart/1920-raider-waite-tarot-public-domain",
      "dataset_type": "image",
      "id_token": "TRTCRD",
      "image_resolution_buckets": [
        [512, 512]
      ],
      "reshape_mode": "center_crop",
      "remove_common_llm_caption_prefixes": true
    },
    {
      "data_root": "multimodalart/1920-raider-waite-tarot-public-domain",
      "dataset_type": "image",
      "id_token": "TRTCRD",
      "image_resolution_buckets": [
        [768, 768]
      ],
      "reshape_mode": "center_crop",
      "remove_common_llm_caption_prefixes": true
    }
  ]
}
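Each of the three entries points at the same Hugging Face dataset (multimodalart/1920-raider-waite-tarot-public-domain) with a different resolution bucket, which is the multi-resolution duplication the train.sh comment refers to. As an optional sanity check before launching (not part of the commit, and assuming a python3 interpreter is on PATH), the config can be parsed with standard tooling:

# Validate the JSON syntax of the dataset config.
python3 -m json.tool examples/training/sft/cogview4/raider_white_tarot/training.json > /dev/null \
  && echo "training.json parses correctly"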
examples/training/sft/cogview4/raider_white_tarot/validation.json (68 additions, 0 deletions)
@@ -0,0 +1,68 @@
{
  "data": [
    {
      "caption": "TRTCRD a trtcrd of a knight mounting a running horse wearing an armor and holding a staff, \"knight of wands\"",
      "image_path": null,
      "video_path": null,
      "num_inference_steps": 50,
      "height": 1280,
      "width": 720
    },
    {
      "caption": "TRTCRD a trtcrd of a woman sitting on a throne, wearing a crown and holding a trophee, \"queen of cups\"",
      "image_path": null,
      "video_path": null,
      "num_inference_steps": 50,
      "height": 1280,
      "width": 720
    },
    {
      "caption": "TRTCRD a trtcrd of a knight holding the cup while mounts on a stationary horse",
      "image_path": null,
      "video_path": null,
      "num_inference_steps": 50,
      "height": 1280,
      "width": 720
    },
    {
      "caption": "TRTCRD a trtcrd of a person in a red robe holding a scale and giving coins to two kneeling figures, surrounded by six pentacles",
      "image_path": null,
      "video_path": null,
      "num_inference_steps": 50,
      "height": 1280,
      "width": 720
    },
    {
      "caption": "TRTCRD a trtcrd of a knight holding the cup while mounts on a stationary horse",
      "image_path": null,
      "video_path": null,
      "num_inference_steps": 50,
      "height": 512,
      "width": 512
    },
    {
      "caption": "TRTCRD a trtcrd of a person in a red robe holding a scale and giving coins to two kneeling figures, surrounded by six pentacles",
      "image_path": null,
      "video_path": null,
      "num_inference_steps": 50,
      "height": 512,
      "width": 512
    },
    {
      "caption": "TRTCRD a trtcrd of a knight holding the cup while mounts on a stationary horse",
      "image_path": null,
      "video_path": null,
      "num_inference_steps": 50,
      "height": 768,
      "width": 768
    },
    {
      "caption": "TRTCRD a trtcrd of a person in a red robe holding a scale and giving coins to two kneeling figures, surrounded by six pentacles",
      "image_path": null,
      "video_path": null,
      "num_inference_steps": 50,
      "height": 768,
      "width": 768
    }
  ]
}
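Every validation caption starts with the same TRTCRD token that training.json sets as id_token, so the prompts exercise the learned concept at validation time. A small optional check (assuming GNU grep is available; not part of the commit) counts the lines mentioning the token:

# Count the validation entries that mention the TRTCRD trigger token (expect 8).
grep -c "TRTCRD" examples/training/sft/cogview4/raider_white_tarot/validation.json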
@@ -0,0 +1 @@
from .base_specification import CogView4ModelSpecification