Add 3DGS dataset example for Wan (#290)

a-r-r-o-w · Mar 4, 2025 · 17e8481 · 17e8481
1 parent ea69aaf
commit 17e8481
Show file tree

Hide file tree

Showing 3 changed files with 244 additions and 0 deletions.
diff --git a/examples/training/sft/wan/3dgs_dissolve/train.sh b/examples/training/sft/wan/3dgs_dissolve/train.sh
@@ -0,0 +1,162 @@
+#!/bin/bash
+
+set -e -x
+
+# export TORCH_LOGS="+dynamo,recompiles,graph_breaks"
+# export TORCHDYNAMO_VERBOSE=1
+export WANDB_MODE="offline"
+export NCCL_P2P_DISABLE=1
+export TORCH_NCCL_ENABLE_MONITORING=0
+export FINETRAINERS_LOG_LEVEL="DEBUG"
+
+# Finetrainers supports multiple backends for distributed training. Select your favourite and benchmark the differences!
+# BACKEND="accelerate"
+BACKEND="ptd"
+
+# In this setting, I'm using 2 GPUs on a 4-GPU node for training
+NUM_GPUS=2
+CUDA_VISIBLE_DEVICES="2,3"
+
+# Check the JSON files for the expected JSON format
+TRAINING_DATASET_CONFIG="examples/training/sft/wan/3dgs_dissolve/training.json"
+VALIDATION_DATASET_FILE="examples/training/sft/wan/3dgs_dissolve/validation.json"
+
+# Depending on how many GPUs you have available, choose your degree of parallelism and technique!
+DDP_1="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 1 --cp_degree 1 --tp_degree 1"
+DDP_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 2 --dp_shards 1 --cp_degree 1 --tp_degree 1"
+DDP_4="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 4 --dp_shards 1 --cp_degree 1 --tp_degree 1"
+FSDP_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 2 --cp_degree 1 --tp_degree 1"
+FSDP_4="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 4 --cp_degree 1 --tp_degree 1"
+HSDP_2_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 2 --dp_shards 2 --cp_degree 1 --tp_degree 1"
+
+# Parallel arguments
+parallel_cmd=(
+  $DDP_2
+)
+
+# Model arguments
+model_cmd=(
+  --model_name "wan"
+  --pretrained_model_name_or_path "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
+)
+
+# Dataset arguments
+# Here, we know that the dataset size is roughly 100 videos. Since we're using 2 GPUs, we precompute
+# embeddings of 50 dataset items per GPU. Also, we're using a very small dataset for finetuning, so
+# we are okay with precomputing embeddings once and re-using them without having to worry about disk
+# space. Currently, however, every new training run performs precomputation even if it's not required
+# (which is something we have to improve [TODO(aryan)])
+dataset_cmd=(
+  --dataset_config $TRAINING_DATASET_CONFIG
+  --dataset_shuffle_buffer_size 10
+  --precomputation_items 100
+  --precomputation_once
+)
+
+# Dataloader arguments
+dataloader_cmd=(
+  --dataloader_num_workers 0
+)
+
+# Diffusion arguments
+diffusion_cmd=(
+  --flow_weighting_scheme "logit_normal"
+)
+
+# Training arguments
+# We target just the attention projection layers for LoRA training here.
+# You can modify this as you please and target any layer (regex is supported).
+training_cmd=(
+  --training_type "lora"
+  --seed 42
+  --batch_size 1
+  --train_steps 5000
+  --rank 32
+  --lora_alpha 32
+  --target_modules "blocks.*(to_q|to_k|to_v|to_out.0)"
+  --gradient_accumulation_steps 1
+  --gradient_checkpointing
+  --checkpointing_steps 500
+  --checkpointing_limit 2
+  # --resume_from_checkpoint 3000
+  --enable_slicing
+  --enable_tiling
+)
+
+# Optimizer arguments
+optimizer_cmd=(
+  --optimizer "adamw"
+  --lr 5e-5
+  --lr_scheduler "constant_with_warmup"
+  --lr_warmup_steps 1000
+  --lr_num_cycles 1
+  --beta1 0.9
+  --beta2 0.99
+  --weight_decay 1e-4
+  --epsilon 1e-8
+  --max_grad_norm 1.0
+)
+
+# Validation arguments
+validation_cmd=(
+  --validation_dataset_file "$VALIDATION_DATASET_FILE"
+  --validation_steps 500
+)
+
+# Miscellaneous arguments
+miscellaneous_cmd=(
+  --tracker_name "finetrainers-wan"
+  --output_dir "/raid/aryan/wan"
+  --init_timeout 600
+  --nccl_timeout 600
+  --report_to "wandb"
+)
+
+# Execute the training script
+if [ "$BACKEND" == "accelerate" ]; then
+
+  ACCELERATE_CONFIG_FILE=""
+  if [ "$NUM_GPUS" == 1 ]; then
+    ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_1.yaml"
+  elif [ "$NUM_GPUS" == 2 ]; then
+    ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_2.yaml"
+  elif [ "$NUM_GPUS" == 4 ]; then
+    ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_4.yaml"
+  elif [ "$NUM_GPUS" == 8 ]; then
+    ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_8.yaml"
+  fi
+
+  accelerate launch --config_file "$ACCELERATE_CONFIG_FILE" --gpu_ids $CUDA_VISIBLE_DEVICES train.py \
+    "${parallel_cmd[@]}" \
+    "${model_cmd[@]}" \
+    "${dataset_cmd[@]}" \
+    "${dataloader_cmd[@]}" \
+    "${diffusion_cmd[@]}" \
+    "${training_cmd[@]}" \
+    "${optimizer_cmd[@]}" \
+    "${validation_cmd[@]}" \
+    "${miscellaneous_cmd[@]}"
+
+elif [ "$BACKEND" == "ptd" ]; then
+
+  export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
+
+  torchrun \
+    --standalone \
+    --nnodes=1 \
+    --nproc_per_node=$NUM_GPUS \
+    --rdzv_backend c10d \
+    --rdzv_endpoint="localhost:0" \
+    train.py \
+      "${parallel_cmd[@]}" \
+      "${model_cmd[@]}" \
+      "${dataset_cmd[@]}" \
+      "${dataloader_cmd[@]}" \
+      "${diffusion_cmd[@]}" \
+      "${training_cmd[@]}" \
+      "${optimizer_cmd[@]}" \
+      "${validation_cmd[@]}" \
+      "${miscellaneous_cmd[@]}"
+fi
+
+echo -ne "-------------------- Finished executing script --------------------\n\n"
diff --git a/examples/training/sft/wan/3dgs_dissolve/training.json b/examples/training/sft/wan/3dgs_dissolve/training.json
@@ -0,0 +1,24 @@
+{
+  "datasets": [
+    {
+      "data_root": "finetrainers/3dgs-dissolve",
+      "dataset_type": "video",
+      "id_token": "3DGS_DISSOLVE",
+      "video_resolution_buckets": [
+        [49, 480, 832]
+      ],
+      "reshape_mode": "bicubic",
+      "remove_common_llm_caption_prefixes": true
+    },
+    {
+      "data_root": "finetrainers/3dgs-dissolve",
+      "dataset_type": "video",
+      "id_token": "3DGS_DISSOLVE",
+      "video_resolution_buckets": [
+        [81, 480, 832]
+      ],
+      "reshape_mode": "bicubic",
+      "remove_common_llm_caption_prefixes": true
+    }
+  ]
+}
diff --git a/examples/training/sft/wan/3dgs_dissolve/validation.json b/examples/training/sft/wan/3dgs_dissolve/validation.json
@@ -0,0 +1,58 @@
+{
+  "data": [
+    {
+      "caption": "A spacecraft, rendered in a 3D appearance, ascends into the night sky, leaving behind a trail of fiery exhaust. As it climbs higher, the exhaust gradually transforms into a burst of red sparks, creating a dramatic and dynamic visual effect against the dark backdrop.",
+      "image_path": null,
+      "video_path": null,
+      "num_inference_steps": 50,
+      "height": 480,
+      "width": 832,
+      "num_frames": 49
+    },
+    {
+      "caption": "3DGS_DISSOLVE A spacecraft, rendered in a 3D appearance, ascends into the night sky, leaving behind a trail of fiery exhaust. As it climbs higher, the exhaust gradually transforms into a burst of red sparks, creating a dramatic and dynamic visual effect against the dark backdrop.",
+      "image_path": null,
+      "video_path": null,
+      "num_inference_steps": 50,
+      "height": 480,
+      "width": 832,
+      "num_frames": 49
+    },
+    {
+      "caption": "3DGS_DISSOLVE A spacecraft, rendered in a 3D appearance, ascends into the night sky, leaving behind a trail of fiery exhaust. As it climbs higher, the exhaust gradually transforms into a burst of red sparks, creating a dramatic and dynamic visual effect against the dark backdrop.",
+      "image_path": null,
+      "video_path": null,
+      "num_inference_steps": 50,
+      "height": 480,
+      "width": 832,
+      "num_frames": 81
+    },
+    {
+      "caption": "3DGS_DISSOLVE A vintage-style treasure chest, rendered in a 3D appearance, stands prominently against a dark background. As the scene progresses, the chest begins to emit a glowing light, which intensifies until it evaporates into a burst of red sparks, creating a dramatic and mysterious atmosphere.",
+      "image_path": null,
+      "video_path": null,
+      "num_inference_steps": 50,
+      "height": 480,
+      "width": 832,
+      "num_frames": 49
+    },
+    {
+      "caption": "3DGS_DISSOLVE A glowing, fiery cube in a 3D appearance begins to spin and rotate, its edges shimmering with intense light. As it continues to spin, the cube gradually evaporates into a burst of red sparks that scatter across the screen, creating a dynamic and mesmerizing visual effect against the dark background.",
+      "image_path": null,
+      "video_path": null,
+      "num_inference_steps": 50,
+      "height": 480,
+      "width": 832,
+      "num_frames": 49
+    },
+    {
+      "caption": "3DGS_DISSOLVE A dynamic explosion unfolds in a 3D appearance, beginning as a concentrated burst of intense orange flames. As the fire intensifies, it rapidly expands outward, transitioning into a vibrant display of red sparks that scatter across the frame. The sparks continue to evolve, evaporating into a burst of red sparks against the dark backdrop, creating a mesmerizing visual spectacle.",
+      "image_path": null,
+      "video_path": null,
+      "num_inference_steps": 50,
+      "height": 480,
+      "width": 832,
+      "num_frames": 49
+    }
+  ]
+}