HunyuanVideo ModelSpec (#287)

* update * remove old files * update * update
a-r-r-o-w · Mar 5, 2025 · b8d9a52 · b8d9a52
1 parent 17e8481
commit b8d9a52
Show file tree

Hide file tree

Showing 16 changed files with 1,006 additions and 416 deletions.
diff --git a/examples/training/sft/hunyuan_video/modal_labs_dissolve/train.sh b/examples/training/sft/hunyuan_video/modal_labs_dissolve/train.sh
@@ -0,0 +1,158 @@
+#!/bin/bash
+
+set -e -x
+
+# export TORCH_LOGS="+dynamo,recompiles,graph_breaks"
+# export TORCHDYNAMO_VERBOSE=1
+export WANDB_MODE="offline"
+export NCCL_P2P_DISABLE=1
+export TORCH_NCCL_ENABLE_MONITORING=0
+export FINETRAINERS_LOG_LEVEL="DEBUG"
+
+# Finetrainers supports multiple backends for distributed training. Select your favourite and benchmark the differences!
+# BACKEND="accelerate"
+BACKEND="ptd"
+
+# In this setting, I'm using 8 GPUs (devices 0-7) for training, matching the HSDP_4_2 layout selected below (4 replicas x 2 shards)
+NUM_GPUS=8
+CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
+
+# Check the JSON files for the expected JSON format
+TRAINING_DATASET_CONFIG="examples/training/sft/hunyuan_video/modal_labs_dissolve/training.json"
+VALIDATION_DATASET_FILE="examples/training/sft/hunyuan_video/modal_labs_dissolve/validation.json"
+
+# Depending on how many GPUs you have available, choose your degree of parallelism and technique!
+DDP_1="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 1 --cp_degree 1 --tp_degree 1"
+DDP_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 2 --dp_shards 1 --cp_degree 1 --tp_degree 1"
+DDP_4="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 4 --dp_shards 1 --cp_degree 1 --tp_degree 1"
+FSDP_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 2 --cp_degree 1 --tp_degree 1"
+FSDP_4="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 1 --dp_shards 4 --cp_degree 1 --tp_degree 1"
+HSDP_2_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 2 --dp_shards 2 --cp_degree 1 --tp_degree 1"
+HSDP_4_2="--parallel_backend $BACKEND --pp_degree 1 --dp_degree 4 --dp_shards 2 --cp_degree 1 --tp_degree 1"
+
+# Parallel arguments
+parallel_cmd=(
+  $HSDP_4_2
+)
+
+# Model arguments
+model_cmd=(
+  --model_name "hunyuan_video"
+  --pretrained_model_name_or_path "hunyuanvideo-community/HunyuanVideo"
+)
+
+# Dataset arguments
+dataset_cmd=(
+  --dataset_config $TRAINING_DATASET_CONFIG
+  --dataset_shuffle_buffer_size 10
+  --precomputation_items 10
+  --precomputation_once
+)
+
+# Dataloader arguments
+dataloader_cmd=(
+  --dataloader_num_workers 0
+)
+
+# Diffusion arguments
+diffusion_cmd=(
+  --flow_weighting_scheme "logit_normal"
+)
+
+# Training arguments
+# We target just the attention projection layers for LoRA training here.
+# You can modify this as you please and target any layer (regex is supported).
+training_cmd=(
+  --training_type "lora"
+  --seed 42
+  --batch_size 1
+  --train_steps 3000
+  --rank 32
+  --lora_alpha 32
+  --target_modules "(transformer_blocks|single_transformer_blocks).*(to_q|to_k|to_v|to_out.0|add_q_proj|add_k_proj|add_v_proj|to_add_out)"
+  --gradient_accumulation_steps 1
+  --gradient_checkpointing
+  --checkpointing_steps 500
+  --checkpointing_limit 2
+  # --resume_from_checkpoint 3000
+  --enable_slicing
+  --enable_tiling
+)
+
+# Optimizer arguments
+optimizer_cmd=(
+  --optimizer "adamw"
+  --lr 3e-5
+  --lr_scheduler "constant_with_warmup"
+  --lr_warmup_steps 1000
+  --lr_num_cycles 1
+  --beta1 0.9
+  --beta2 0.99
+  --weight_decay 1e-4
+  --epsilon 1e-8
+  --max_grad_norm 1.0
+)
+
+# Validation arguments
+validation_cmd=(
+  --validation_dataset_file "$VALIDATION_DATASET_FILE"
+  --validation_steps 500
+)
+
+# Miscellaneous arguments
+miscellaneous_cmd=(
+  --tracker_name "finetrainers-hunyuanvideo"
+  --output_dir "/fsx/aryan/lora-training/hunyuanvideo"
+  --init_timeout 600
+  --nccl_timeout 600
+  --report_to "wandb"
+)
+
+# Execute the training script
+if [ "$BACKEND" == "accelerate" ]; then
+
+  ACCELERATE_CONFIG_FILE=""
+  if [ "$NUM_GPUS" == 1 ]; then
+    ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_1.yaml"
+  elif [ "$NUM_GPUS" == 2 ]; then
+    ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_2.yaml"
+  elif [ "$NUM_GPUS" == 4 ]; then
+    ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_4.yaml"
+  elif [ "$NUM_GPUS" == 8 ]; then
+    ACCELERATE_CONFIG_FILE="accelerate_configs/uncompiled_8.yaml"
+  fi
+
+  accelerate launch --config_file "$ACCELERATE_CONFIG_FILE" --gpu_ids $CUDA_VISIBLE_DEVICES train.py \
+    "${parallel_cmd[@]}" \
+    "${model_cmd[@]}" \
+    "${dataset_cmd[@]}" \
+    "${dataloader_cmd[@]}" \
+    "${diffusion_cmd[@]}" \
+    "${training_cmd[@]}" \
+    "${optimizer_cmd[@]}" \
+    "${validation_cmd[@]}" \
+    "${miscellaneous_cmd[@]}"
+
+elif [ "$BACKEND" == "ptd" ]; then
+
+  export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
+
+  torchrun \
+    --standalone \
+    --nnodes=1 \
+    --nproc_per_node=$NUM_GPUS \
+    --rdzv_backend c10d \
+    --rdzv_endpoint="localhost:0" \
+    train.py \
+      "${parallel_cmd[@]}" \
+      "${model_cmd[@]}" \
+      "${dataset_cmd[@]}" \
+      "${dataloader_cmd[@]}" \
+      "${diffusion_cmd[@]}" \
+      "${training_cmd[@]}" \
+      "${optimizer_cmd[@]}" \
+      "${validation_cmd[@]}" \
+      "${miscellaneous_cmd[@]}"
+fi
+
+echo -ne "-------------------- Finished executing script --------------------\n\n"
diff --git a/examples/training/sft/hunyuan_video/modal_labs_dissolve/training.json b/examples/training/sft/hunyuan_video/modal_labs_dissolve/training.json
@@ -0,0 +1,24 @@
+{
+  "datasets": [
+    {
+      "data_root": "modal-labs/dissolve",
+      "dataset_type": "video",
+      "id_token": "MODAL_DISSOLVE",
+      "video_resolution_buckets": [
+        [49, 480, 768]
+      ],
+      "reshape_mode": "bicubic",
+      "remove_common_llm_caption_prefixes": true
+    },
+    {
+      "data_root": "modal-labs/dissolve",
+      "dataset_type": "video",
+      "id_token": "MODAL_DISSOLVE",
+      "video_resolution_buckets": [
+        [81, 480, 768]
+      ],
+      "reshape_mode": "bicubic",
+      "remove_common_llm_caption_prefixes": true
+    }
+  ]
+}
diff --git a/examples/training/sft/hunyuan_video/modal_labs_dissolve/validation.json b/examples/training/sft/hunyuan_video/modal_labs_dissolve/validation.json
@@ -0,0 +1,76 @@
+{
+  "data": [
+    {
+      "caption": "MODAL_DISSOLVE A meticulously detailed, antique-style vase, featuring mottled beige and brown hues and two small handles, sits centrally on a dark brown circular pedestal.  The vase, seemingly made of clay or porcelain, begins to dissolve from the bottom up.  The disintegration process is rapid but not explosive, with a cloud of fine, light tan dust forming and rising in a swirling, almost ethereal column that expands outwards before slowly descending. The dust particles are individually visible as they float, and the overall effect is one of delicate disintegration rather than shattering.  Finally, only the empty pedestal and the intricately patterned marble floor remain.",
+      "image_path": null,
+      "video_path": null,
+      "num_inference_steps": 30,
+      "height": 480,
+      "width": 768,
+      "num_frames": 49
+    },
+    {
+      "caption": "MODAL_DISSOLVE Close-up view of a sloth resting on a thick tree branch within a dense, sun-dappled forest.  The sloth's body, initially clearly defined, begins to subtly disintegrate.  The process starts with a light dusting of particles from its lower back and rump. This quickly intensifies, with a visible cloud of fine, sparkling dust billowing outwards as the sloth's form gradually vanishes. The dissolution proceeds in a wave-like manner, moving from rear to front.  The head and arms are the last parts to disappear, leaving only scattered motes of dust that slowly disperse amongst the leaves, blending seamlessly with the forest environment. The overall effect is dreamlike and ethereal.",
+      "image_path": null,
+      "video_path": null,
+      "num_inference_steps": 30,
+      "height": 480,
+      "width": 768,
+      "num_frames": 49
+    },
+    {
+      "caption": "MODAL_DISSOLVE High-resolution video depicting the complete digital dissolution of an orange Porsche 911 GT3 RS within a garage environment. The car's dissolution proceeds in three discernible stages: (1) Initial shimmering along the car's edges and body panels, creating a subtle, high-frequency displacement effect. (2) Rapid disintegration of the vehicle into a dense cloud of primarily orange and black particles, varying in size and opacity; particle motion exhibits both outward and swirling movements. (3) Complete disappearance of the car, leaving behind only a remaining, smaller, seemingly fiery-textured rubber duck model. The overall effect resembles a controlled explosion or rapid combustion, creating a dynamic, visually complex transformation. The garage's lighting and shadows remain consistent throughout the dissolution, providing clear visual contrast.",
+      "image_path": null,
+      "video_path": null,
+      "num_inference_steps": 30,
+      "height": 480,
+      "width": 768,
+      "num_frames": 49
+    },
+    {
+      "caption": "MODAL_DISSOLVE High-resolution video depicting the complete disintegration of a white origami crane. The disintegration process is initiated at the head of the crane and proceeds in a generally downward direction.  The disintegration manifests as the rapid breakdown of paper fibers into a cloud of fine particulate matter. The particle size appears consistent, with a texture similar to very fine powder. The rate of disintegration increases over time, resulting in a visually dynamic and texturally complex effect.  The background consists of a dark-stained wooden surface, providing a high-contrast setting that highlights the white particles' dispersal and movement. The final state shows only residual particulate matter scattered sparsely on the surface.",
+      "image_path": null,
+      "video_path": null,
+      "num_inference_steps": 30,
+      "height": 480,
+      "width": 768,
+      "num_frames": 49
+    },
+    {
+      "caption": "MODAL_DISSOLVE A meticulously detailed, antique-style vase, featuring mottled beige and brown hues and two small handles, sits centrally on a dark brown circular pedestal.  The vase, seemingly made of clay or porcelain, begins to dissolve from the bottom up.  The disintegration process is rapid but not explosive, with a cloud of fine, light tan dust forming and rising in a swirling, almost ethereal column that expands outwards before slowly descending. The dust particles are individually visible as they float, and the overall effect is one of delicate disintegration rather than shattering.  Finally, only the empty pedestal and the intricately patterned marble floor remain.",
+      "image_path": null,
+      "video_path": null,
+      "num_inference_steps": 30,
+      "height": 480,
+      "width": 768,
+      "num_frames": 81
+    },
+    {
+      "caption": "MODAL_DISSOLVE Close-up view of a sloth resting on a thick tree branch within a dense, sun-dappled forest.  The sloth's body, initially clearly defined, begins to subtly disintegrate.  The process starts with a light dusting of particles from its lower back and rump. This quickly intensifies, with a visible cloud of fine, sparkling dust billowing outwards as the sloth's form gradually vanishes. The dissolution proceeds in a wave-like manner, moving from rear to front.  The head and arms are the last parts to disappear, leaving only scattered motes of dust that slowly disperse amongst the leaves, blending seamlessly with the forest environment. The overall effect is dreamlike and ethereal.",
+      "image_path": null,
+      "video_path": null,
+      "num_inference_steps": 30,
+      "height": 480,
+      "width": 768,
+      "num_frames": 81
+    },
+    {
+      "caption": "MODAL_DISSOLVE High-resolution video depicting the complete digital dissolution of an orange Porsche 911 GT3 RS within a garage environment. The car's dissolution proceeds in three discernible stages: (1) Initial shimmering along the car's edges and body panels, creating a subtle, high-frequency displacement effect. (2) Rapid disintegration of the vehicle into a dense cloud of primarily orange and black particles, varying in size and opacity; particle motion exhibits both outward and swirling movements. (3) Complete disappearance of the car, leaving behind only a remaining, smaller, seemingly fiery-textured rubber duck model. The overall effect resembles a controlled explosion or rapid combustion, creating a dynamic, visually complex transformation. The garage's lighting and shadows remain consistent throughout the dissolution, providing clear visual contrast.",
+      "image_path": null,
+      "video_path": null,
+      "num_inference_steps": 30,
+      "height": 480,
+      "width": 768,
+      "num_frames": 81
+    },
+    {
+      "caption": "MODAL_DISSOLVE High-resolution video depicting the complete disintegration of a white origami crane. The disintegration process is initiated at the head of the crane and proceeds in a generally downward direction.  The disintegration manifests as the rapid breakdown of paper fibers into a cloud of fine particulate matter. The particle size appears consistent, with a texture similar to very fine powder. The rate of disintegration increases over time, resulting in a visually dynamic and texturally complex effect.  The background consists of a dark-stained wooden surface, providing a high-contrast setting that highlights the white particles' dispersal and movement. The final state shows only residual particulate matter scattered sparsely on the surface.",
+      "image_path": null,
+      "video_path": null,
+      "num_inference_steps": 30,
+      "height": 480,
+      "width": 768,
+      "num_frames": 81
+    }
+  ]
+}
diff --git a/finetrainers/config.py b/finetrainers/config.py
@@ -3,15 +3,15 @@
 
 from .models import ModelSpecification
 from .models.cogvideox import CogVideoXModelSpecification
-from .models.hunyuan_video import HUNYUAN_VIDEO_T2V_FULL_FINETUNE_CONFIG, HUNYUAN_VIDEO_T2V_LORA_CONFIG
+from .models.hunyuan_video import HunyuanVideoModelSpecification
 from .models.ltx_video import LTXVideoModelSpecification
 from .models.wan import WanModelSpecification
 
 
 class ModelType(str, Enum):
+    COGVIDEOX = "cogvideox"
     HUNYUAN_VIDEO = "hunyuan_video"
     LTX_VIDEO = "ltx_video"
-    COGVIDEOX = "cogvideox"
     WAN = "wan"
 
 
@@ -22,8 +22,8 @@ class TrainingType(str, Enum):
 
 SUPPORTED_MODEL_CONFIGS = {
     ModelType.HUNYUAN_VIDEO: {
-        TrainingType.LORA: HUNYUAN_VIDEO_T2V_LORA_CONFIG,
-        TrainingType.FULL_FINETUNE: HUNYUAN_VIDEO_T2V_FULL_FINETUNE_CONFIG,
+        TrainingType.LORA: HunyuanVideoModelSpecification,
+        TrainingType.FULL_FINETUNE: HunyuanVideoModelSpecification,
     },
     ModelType.LTX_VIDEO: {
         TrainingType.LORA: LTXVideoModelSpecification,

diff --git a/finetrainers/data/dataset.py b/finetrainers/data/dataset.py
@@ -801,8 +801,9 @@ def _has_data_caption_file_pairs(root: Union[pathlib.Path, List[str]], remote: b
     else:
         caption_files = [file for file in root if file.endswith(".txt")]
         for caption_file in caption_files:
+            caption_file = pathlib.Path(caption_file)
             for extension in [*constants.SUPPORTED_IMAGE_FILE_EXTENSIONS, *constants.SUPPORTED_VIDEO_FILE_EXTENSIONS]:
-                data_filename = caption_file.with_suffix(f".{extension}")
+                data_filename = caption_file.with_suffix(f".{extension}").name
                 if data_filename in root:
                     return True
         return False

diff --git a/finetrainers/models/hunyuan_video/__init__.py b/finetrainers/models/hunyuan_video/__init__.py
@@ -1,2 +1 @@
-from .full_finetune import HUNYUAN_VIDEO_T2V_FULL_FINETUNE_CONFIG
-from .lora import HUNYUAN_VIDEO_T2V_LORA_CONFIG
+from .base_specification import HunyuanVideoModelSpecification