adding hunyuan hf (support lora finetuning); unified hunyuan hf inference with quantization #135

Merged 47 commits on Jan 14, 2025

Changes from 1 commit

Commits (47)
a133b58
fix lora cp saving issue
BrianChen1129 Dec 19, 2024
3008adf
Merge branch 'main' of github.com:jzhang38/FastVideo-OSP
BrianChen1129 Dec 19, 2024
a3f8fc2
fix lora save issue
BrianChen1129 Dec 19, 2024
580575c
fix lora save issue
BrianChen1129 Dec 19, 2024
4c16991
Revert "fix lora save issue"
BrianChen1129 Dec 19, 2024
677efd9
fix lora save issue
BrianChen1129 Dec 19, 2024
48fd2d2
Merge branch 'main' of github.com:jzhang38/FastVideo-OSP into yq-lora…
BrianChen1129 Dec 20, 2024
c31f507
debug hunyuan hf sp
BrianChen1129 Dec 23, 2024
ac58b2f
test hunyuan hf
BrianChen1129 Dec 24, 2024
77b690c
add huanyuan hf inference and train
BrianChen1129 Dec 24, 2024
04c1610
support hunyuan hf lora
BrianChen1129 Dec 27, 2024
bfe0448
syn with main
BrianChen1129 Dec 27, 2024
594b65a
syn with main
BrianChen1129 Dec 27, 2024
36560eb
unified hunyuan hf
BrianChen1129 Dec 29, 2024
d25c235
unified hunyuan hf
BrianChen1129 Dec 29, 2024
557dbca
unified hunyuan hf
BrianChen1129 Dec 29, 2024
0c058c3
add lora
BrianChen1129 Jan 7, 2025
cbc52b0
syn with main
BrianChen1129 Jan 7, 2025
f35ea70
unify hunyuan hf inference
BrianChen1129 Jan 7, 2025
cad6e6d
unify hunyuan hf inference
BrianChen1129 Jan 7, 2025
1832042
syn with main
BrianChen1129 Jan 7, 2025
5d21b16
syn
BrianChen1129 Jan 7, 2025
1d7d637
syn
BrianChen1129 Jan 7, 2025
c1cf441
syn
BrianChen1129 Jan 7, 2025
22d499b
syn
BrianChen1129 Jan 7, 2025
e7ea0d7
syn
BrianChen1129 Jan 7, 2025
afe24e2
syn
BrianChen1129 Jan 7, 2025
41f8ac9
syn
BrianChen1129 Jan 8, 2025
3f2fc1a
fix train.py
BrianChen1129 Jan 12, 2025
32a7bb3
update README
BrianChen1129 Jan 12, 2025
79be9b5
syn
BrianChen1129 Jan 12, 2025
5af5b6a
add dataset preparation scripts
BrianChen1129 Jan 13, 2025
54d1e5d
format
BrianChen1129 Jan 13, 2025
2017c08
syn with main; add readme
BrianChen1129 Jan 13, 2025
26e7dc9
syn with main; add readme
BrianChen1129 Jan 13, 2025
9109210
ready for lora release
BrianChen1129 Jan 13, 2025
e357df9
format check
BrianChen1129 Jan 13, 2025
6bb0705
format check
BrianChen1129 Jan 13, 2025
aedb4a2
format check
BrianChen1129 Jan 13, 2025
5a086e9
format check
BrianChen1129 Jan 13, 2025
e819bdb
format check
BrianChen1129 Jan 13, 2025
7fcc414
scripts clean
BrianChen1129 Jan 13, 2025
1d0d787
fix shift issue in scripts
BrianChen1129 Jan 13, 2025
f7c9a37
add change log
BrianChen1129 Jan 13, 2025
a7e5aac
fix huynuan ft scripts val steps
BrianChen1129 Jan 13, 2025
db92841
pr version revision
BrianChen1129 Jan 14, 2025
a44972b
ready for lora pr
BrianChen1129 Jan 14, 2025
format check
BrianChen1129 committed Jan 13, 2025
commit e357df9fe249542b3ba43536998c91a33110c6f4
144 changes: 84 additions & 60 deletions fastvideo/sample/sample_t2v_hunyuan_hf.py
@@ -2,7 +2,7 @@
 import torch.distributed as dist
 from diffusers import BitsAndBytesConfig
 from diffusers.utils import export_to_video
-import imageio as iio 
+import imageio as iio
 import math
 import numpy as np
 import io
@@ -17,17 +17,20 @@
 from fastvideo.models.hunyuan_hf.pipeline_hunyuan import HunyuanVideoPipeline
 from fastvideo.models.hunyuan_hf.modeling_hunyuan import HunyuanVideoTransformer3DModel


 def initialize_distributed():
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
     local_rank = int(os.getenv("RANK", 0))
     world_size = int(os.getenv("WORLD_SIZE", 1))
     print("world_size", world_size)
     torch.cuda.set_device(local_rank)
-    dist.init_process_group(
-        backend="nccl", init_method="env://", world_size=world_size, rank=local_rank
-    )
+    dist.init_process_group(backend="nccl",
+                            init_method="env://",
+                            world_size=world_size,
+                            rank=local_rank)
     initialize_sequence_parallel_state(world_size)


 def inference(args):
Collaborator @jzhang38 commented on Jan 13, 2025:
Why separate inference and inference quantization functions?

     initialize_distributed()
     print(nccl_info.sp_size)
@@ -36,29 +39,35 @@ def inference(args):
     weight_dtype = torch.bfloat16

     if args.transformer_path is not None:
-        transformer = HunyuanVideoTransformer3DModel.from_pretrained(args.transformer_path)
+        transformer = HunyuanVideoTransformer3DModel.from_pretrained(
+            args.transformer_path)
     else:
         transformer = HunyuanVideoTransformer3DModel.from_pretrained(
-            args.model_path, subfolder="transformer/", torch_dtype=weight_dtype
-        )
+            args.model_path,
+            subfolder="transformer/",
+            torch_dtype=weight_dtype)

-    pipe = HunyuanVideoPipeline.from_pretrained(
-        args.model_path, transformer=transformer, torch_dtype=weight_dtype
-    )
+    pipe = HunyuanVideoPipeline.from_pretrained(args.model_path,
+                                                transformer=transformer,
+                                                torch_dtype=weight_dtype)

     pipe.enable_vae_tiling()

     if args.lora_checkpoint_dir is not None:
         print(f"Loading LoRA weights from {args.lora_checkpoint_dir}")
-        config_path = os.path.join(args.lora_checkpoint_dir, "lora_config.json")
+        config_path = os.path.join(args.lora_checkpoint_dir,
+                                   "lora_config.json")
         with open(config_path, "r") as f:
             lora_config_dict = json.load(f)
         rank = lora_config_dict["lora_params"]["lora_rank"]
         lora_alpha = lora_config_dict["lora_params"]["lora_alpha"]
         lora_scaling = lora_alpha / rank
-        pipe.load_lora_weights(args.lora_checkpoint_dir, adapter_name="default")
+        pipe.load_lora_weights(args.lora_checkpoint_dir,
+                               adapter_name="default")
         pipe.set_adapters(["default"], [lora_scaling])
-        print(f"Successfully Loaded LoRA weights from {args.lora_checkpoint_dir}")
+        print(
+            f"Successfully Loaded LoRA weights from {args.lora_checkpoint_dir}"
+        )
     if args.cpu_offload:
         pipe.enable_model_cpu_offload(device)
     else:
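
For reference, the LoRA branch above reads the rank and alpha recorded at training time and applies the usual PEFT scaling of lora_alpha / lora_rank before activating the adapter. A minimal standalone sketch of the same flow, assuming a diffusers-format HunyuanVideo directory and a LoRA checkpoint laid out the way this PR saves it (both paths below are placeholders):

import json
import os

import torch
from fastvideo.models.hunyuan_hf.pipeline_hunyuan import HunyuanVideoPipeline

MODEL_PATH = "path/to/hunyuan-video-hf"  # placeholder
LORA_DIR = "path/to/lora_checkpoint"     # placeholder

pipe = HunyuanVideoPipeline.from_pretrained(MODEL_PATH,
                                            torch_dtype=torch.bfloat16)

# lora_config.json records the hyperparameters used during finetuning.
with open(os.path.join(LORA_DIR, "lora_config.json")) as f:
    lora_params = json.load(f)["lora_params"]

# PEFT convention: the adapter's effective weight is lora_alpha / lora_rank.
scaling = lora_params["lora_alpha"] / lora_params["lora_rank"]
pipe.load_lora_weights(LORA_DIR, adapter_name="default")
pipe.set_adapters(["default"], [scaling])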
@@ -67,18 +76,13 @@ def inference(args):
     # Generate videos from the input prompt

     if args.prompt_embed_path is not None:
-        prompt_embeds = (
-            torch.load(args.prompt_embed_path, map_location="cpu", weights_only=True)
-            .to(device)
-            .unsqueeze(0)
-        )
-        encoder_attention_mask = (
-            torch.load(
-                args.encoder_attention_mask_path, map_location="cpu", weights_only=True
-            )
-            .to(device)
-            .unsqueeze(0)
-        )
+        prompt_embeds = (torch.load(args.prompt_embed_path,
+                                    map_location="cpu",
+                                    weights_only=True).to(device).unsqueeze(0))
+        encoder_attention_mask = (torch.load(
+            args.encoder_attention_mask_path,
+            map_location="cpu",
+            weights_only=True).to(device).unsqueeze(0))
         prompts = None
     elif args.prompt_path is not None:
         prompts = [line.strip() for line in open(args.prompt_path, "r")]
@@ -121,10 +125,11 @@ def inference(args):
             num_inference_steps=args.num_inference_steps,
             generator=generator,
         ).frames
+
         if nccl_info.global_rank <= 0:
             export_to_video(videos[0], args.output_path + ".mp4", fps=24)


 def inference_quantization(args):
     torch.manual_seed(args.seed)
     device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -138,7 +143,8 @@ def inference_quantization(args):
         "5. Camera angles, movements, and transitions used in the video."
         "6. Thematic and aesthetic concepts associated with the scene, i.e. realistic, futuristic, fairy tale, etc<|eot_id|>"
         "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"),
-        "crop_start":95,
+        "crop_start":
+        95,
     }
     model_id = args.model_path

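inference_quantization's model-loading code is untouched by this formatting commit, so it does not appear in the diff, but the BitsAndBytesConfig import at the top of the file points at diffusers' bitsandbytes integration. A minimal sketch of that loading pattern, with illustrative 4-bit NF4 settings and a placeholder model path (the script's actual behavior is driven by its --quantization flag and the argparse options below):

import torch
from diffusers import BitsAndBytesConfig
from fastvideo.models.hunyuan_hf.modeling_hunyuan import HunyuanVideoTransformer3DModel
from fastvideo.models.hunyuan_hf.pipeline_hunyuan import HunyuanVideoPipeline

MODEL_PATH = "path/to/hunyuan-video-hf"  # placeholder

# Quantize only the DiT weights to 4-bit NF4; compute still runs in bf16.
quant_config = BitsAndBytesConfig(load_in_4bit=True,
                                  bnb_4bit_quant_type="nf4",
                                  bnb_4bit_compute_dtype=torch.bfloat16)
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
    MODEL_PATH,
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16)

pipe = HunyuanVideoPipeline.from_pretrained(MODEL_PATH,
                                            transformer=transformer,
                                            torch_dtype=torch.bfloat16)
pipe.enable_vae_tiling()  # mirrors the unquantized path above
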
@@ -213,6 +219,7 @@ def inference_quantization(args):
           round(torch.cuda.max_memory_allocated(device="cuda") / 1024**3, 3),
           "GiB")

+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()

@@ -243,10 +250,14 @@ def inference_quantization(args):
         default="flow",
         help="Denoise type for noised inputs.",
     )
-    parser.add_argument("--seed", type=int, default=None, help="Seed for evaluation.")
-    parser.add_argument(
-        "--neg_prompt", type=str, default=None, help="Negative prompt for sampling."
-    )
+    parser.add_argument("--seed",
+                        type=int,
+                        default=None,
+                        help="Seed for evaluation.")
+    parser.add_argument("--neg_prompt",
+                        type=str,
+                        default=None,
+                        help="Negative prompt for sampling.")
     parser.add_argument(
         "--guidance_scale",
         type=float,
@@ -259,12 +270,14 @@ def inference_quantization(args):
         default=6.0,
         help="Embedded classifier free guidance scale.",
     )
-    parser.add_argument(
-        "--flow_shift", type=int, default=7, help="Flow shift parameter."
-    )
-    parser.add_argument(
-        "--batch_size", type=int, default=1, help="Batch size for inference."
-    )
+    parser.add_argument("--flow_shift",
+                        type=int,
+                        default=7,
+                        help="Flow shift parameter.")
+    parser.add_argument("--batch_size",
+                        type=int,
+                        default=1,
+                        help="Batch size for inference.")
     parser.add_argument(
         "--num_videos",
         type=int,
@@ -275,22 +288,26 @@ def inference_quantization(args):
         "--load-key",
         type=str,
         default="module",
-        help="Key to load the model states. 'module' for the main model, 'ema' for the EMA model.",
+        help=
+        "Key to load the model states. 'module' for the main model, 'ema' for the EMA model.",
     )
     parser.add_argument(
         "--dit-weight",
         type=str,
-        default="data/hunyuan/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt",
+        default=
+        "data/hunyuan/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt",
     )
     parser.add_argument(
         "--reproduce",
         action="store_true",
-        help="Enable reproducibility by setting random seeds and deterministic algorithms.",
+        help=
+        "Enable reproducibility by setting random seeds and deterministic algorithms.",
     )
     parser.add_argument(
         "--disable-autocast",
         action="store_true",
-        help="Disable autocast for denoising loop and vae decoding in pipeline sampling.",
+        help=
+        "Disable autocast for denoising loop and vae decoding in pipeline sampling.",
     )

     # Flow Matching
@@ -299,13 +316,15 @@ def inference_quantization(args):
         action="store_true",
         help="If reverse, learning/sampling from t=1 -> t=0.",
     )
-    parser.add_argument(
-        "--flow-solver", type=str, default="euler", help="Solver for flow matching."
-    )
+    parser.add_argument("--flow-solver",
+                        type=str,
+                        default="euler",
+                        help="Solver for flow matching.")
     parser.add_argument(
         "--use-linear-quadratic-schedule",
         action="store_true",
-        help="Use linear quadratic schedule for flow matching. Following MovieGen (https://ai.meta.com/static-resource/movie-gen-research-paper)",
+        help=
+        "Use linear quadratic schedule for flow matching. Following MovieGen (https://ai.meta.com/static-resource/movie-gen-research-paper)",
     )
     parser.add_argument(
         "--linear-schedule-end",
@@ -317,17 +336,20 @@ def inference_quantization(args):
     # Model parameters
     parser.add_argument("--model", type=str, default="HYVideo-T/2-cfgdistill")
     parser.add_argument("--latent-channels", type=int, default=16)
-    parser.add_argument(
-        "--precision", type=str, default="bf16", choices=["fp32", "fp16", "bf16", "fp8"]
-    )
-    parser.add_argument(
-        "--rope-theta", type=int, default=256, help="Theta used in RoPE."
-    )
+    parser.add_argument("--precision",
+                        type=str,
+                        default="bf16",
+                        choices=["fp32", "fp16", "bf16", "fp8"])
+    parser.add_argument("--rope-theta",
+                        type=int,
+                        default=256,
+                        help="Theta used in RoPE.")

     parser.add_argument("--vae", type=str, default="884-16c-hy")
-    parser.add_argument(
-        "--vae-precision", type=str, default="fp16", choices=["fp32", "fp16", "bf16"]
-    )
+    parser.add_argument("--vae-precision",
+                        type=str,
+                        default="fp16",
+                        choices=["fp32", "fp16", "bf16"])
     parser.add_argument("--vae-tiling", action="store_true", default=True)

     parser.add_argument("--text-encoder", type=str, default="llm")
@@ -340,10 +362,12 @@ def inference_quantization(args):
     parser.add_argument("--text-states-dim", type=int, default=4096)
     parser.add_argument("--text-len", type=int, default=256)
     parser.add_argument("--tokenizer", type=str, default="llm")
-    parser.add_argument("--prompt-template", type=str, default="dit-llm-encode")
-    parser.add_argument(
-        "--prompt-template-video", type=str, default="dit-llm-encode-video"
-    )
+    parser.add_argument("--prompt-template",
+                        type=str,
+                        default="dit-llm-encode")
+    parser.add_argument("--prompt-template-video",
+                        type=str,
+                        default="dit-llm-encode-video")
     parser.add_argument("--hidden-state-skip-layer", type=int, default=2)
     parser.add_argument("--apply-final-norm", action="store_true")

@@ -362,4 +386,4 @@ def inference_quantization(args):
     if args.quantization:
         inference_quantization(args)
     else:
-        inference(args)
\ No newline at end of file
+        inference(args)
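
As the final dispatch shows, passing --quantization routes everything to inference_quantization, which runs as a single process; the default inference path calls initialize_distributed() first, so it expects torchrun-style RANK and WORLD_SIZE environment variables with an NCCL backend available, and only global rank 0 exports the resulting .mp4.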