dangthr committed on
Commit c5c8aa3 · verified · 1 Parent(s): a2a9a31

Update app.py

Files changed (1)
  1. app.py +76 -154
app.py CHANGED
@@ -44,7 +44,7 @@ def filter_kwargs(cls, kwargs):
     return filtered_kwargs
 
 def download_file(url, local_path):
-    """Download a file from a URL"""
+    """Download a file from a URL; if given a local path, return it directly"""
     if url.startswith(('http://', 'https://')):
         print(f"Downloading file from {url}...")
         try:
@@ -65,12 +65,15 @@ def download_file(url, local_path):
         print(f"Error: file or URL does not exist: {url}")
         return None
 
-def setup_models(repo_root):
+def setup_models(repo_root, model_version):
     """Load all required models and settings"""
     pretrained_model_name_or_path = os.path.join(repo_root, "Wan2.1-Fun-V1.1-1.3B-InP")
     pretrained_wav2vec_path = os.path.join(repo_root, "wav2vec2-base-960h")
 
-    config = OmegaConf.load("deepspeed_config/wan2.1/wan_civitai.yaml")
+    config_path = os.path.join(repo_root, "deepspeed_config/wan2.1/wan_civitai.yaml")
+    if not os.path.exists(config_path):
+        raise FileNotFoundError(f"Config file not found: {config_path}")
+    config = OmegaConf.load(config_path)
     sampler_name = "Flow"
 
     print("Loading Tokenizer...")
@@ -97,7 +100,7 @@ def setup_models(repo_root):
     print("Loading CLIP Image Encoder...")
     clip_image_encoder = CLIPModel.from_pretrained(os.path.join(pretrained_model_name_or_path, config['image_encoder_kwargs'].get('image_encoder_subpath', 'image_encoder'))).eval()
 
-    print("Loading Transformer 3D...")
+    print("Loading Transformer 3D base model...")
     transformer3d = WanTransformer3DFantasyModel.from_pretrained(
         os.path.join(pretrained_model_name_or_path, config['transformer_additional_kwargs'].get('transformer_subpath', 'transformer')),
         transformer_additional_kwargs=OmegaConf.to_container(config['transformer_additional_kwargs']),
@@ -105,99 +108,55 @@ def setup_models(repo_root):
         torch_dtype=dtype,
     )
 
-    scheduler_class = {
-        "Flow": FlowMatchEulerDiscreteScheduler,
-    }[sampler_name]
-    scheduler = scheduler_class(
-        **filter_kwargs(scheduler_class, OmegaConf.to_container(config['scheduler_kwargs']))
-    )
+    # <<< FIX 1: load the StableAvatar-specific weights >>>
+    if model_version == "square":
+        transformer_path = os.path.join(repo_root, "StableAvatar-1.3B", "transformer3d-square.pt")
+    else:  # rec_vec
+        transformer_path = os.path.join(repo_root, "StableAvatar-1.3B", "transformer3d-rec-vec.pt")
+
+    if os.path.exists(transformer_path):
+        print(f"Loading StableAvatar weights from {transformer_path}...")
+        state_dict = torch.load(transformer_path, map_location="cpu")
+        state_dict = state_dict["state_dict"] if "state_dict" in state_dict else state_dict
+        m, u = transformer3d.load_state_dict(state_dict, strict=False)
+        print(f"StableAvatar weights loaded successfully. Missing keys: {len(m)}; unexpected keys: {len(u)}")
+    else:
+        raise FileNotFoundError(f"StableAvatar weight file not found: {transformer_path}. Please make sure the model has been fully downloaded.")
+    # <<< END OF FIX 1 >>>
+
+    scheduler_class = {"Flow": FlowMatchEulerDiscreteScheduler}[sampler_name]
+    scheduler = scheduler_class(**filter_kwargs(scheduler_class, OmegaConf.to_container(config['scheduler_kwargs'])))
 
     print("Building Pipeline...")
     pipeline = WanI2VTalkingInferenceLongPipeline(
-        tokenizer=tokenizer,
-        text_encoder=text_encoder,
-        vae=vae,
-        transformer=transformer3d,
-        clip_image_encoder=clip_image_encoder,
-        scheduler=scheduler,
-        wav2vec_processor=wav2vec_processor,
-        wav2vec=wav2vec,
+        tokenizer=tokenizer, text_encoder=text_encoder, vae=vae,
+        transformer=transformer3d, clip_image_encoder=clip_image_encoder,
+        scheduler=scheduler, wav2vec_processor=wav2vec_processor, wav2vec=wav2vec,
     )
 
     return pipeline, transformer3d, vae
 
 def run_inference(
-    pipeline,
-    transformer3d,
-    vae,
-    image_path,
-    audio_path,
-    prompt,
-    negative_prompt,
-    seed,
-    output_filename,
-    gpu_memory_mode="model_cpu_offload",
-    teacache_threshold=0,
-    num_skip_start_steps=5,
-    width=512,
-    height=512,
-    guidance_scale=6.0,
-    num_inference_steps=50,
-    text_guide_scale=3.0,
-    audio_guide_scale=5.0,
-    motion_frame=25,
-    fps=25,
-    overlap_window_length=10,
-    overlapping_weight_scheme="uniform",
-    clip_sample_n_frames=81,
+    pipeline, transformer3d, vae, image_path, audio_path, prompt,
+    negative_prompt, seed, output_filename, gpu_memory_mode="model_cpu_offload",
+    width=512, height=512, num_inference_steps=50, fps=25, **kwargs
 ):
-    """
-    Run inference to generate a video.
-
-    Args:
-        pipeline: the inference pipeline.
-        transformer3d: the 3D transformer model.
-        vae: the VAE model.
-        image_path (str): path to the input image.
-        audio_path (str): path to the input audio.
-        prompt (str): positive prompt.
-        negative_prompt (str): negative prompt.
-        seed (int): random seed; -1 means random.
-        output_filename (str): output video file name (without extension).
-        ... other generation parameters
-    """
+    """Run inference to generate a video."""
     if seed < 0:
         seed = random.randint(0, np.iinfo(np.int32).max)
     print(f"Seed in use: {seed}")
 
-    # --- Memory optimization settings ---
     if gpu_memory_mode == "sequential_cpu_offload":
-        replace_parameters_by_name(transformer3d, ["modulation", ], device=device)
-        transformer3d.freqs = transformer3d.freqs.to(device=device)
         pipeline.enable_sequential_cpu_offload(device=device)
-    elif gpu_memory_mode == "model_cpu_offload_and_qfloat8":
-        convert_model_weight_to_float8(transformer3d, exclude_module_name=["modulation", ])
-        convert_weight_dtype_wrapper(transformer3d, dtype)
-        pipeline.enable_model_cpu_offload(device=device)
     elif gpu_memory_mode == "model_cpu_offload":
         pipeline.enable_model_cpu_offload(device=device)
     else:
         pipeline.to(device=device)
 
-    # --- TeaCache acceleration ---
-    if teacache_threshold > 0:
-        coefficients = get_teacache_coefficients(pipeline.transformer.config._name_or_path)
-        pipeline.transformer.enable_teacache(
-            coefficients,
-            num_inference_steps,
-            teacache_threshold,
-            num_skip_start_steps=num_skip_start_steps,
-        )
-
-    # --- Start inference ---
     with torch.no_grad():
         print("Preparing input data...")
-        video_length = int((clip_sample_n_frames - 1) // vae.config.temporal_compression_ratio * vae.config.temporal_compression_ratio) + 1 if clip_sample_n_frames != 1 else 1
+        # get_image_to_video_latent keeps its own vae.config reference internally, so the warning here can be ignored
+        video_length = 81
         input_video, input_video_mask, clip_image = get_image_to_video_latent(image_path, None, video_length=video_length, sample_size=[height, width])
 
         sr = 16000
@@ -205,27 +164,13 @@ def run_inference(
 
         print("Pipeline running... this may take a while.")
         sample = pipeline(
-            prompt,
-            num_frames=video_length,
-            negative_prompt=negative_prompt,
-            width=width,
-            height=height,
-            guidance_scale=guidance_scale,
-            generator=torch.Generator().manual_seed(seed),
-            num_inference_steps=num_inference_steps,
-            video=input_video,
-            mask_video=input_video_mask,
-            clip_image=clip_image,
-            text_guide_scale=text_guide_scale,
-            audio_guide_scale=audio_guide_scale,
-            vocal_input_values=vocal_input,
-            motion_frame=motion_frame,
-            fps=fps,
-            sr=sr,
-            cond_file_path=image_path,
-            overlap_window_length=overlap_window_length,
-            seed=seed,
-            overlapping_weight_scheme=overlapping_weight_scheme,
+            prompt, num_frames=video_length, negative_prompt=negative_prompt,
+            width=width, height=height, guidance_scale=6.0,
+            generator=torch.Generator().manual_seed(seed), num_inference_steps=num_inference_steps,
+            video=input_video, mask_video=input_video_mask, clip_image=clip_image,
+            text_guide_scale=3.0, audio_guide_scale=5.0, vocal_input_values=vocal_input,
+            motion_frame=25, fps=fps, sr=sr, cond_file_path=image_path,
+            overlap_window_length=10, seed=seed, overlapping_weight_scheme="uniform",
         ).videos
 
         print("Saving video...")
@@ -242,7 +187,6 @@ def run_inference(
             output_video_with_audio
         ], check=True)
 
-        # Delete the temporary video without audio
         os.remove(video_path)
 
         print(f"✅ Generation complete! Video saved to: {output_video_with_audio}")
@@ -250,92 +194,69 @@ def run_inference(
 
 def main():
     parser = argparse.ArgumentParser(description="StableAvatar command-line inference tool")
-
-    # --- Main arguments ---
     parser.add_argument('--prompt', type=str, default="a beautiful woman is talking, masterpiece, best quality", help='Positive prompt')
-    parser.add_argument('--input_image', type=str, default="./example_case/case-1/reference.png", help='Path or URL of the input image')
-    parser.add_argument('--input_audio', type=str, default="./example_case/case-1/audio.wav", help='Path or URL of the input audio')
+    parser.add_argument('--input_image', type=str, default="example_case/case-6/reference.png", help='Path or URL of the input image')
+    parser.add_argument('--input_audio', type=str, default="example_case/case-6/audio.wav", help='Path or URL of the input audio')
     parser.add_argument('--seed', type=int, default=42, help='Random seed; -1 means random')
-
-    # --- Generation arguments ---
     parser.add_argument('--negative_prompt', type=str, default="vivid color, static, blur details, text, style, painting, picture, still, gray, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, malformed, deformed, bad anatomy, fused fingers, still image, messy background, many people in the background, walking backwards", help='Negative prompt')
     parser.add_argument('--width', type=int, default=512, help='Video width')
     parser.add_argument('--height', type=int, default=512, help='Video height')
     parser.add_argument('--num_inference_steps', type=int, default=50, help='Number of inference steps')
     parser.add_argument('--fps', type=int, default=25, help='Video frame rate')
-
-    # --- Model and optimization arguments ---
-    parser.add_argument('--repo_id', type=str, default="FrancisRing/StableAvatar", help='Repo ID of the Hugging Face model')
-    parser.add_argument('--gpu_memory_mode', type=str, default="model_cpu_offload", choices=["Normal", "model_cpu_offload", "model_cpu_offloadand_qfloat8", "sequential_cpu_offload"], help='GPU memory optimization mode')
-
+    parser.add_argument('--gpu_memory_mode', type=str, default="model_cpu_offload", choices=["Normal", "model_cpu_offload"], help='GPU memory optimization mode')
+    parser.add_argument('--model_version', type=str, default="square", choices=["square", "rec_vec"], help='StableAvatar model version')
     args = parser.parse_args()
 
-    # --- 1. Download models ---
     print("--- Step 1: Checking and downloading models ---")
-    REPO_ID = args.repo_id
     repo_root = snapshot_download(
-        repo_id=REPO_ID,
-        allow_patterns=[
-            "StableAvatar-1.3B/*",
-            "Wan2.1-Fun-V1.1-1.3B-InP/*",
-            "wav2vec2-base-960h/*",
-            "assets/**",
-            "Kim_Vocal_2.onnx",
-            "example_case/**",  # make sure the example files are downloaded
-            "deepspeed_config/**",
-        ],
+        repo_id="FrancisRing/StableAvatar",
+        allow_patterns=["StableAvatar-1.3B/*", "Wan2.1-Fun-V1.1-1.3B-InP/*", "wav2vec2-base-960h/*", "example_case/**", "deepspeed_config/**"],
     )
     print("Model files are ready.")
 
-    # --- 2. Process input files ---
     print("\n--- Step 2: Processing input files ---")
     timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-
-    # Create a temporary directory
     temp_dir = f"temp_{timestamp}"
    os.makedirs(temp_dir, exist_ok=True)
 
-    # Process the image
-    image_ext = os.path.splitext(args.input_image)[1] or '.png'
-    local_image_path = os.path.join(temp_dir, f"input_image{image_ext}")
-    final_image_path = download_file(args.input_image, local_image_path)
+    # <<< FIX 2: robust path handling >>>
+    # Handle the image path
+    input_image_path = args.input_image
+    # If it is neither a URL nor an absolute path, treat it as relative to repo_root
+    if not input_image_path.startswith(('http', '/')):
+        input_image_path = os.path.join(repo_root, input_image_path)
+
+    local_image_path = os.path.join(temp_dir, os.path.basename(input_image_path))
+    final_image_path = download_file(input_image_path, local_image_path)
     if not final_image_path:
-        shutil.rmtree(temp_dir)
-        return
+        shutil.rmtree(temp_dir); return
 
-    # Process the audio
-    audio_ext = os.path.splitext(args.input_audio)[1] or '.wav'
-    local_audio_path = os.path.join(temp_dir, f"input_audio{audio_ext}")
-    final_audio_path = download_file(args.input_audio, local_audio_path)
+    # Handle the audio path
+    input_audio_path = args.input_audio
+    if not input_audio_path.startswith(('http', '/')):
+        input_audio_path = os.path.join(repo_root, input_audio_path)
+
+    local_audio_path = os.path.join(temp_dir, os.path.basename(input_audio_path))
+    final_audio_path = download_file(input_audio_path, local_audio_path)
     if not final_audio_path:
-        shutil.rmtree(temp_dir)
-        return
+        shutil.rmtree(temp_dir); return
+    # <<< END OF FIX 2 >>>
 
-    # --- 3. Load models ---
-    print("\n--- Step 3: Loading models (this may take a while) ---")
-    pipeline, transformer3d, vae = setup_models(repo_root)
+    print("\n--- Step 3: Loading models ---")
+    pipeline, transformer3d, vae = setup_models(repo_root, args.model_version)
     print("Models loaded.")
 
-    # --- 4. Run inference ---
     print("\n--- Step 4: Running inference ---")
     run_inference(
-        pipeline=pipeline,
-        transformer3d=transformer3d,
-        vae=vae,
-        image_path=final_image_path,
-        audio_path=final_audio_path,
-        prompt=args.prompt,
-        negative_prompt=args.negative_prompt,
-        seed=args.seed,
-        output_filename=f"output_{timestamp}",
-        gpu_memory_mode=args.gpu_memory_mode,
-        width=args.width,
-        height=args.height,
-        num_inference_steps=args.num_inference_steps,
-        fps=args.fps,
+        pipeline=pipeline, transformer3d=transformer3d, vae=vae,
+        image_path=final_image_path, audio_path=final_audio_path,
+        prompt=args.prompt, negative_prompt=args.negative_prompt,
+        seed=args.seed, output_filename=f"output_{timestamp}",
+        gpu_memory_mode=args.gpu_memory_mode, width=args.width,
+        height=args.height, num_inference_steps=args.num_inference_steps,
+        fps=args.fps
    )
 
-    # --- 5. Cleanup ---
     print("\n--- Step 5: Cleaning up temporary files ---")
     try:
         shutil.rmtree(temp_dir)
@@ -345,3 +266,4 @@ def main():
 
 if __name__ == "__main__":
     main()
+
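
Note: the new weight-loading step in setup_models (FIX 1 above) follows a common PyTorch pattern: load a .pt checkpoint on CPU, unwrap an optional "state_dict" container, and apply it non-strictly so fine-tuned weights can be layered onto an already-constructed base model. A minimal standalone sketch of that pattern; the helper name and path are illustrative, not part of the commit:

    import os
    import torch
    import torch.nn as nn

    def load_finetuned_weights(model: nn.Module, checkpoint_path: str) -> nn.Module:  # hypothetical helper
        # Fail early with a clear message, as the commit does.
        if not os.path.exists(checkpoint_path):
            raise FileNotFoundError(f"Weight file not found: {checkpoint_path}")
        state_dict = torch.load(checkpoint_path, map_location="cpu")
        # Some trainers save {"state_dict": {...}}; plain torch.save stores the tensors directly.
        state_dict = state_dict.get("state_dict", state_dict)
        # strict=False tolerates missing/unexpected keys, e.g. modules the fine-tune added or dropped.
        missing, unexpected = model.load_state_dict(state_dict, strict=False)
        print(f"Missing keys: {len(missing)}; unexpected keys: {len(unexpected)}")
        return model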
 
 
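FIX 2 reduces to one resolution rule: an input that is neither a URL nor an absolute path is joined onto the snapshot_download root, so the bundled example_case assets resolve without manual copying. A minimal sketch of that rule, with a hypothetical resolve_input helper:

    import os

    def resolve_input(path_or_url: str, repo_root: str) -> str:  # hypothetical helper
        # URLs and absolute paths pass through; everything else is relative to the snapshot.
        if not path_or_url.startswith(('http', '/')):
            return os.path.join(repo_root, path_or_url)
        return path_or_url

With the new flags in place, a typical invocation of the updated script (assuming it is run directly as app.py) might look like:

    python app.py --model_version rec_vec --seed -1 --gpu_memory_mode model_cpu_offload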