Spaces:

jonluca
/

HunyuanVideo

Runtime error

File size: 7,310 Bytes

d3091ae
 
 
 
 
 
 
792b20f
b46297c
d3091ae
 
53bdc9e
d3091ae
 
 
a914cfe
8564a6a
d3091ae
5adf87d
a07ca9d
566bd68
 
 
509de15
566bd68
 
 
 
509de15
566bd68
 
 
 
a07ca9d
566bd68
 
 
 
 
 
 
 
 
41a9d22
d3091ae
566bd68
29be167
d3091ae
5d1f182
 
 
 
 
 
 
 
 
4426f1f
5d1f182
 
 
 
 
 
 
 
 
 
 
 
 
 
d3091ae
 
 
 
 
 
 
 
 
 
4426f1f
4c6de9b
4426f1f
4c6de9b
 
d3091ae
 
 
 
4426f1f
c4d33cd
 
 
566bd68
d3091ae
 
 
566bd68
942edfa
566bd68
942edfa
566bd68
 
 
d3091ae
 
566bd68
690f172
d3091ae
690f172
 
 
 
 
 
 
 
d3091ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
792b20f
d3091ae
 
 
 
 
 
 
 
792b20f
d3091ae
792b20f
8029cd1
 
d3091ae
5adf87d
d3091ae
 
 
8029cd1
792b20f
d3091ae
5adf87d
792b20f
070c1ad
8395168
070c1ad
 
8395168
d3091ae
 
5d1f182
d3091ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5cc09f2
792b20f

import os
import time
from pathlib import Path
from loguru import logger
from datetime import datetime
import gradio as gr
import random
import spaces
import torch

from hyvideo.utils.file_utils import save_videos_grid
from hyvideo.utils.preprocess_text_encoder_tokenizer_utils import preprocess_text_encoder_tokenizer
from hyvideo.config import parse_args
from hyvideo.inference import HunyuanVideoSampler
from hyvideo.constants import NEGATIVE_PROMPT
from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
from diffusers.utils import export_to_video

from huggingface_hub import snapshot_download

# if torch.cuda.device_count() > 0:
#     snapshot_download(repo_id="tencent/HunyuanVideo", repo_type="model", local_dir="ckpts", force_download=False)
#     snapshot_download(repo_id="xtuner/llava-llama-3-8b-v1_1-transformers", repo_type="model", local_dir="ckpts/llava-llama-3-8b-v1_1-transformers", force_download=True)

#     class Args:
#         def __init__(self, input_dir, output_dir):
#             self.input_dir = input_dir
#             self.output_dir = output_dir

#     # Create the object
#     args = Args("ckpts/llava-llama-3-8b-v1_1-transformers", "ckpts/text_encoder")
#     preprocess_text_encoder_tokenizer(args)
#     snapshot_download(repo_id="openai/clip-vit-large-patch14", repo_type="model", local_dir="ckpts/text_encoder_2", force_download=True)

def initialize_model():
    """Build the HunyuanVideo diffusers pipeline and move it to CUDA.

    The transformer is loaded separately from the ``transformer`` subfolder
    with ``torch_dtype=torch.bfloat16`` and handed to the pipeline, which is
    itself loaded with ``torch_dtype=torch.float16``. VAE tiling is enabled
    before the pipeline is moved to the ``"cuda"`` device.

    Returns:
        The configured ``HunyuanVideoPipeline`` instance.
    """
    repo = "hunyuanvideo-community/HunyuanVideo"

    dit = HunyuanVideoTransformer3DModel.from_pretrained(
        repo,
        subfolder="transformer",
        torch_dtype=torch.bfloat16,
    )
    pipeline = HunyuanVideoPipeline.from_pretrained(
        repo,
        transformer=dit,
        torch_dtype=torch.float16,
    )
    pipeline.vae.enable_tiling()
    pipeline.to("cuda")
    return pipeline

# Build the pipeline once at import time; generate_video() hands this shared
# module-level instance to generate_video_gpu() on every request.
model = initialize_model()

def generate_video(
    prompt,
    resolution,
    video_length,
    seed,
    num_inference_steps,
    guidance_scale,
    flow_shift,
    embedded_guidance_scale
):
    """Entry point wired to the UI button; delegates to ``generate_video_gpu``.

    Logs the prompt, then forwards every argument unchanged, prepending the
    module-level ``model`` as the first positional argument.

    Returns:
        Whatever ``generate_video_gpu`` returns.
    """
    print("generate_video (prompt: " + prompt + ")")
    forwarded = (
        prompt,
        resolution,
        video_length,
        seed,
        num_inference_steps,
        guidance_scale,
        flow_shift,
        embedded_guidance_scale,
    )
    return generate_video_gpu(model, *forwarded)

@spaces.GPU(duration=120)
def generate_video_gpu(
    model,
    prompt,
    resolution,
    video_length,
    seed,
    num_inference_steps,
    guidance_scale,
    flow_shift,
    embedded_guidance_scale
):
    """Run the pipeline for one prompt and export the frames to a video file.

    Args:
        model: Callable pipeline; invoked with keyword arguments and expected
            to return an object whose ``frames[0]`` holds the generated frames.
        prompt: Text prompt (a ``str``; it is concatenated into a log line).
        resolution: ``"<width>x<height>"`` string, e.g. ``"832x624"``.
        video_length: Passed to the pipeline as ``num_frames``.
        seed: ``-1`` leaves generation unseeded; any other value is passed to
            ``torch.manual_seed``.
        num_inference_steps: Passed through to the pipeline.
        guidance_scale: Passed through to the pipeline.
        flow_shift: Accepted for interface compatibility; not used here.
        embedded_guidance_scale: Accepted for interface compatibility; not
            used here.

    Returns:
        The value returned by ``export_to_video`` (the path of the written
        video file per the diffusers API), or ``None`` when no CUDA device is
        visible.

    Raises:
        ValueError: If ``resolution`` is not two integers separated by ``x``.
    """
    print("generate_video_gpu (prompt: " + prompt + ")")
    if torch.cuda.device_count() == 0:
        gr.Warning("Set this space to GPU config to make it work.")
        return None

    width, height = (int(side) for side in resolution.split("x"))

    # -1 is the UI's "random" sentinel: pass no generator so the pipeline
    # draws its own noise; otherwise seed for reproducible output.
    generator = None if seed == -1 else torch.manual_seed(seed)

    # NOTE(review): flow_shift and embedded_guidance_scale are never forwarded
    # to the pipeline - confirm whether the UI's "Embedded Guidance Scale" is
    # the value that should be sent as the pipeline's guidance_scale.
    print("Predicting video...")
    frames = model(
        prompt=prompt,
        height=height,
        width=width,
        num_frames=video_length,
        generator=generator,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale
    ).frames[0]

    # Return the path produced by export_to_video; the original returned an
    # undefined name (`video_path`) and raised NameError after every run.
    output_video = export_to_video(frames, fps=15)
    return output_video


def create_demo(model_path):
    """Assemble the Gradio Blocks UI for HunyuanVideo generation.

    The layout contains an optional red notice (rendered only when
    ``torch.cuda.device_count()`` is zero), a title, the prompt / resolution /
    length / step controls, an "Advanced Options" accordion, a generate
    button, a video output, and a closing "Alternatives" note. Clicking the
    button calls ``generate_video`` with the eight control values and routes
    its result to the video component.

    Args:
        model_path: Not read anywhere in this function.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    resolution_choices = [
        # 720p
        ("1280x720 (16:9, 720p)", "1280x720"),
        ("720x1280 (9:16, 720p)", "720x1280"),
        ("1104x832 (4:3, 720p)", "1104x832"),
        ("832x1104 (3:4, 720p)", "832x1104"),
        ("960x960 (1:1, 720p)", "960x960"),
        # 540p
        ("960x544 (16:9, 540p)", "960x544"),
        ("544x960 (9:16, 540p)", "544x960"),
        ("832x624 (4:3, 540p)", "832x624"),
        ("624x832 (3:4, 540p)", "624x832"),
        ("720x720 (1:1, 540p)", "720x720"),
    ]
    length_choices = [
        ("2s(65f)", 65),
        ("5s(129f)", 129),
    ]

    with gr.Blocks() as demo:
        # Without a CUDA device, show the banner asking the user to duplicate
        # the space onto GPU hardware.
        if not torch.cuda.device_count():
            with gr.Row():
                gr.HTML("""
                    <p style="background-color: red;"><big><big><big><b>⚠️To use <i>Hunyuan Video</i>, <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/HunyuanVideo?duplicate=true">duplicate this space</a> and set a GPU with 80 GB VRAM.</b>
    
                    You can't use <i>Hunyuan Video</i> directly here because this space runs on a CPU, which is not enough for <i>Hunyuan Video</i>. Please provide <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/HunyuanVideo/discussions/new">feedback</a> if you have issues.
                    </big></big></big></p>
                    """)
        gr.Markdown("# Hunyuan Video Generation")

        with gr.Row():
            with gr.Column():
                prompt_box = gr.Textbox(
                    label="Prompt",
                    value="A cat walks on the grass, realistic style.",
                )
                with gr.Row():
                    resolution_dd = gr.Dropdown(
                        choices=resolution_choices,
                        value="832x624",
                        label="Resolution",
                    )
                    length_dd = gr.Dropdown(
                        label="Video Length",
                        choices=length_choices,
                        value=65,
                    )
                steps_slider = gr.Slider(
                    minimum=1,
                    maximum=100,
                    value=5,
                    step=1,
                    label="Number of Inference Steps",
                )

                with gr.Accordion("Advanced Options", open=False):
                    with gr.Column():
                        seed_slider = gr.Slider(
                            label="Seed (-1 for random)",
                            value=-1,
                            minimum=-1,
                            maximum=2**63 - 1,
                            step=1,
                        )
                        guidance_slider = gr.Slider(
                            minimum=1.0,
                            maximum=20.0,
                            value=1.0,
                            step=0.5,
                            label="Guidance Scale",
                        )
                        flow_shift_slider = gr.Slider(
                            minimum=0.0,
                            maximum=10.0,
                            value=7.0,
                            step=0.1,
                            label="Flow Shift",
                        )
                        embedded_guidance_slider = gr.Slider(
                            minimum=1.0,
                            maximum=20.0,
                            value=6.0,
                            step=0.5,
                            label="Embedded Guidance Scale",
                        )

                run_button = gr.Button(value="🚀 Generate Video", variant="primary")

            with gr.Row():
                video_out = gr.Video(label="Generated Video", autoplay=True)

        gr.Markdown("""
## **Alternatives**
If you can't use _Hunyuan Video_, you can use _[CogVideoX](https://huggingface.co/spaces/THUDM/CogVideoX-5B-Space)_ or _[LTX Video Playground](https://huggingface.co/spaces/Lightricks/LTX-Video-Playground)_ instead.
                    """)

        # Order must match generate_video's positional parameters.
        click_inputs = [
            prompt_box,
            resolution_dd,
            length_dd,
            seed_slider,
            steps_slider,
            guidance_slider,
            flow_shift_slider,
            embedded_guidance_slider,
        ]
        run_button.click(fn=generate_video, inputs=click_inputs, outputs=video_out)

    return demo

if __name__ == "__main__":
    # Script entry point: set the analytics env var, build the UI, then
    # enable the request queue and start serving.
    os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
    demo = create_demo("ckpts")
    demo.queue(10)
    demo.launch()