import os
import time
from pathlib import Path
from loguru import logger
from datetime import datetime
import gradio as gr
import random
import spaces
import torch
import PIL.Image
from typing import List
from hyvideo.utils.file_utils import save_videos_grid
from hyvideo.utils.preprocess_text_encoder_tokenizer_utils import preprocess_text_encoder_tokenizer
from hyvideo.config import parse_args
from hyvideo.inference import HunyuanVideoSampler
from hyvideo.constants import NEGATIVE_PROMPT
from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
from diffusers.utils import export_to_video
from huggingface_hub import snapshot_download

# One-time checkpoint download and text-encoder preprocessing, kept for reference:
# if torch.cuda.device_count() > 0:
#     snapshot_download(repo_id="tencent/HunyuanVideo", repo_type="model", local_dir="ckpts", force_download=False)
#     snapshot_download(repo_id="xtuner/llava-llama-3-8b-v1_1-transformers", repo_type="model", local_dir="ckpts/llava-llama-3-8b-v1_1-transformers", force_download=True)
#
#     class Args:
#         def __init__(self, input_dir, output_dir):
#             self.input_dir = input_dir
#             self.output_dir = output_dir
#
#     # Create the object
#     args = Args("ckpts/llava-llama-3-8b-v1_1-transformers", "ckpts/text_encoder")
#     preprocess_text_encoder_tokenizer(args)
#     snapshot_download(repo_id="openai/clip-vit-large-patch14", repo_type="model", local_dir="ckpts/text_encoder_2", force_download=True)


def initialize_model():
    # Load the HunyuanVideo transformer and pipeline from the Hugging Face Hub.
    model_id = "hunyuanvideo-community/HunyuanVideo"
    transformer = HunyuanVideoTransformer3DModel.from_pretrained(
        model_id, subfolder="transformer", torch_dtype=torch.bfloat16
    )
    model = HunyuanVideoPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch.float16)
    # Tiled VAE decoding lowers peak VRAM usage when decoding video latents.
    model.vae.enable_tiling()
    if torch.cuda.device_count() > 0:
        model.to("cuda")
    return model


model = initialize_model()


def generate_video(
    prompt,
    resolution,
    video_length,
    seed,
    num_inference_steps,
    guidance_scale,
    flow_shift,
    embedded_guidance_scale
):
    print("generate_video (prompt: " + prompt + ")")
    return generate_video_gpu(
        model,
        prompt,
        resolution,
        video_length,
        seed,
        num_inference_steps,
        guidance_scale,
        flow_shift,
        embedded_guidance_scale
    )


@spaces.GPU(duration=120)
def generate_video_gpu(
    model,
    prompt,
    resolution,
    video_length,
    seed,
    num_inference_steps,
    guidance_scale,
    flow_shift,
    embedded_guidance_scale
):
    print("generate_video_gpu (prompt: " + prompt + ")")
    if torch.cuda.device_count() == 0:
        gr.Warning("Set this space to GPU config to make it work.")
        return None

    # A seed of -1 means "random": leave the generator unset so the pipeline picks one.
    seed = None if seed == -1 else seed
    width, height = resolution.split("x")
    width, height = int(width), int(height)
    negative_prompt = ""  # not applicable in the inference

    print("Predicting video...")
    generator = None
    if seed is not None:
        generator = torch.manual_seed(seed)

    frames: List[PIL.Image.Image] = model(
        prompt=prompt,
        height=height,
        width=width,
        num_frames=video_length,
        generator=generator,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale
    ).frames[0]

    # export_to_video writes the frames to an .mp4 file and returns its path.
    video_path = export_to_video(frames, fps=15)
    return video_path


def create_demo(model_path):
    with gr.Blocks() as demo:
        if torch.cuda.device_count() == 0:
            with gr.Row():
                gr.HTML("""

⚠️ To use Hunyuan Video, duplicate this space and set a GPU with 80 GB VRAM. You can't use Hunyuan Video directly here because this space runs on a CPU, which is not powerful enough for Hunyuan Video. Please provide feedback if you have issues.

""") gr.Markdown("# Hunyuan Video Generation") with gr.Row(): with gr.Column(): prompt = gr.Textbox(label="Prompt", value="A cat walks on the grass, realistic style.") with gr.Row(): resolution = gr.Dropdown( choices=[ # 720p ("1280x720 (16:9, 720p)", "1280x720"), ("720x1280 (9:16, 720p)", "720x1280"), ("1104x832 (4:3, 720p)", "1104x832"), ("832x1104 (3:4, 720p)", "832x1104"), ("960x960 (1:1, 720p)", "960x960"), # 540p ("960x544 (16:9, 540p)", "960x544"), ("544x960 (9:16, 540p)", "544x960"), ("832x624 (4:3, 540p)", "832x624"), ("624x832 (3:4, 540p)", "624x832"), ("720x720 (1:1, 540p)", "720x720"), ], value="832x624", label="Resolution" ) video_length = gr.Dropdown( label="Video Length", choices=[ ("2s(65f)", 65), ("5s(129f)", 129), ], value=65, ) num_inference_steps = gr.Slider(1, 100, value=5, step=1, label="Number of Inference Steps") with gr.Accordion("Advanced Options", open=False): with gr.Column(): seed = gr.Slider(label="Seed (-1 for random)", value=-1, minimum=-1, maximum=2**63 - 1, step=1) guidance_scale = gr.Slider(1.0, 20.0, value=1.0, step=0.5, label="Guidance Scale") flow_shift = gr.Slider(0.0, 10.0, value=7.0, step=0.1, label="Flow Shift") embedded_guidance_scale = gr.Slider(1.0, 20.0, value=6.0, step=0.5, label="Embedded Guidance Scale") generate_btn = gr.Button(value = "🚀 Generate Video", variant = "primary") with gr.Row(): output = gr.Video(label = "Generated Video", autoplay = True) gr.Markdown(""" ## **Alternatives** If you can't use _Hunyuan Video_, you can use _[CogVideoX](https://huggingface.co/spaces/THUDM/CogVideoX-5B-Space)_ or _[LTX Video Playground](https://huggingface.co/spaces/Lightricks/LTX-Video-Playground)_ instead. """) generate_btn.click( fn=generate_video, inputs=[ prompt, resolution, video_length, seed, num_inference_steps, guidance_scale, flow_shift, embedded_guidance_scale ], outputs=output ) return demo if __name__ == "__main__": os.environ["GRADIO_ANALYTICS_ENABLED"] = "False" demo = create_demo("ckpts") demo.queue(10).launch()