import gradio as gr import torch import torchvision from diffusers import I2VGenXLPipeline, DiffusionPipeline from torchvision.transforms.functional import to_tensor from PIL import Image from utils import create_progress_updater if gr.NO_RELOAD: n_sdxl_steps = 50 n_i2v_steps = 50 high_noise_frac = 0.8 negative_prompt = "Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms" generator = torch.manual_seed(8888) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") total_steps = n_sdxl_steps + n_i2v_steps print("Device:", device) base = DiffusionPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True, ) refiner = DiffusionPipeline.from_pretrained( "stabilityai/stable-diffusion-xl-refiner-1.0", text_encoder_2=base.text_encoder_2, vae=base.vae, torch_dtype=torch.float16, use_safetensors=True, variant="fp16", ) pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16") base.to("cuda") refiner.to("cuda") pipeline.to("cuda") base.unet = torch.compile(base.unet, mode="reduce-overhead", fullgraph=True) refiner.unet = torch.compile(refiner.unet, mode="reduce-overhead", fullgraph=True) pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True) def generate(prompt: str, progress=gr.Progress()): progress((0, 100), desc="Generating first frame...") image = base( prompt=prompt, num_inference_steps=n_sdxl_steps, denoising_end=high_noise_frac, output_type="latent", callback_on_step_end=create_progress_updater( start=0, total=total_steps, desc="Generating first frame...", progress=progress, ), ).images[0] progress((n_sdxl_steps * high_noise_frac, total_steps), desc="Refining first frame...") image = refiner( prompt=prompt, num_inference_steps=n_sdxl_steps, denoising_start=high_noise_frac, image=image, callback_on_step_end=create_progress_updater( start=n_sdxl_steps * high_noise_frac, total=total_steps, desc="Refining first frame...", progress=progress, ), ).images[0] image = to_tensor(image) progress((n_sdxl_steps, total_steps), desc="Generating video...") frames: list[Image.Image] = pipeline( prompt=prompt, image=image, num_inference_steps=50, negative_prompt=negative_prompt, guidance_scale=9.0, generator=generator, decode_chunk_size=2, num_frames=32, ).frames[0] progress((total_steps - 1, total_steps), desc="Finalizing...") frames = [to_tensor(frame.convert("RGB")).mul(255).byte().permute(1, 2, 0) for frame in frames] frames = torch.stack(frames) torchvision.io.write_video("video.mp4", frames, fps=16) return "video.mp4" app = gr.Interface( fn=generate, inputs=["text"], outputs=gr.Video() ) if __name__ == "__main__": app.launch()