"""Gradio demo: text prompt -> SDXL still image -> I2VGenXL video clip."""

import gradio as gr
import torch
import torchvision
from diffusers import DiffusionPipeline, I2VGenXLPipeline
from PIL import Image
from torchvision.transforms.functional import to_tensor

# Where the rendered clip is written and the frame rate it is encoded at.
# NOTE(review): the path is fixed, so each request overwrites the previous
# file; revisit if the app is ever served with concurrent workers.
VIDEO_PATH = "video.mp4"
VIDEO_FPS = 8

if gr.NO_RELOAD:
    # Model loading, the move to CUDA and torch.compile are expensive, so they
    # are kept behind gr.NO_RELOAD (Gradio's guard for code that should not be
    # re-run in reload mode).

    # Number of denoising steps used by every pipeline stage.
    n_steps = 50
    # Point in the denoising schedule where the base model stops
    # (denoising_end) and the refiner takes over (denoising_start).
    high_noise_frac = 0.8
    negative_prompt = (
        "Distorted, discontinuous, Ugly, blurry, low resolution, motionless, "
        "static, disfigured, disconnected limbs, Ugly faces, incomplete arms"
    )
    # Seeded once at import time and then shared by every call to generate().
    # NOTE(review): the generator state advances with use, so successive
    # requests presumably do not reproduce each other - confirm that is wanted.
    generator = torch.manual_seed(8888)

    base = DiffusionPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0",
        torch_dtype=torch.float16,
        variant="fp16",
        use_safetensors=True,
    )
    # The refiner reuses the base model's second text encoder and VAE instead
    # of loading its own copies.
    refiner = DiffusionPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-refiner-1.0",
        text_encoder_2=base.text_encoder_2,
        vae=base.vae,
        torch_dtype=torch.float16,
        use_safetensors=True,
        variant="fp16",
    )
    pipeline = I2VGenXLPipeline.from_pretrained(
        "ali-vilab/i2vgen-xl",
        torch_dtype=torch.float16,
        variant="fp16",
    )

    base.to("cuda")
    refiner.to("cuda")
    pipeline.to("cuda")

    base.unet = torch.compile(base.unet, mode="reduce-overhead", fullgraph=True)
    refiner.unet = torch.compile(refiner.unet, mode="reduce-overhead", fullgraph=True)
    pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)


def generate(prompt: str) -> str:
    """Generate a short video for *prompt* and return the path to the file.

    The prompt is first rendered to a still image with the SDXL base model,
    finished by the SDXL refiner, and the resulting image is then animated
    with I2VGenXL. The frames are written to ``VIDEO_PATH``.

    Args:
        prompt: Text description used for both the image and the video stage.

    Returns:
        The path of the written video file (``VIDEO_PATH``).
    """
    # Stage 1: the base model denoises up to high_noise_frac and hands over
    # latents rather than a decoded image.
    # NOTE(review): [0] takes a single item from the returned latents; the
    # diffusers SDXL example forwards the whole `.images` batch - confirm the
    # installed version accepts this form.
    latents = base(
        prompt=prompt,
        num_inference_steps=n_steps,
        denoising_end=high_noise_frac,
        output_type="latent",
    ).images[0]

    # Stage 2: the refiner resumes from the same point in the schedule.
    still = refiner(
        prompt=prompt,
        num_inference_steps=n_steps,
        denoising_start=high_noise_frac,
        image=latents,
    ).images[0]
    conditioning = to_tensor(still)

    # Stage 3: animate the still image. The step count reuses n_steps so the
    # three stages cannot silently drift apart.
    pil_frames: list[Image.Image] = pipeline(
        prompt=prompt,
        image=conditioning,
        num_inference_steps=n_steps,
        negative_prompt=negative_prompt,
        guidance_scale=9.0,
        generator=generator,
    ).frames[0]

    # to_tensor yields channel-first floats; scale to bytes and move channels
    # last so the stacked result is a (T, H, W, C) uint8 video tensor.
    frame_tensors = [
        to_tensor(frame.convert("RGB")).mul(255).byte().permute(1, 2, 0)
        for frame in pil_frames
    ]
    video = torch.stack(frame_tensors)

    torchvision.io.write_video(VIDEO_PATH, video, fps=VIDEO_FPS)
    return VIDEO_PATH


app = gr.Interface(
    fn=generate,
    inputs=["text"],
    outputs=gr.Video(),
)

if __name__ == "__main__":
    app.launch()