|
import gradio as gr |
|
import torch |
|
import torchvision |
|
from diffusers import I2VGenXLPipeline, DiffusionPipeline |
|
from torchvision.transforms.functional import to_tensor |
|
from PIL import Image |
|
|
|
if gr.NO_RELOAD:
    # One-time, heavyweight setup. The guard is presumably here so that
    # Gradio's reload mode does not repeat the checkpoint loading below on
    # every reload -- see the Gradio documentation for `gr.NO_RELOAD`.
    # NOTE(review): the extent of this guarded block is inferred; confirm
    # that every statement below belongs under the guard.

    # Settings read by `generate`.
    negative_prompt = (
        "Distorted, discontinuous, Ugly, blurry, low resolution, "
        "motionless, static, disfigured, disconnected limbs, "
        "Ugly faces, incomplete arms"
    )
    n_steps = 40
    high_noise_frac = 0.8  # not referenced anywhere in the visible code

    # Seeds torch's global RNG; the returned generator is later handed to
    # the image-to-video pipeline.
    generator = torch.manual_seed(8888)

    # Stage 1 (text -> image): SDXL base checkpoint in half precision.
    base = DiffusionPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0",
        torch_dtype=torch.float16,
        variant="fp16",
        use_safetensors=True,
    )
    base.enable_model_cpu_offload()

    # Stage 2 (image -> video): I2VGen-XL checkpoint in half precision.
    pipeline = I2VGenXLPipeline.from_pretrained(
        "ali-vilab/i2vgen-xl",
        torch_dtype=torch.float16,
        variant="fp16",
    )
    pipeline.enable_model_cpu_offload()
    # Chunked UNet forward pass; presumably to lower peak memory use.
    pipeline.unet.enable_forward_chunking()
|
|
|
def generate(prompt: str):
    """Turn a text prompt into a short video and return the video's path.

    The prompt is first rendered to a single still image with ``base``.
    That image and the same prompt are then passed to ``pipeline`` to
    produce the frames, which are encoded to ``video.mp4`` at 4 fps.

    Side effects: writes ``frame.jpg`` (the intermediate still) and
    ``video.mp4`` relative to the current working directory, using the same
    fixed names on every call.

    Args:
        prompt: Text description used for both generation stages.

    Returns:
        The string ``"video.mp4"``.
    """
    # Stage 1: text -> still image.
    still = base(prompt=prompt, num_inference_steps=n_steps).images[0]
    still.save("frame.jpg")

    # Stage 2: still image (as a tensor) + prompt -> video frames.
    video_frames: list[Image.Image] = pipeline(
        prompt=prompt,
        image=to_tensor(still),
        num_inference_steps=50,
        negative_prompt=negative_prompt,
        guidance_scale=9.0,
        generator=generator,
        decode_chunk_size=6,
    ).frames[0]

    # Convert each frame to a byte tensor with the channel axis moved last,
    # then stack them along a new leading (frame) axis for the video writer.
    encoded = []
    for frame in video_frames:
        pixels = to_tensor(frame.convert("RGB"))
        encoded.append(pixels.mul(255).byte().permute(1, 2, 0))

    torchvision.io.write_video("video.mp4", torch.stack(encoded), fps=4)
    return "video.mp4"
|
|
|
# Web UI: one text input whose value is passed to `generate`; the returned
# file path is displayed in a video component.
app = gr.Interface(fn=generate, inputs=["text"], outputs=gr.Video())
|
|
|
if __name__ == "__main__":
    # Start the Gradio server only when this file is run as a script.
    app.launch()
|
|
|
|