import os
import random
import uuid
from glob import glob
from pathlib import Path
from typing import Optional, Tuple

import gradio as gr
import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import export_to_video, load_image
from huggingface_hub import hf_hub_download
from PIL import Image

# The pipeline is loaded once at import time and shared by every request.
pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    torch_dtype=torch.float16,
    variant="fp16",
)
pipe.to("cuda")
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)

# Upper bound for the seed slider and for randomly drawn seeds.
max_64_bit_int = 2**63 - 1


# Function to sample video from the input image
def sample(
    image: Image.Image,
    seed: Optional[int] = 42,
    randomize_seed: bool = True,
    motion_bucket_id: int = 127,
    fps_id: int = 6,
    version: str = "svd_xt",
    cond_aug: float = 0.02,
    decoding_t: int = 3,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
    device: str = "cuda",
    output_folder: str = "outputs",
) -> Tuple[str, int]:
    """Generate a 25-frame video from ``image`` and write it as an MP4.

    Args:
        image: Conditioning image. Converted to RGB when in any other mode.
        seed: Seed for the torch RNG. Ignored when ``randomize_seed`` is true;
            a fresh seed is also drawn when this is ``None``.
        randomize_seed: Draw a random seed in ``[0, max_64_bit_int]``.
        motion_bucket_id: Forwarded to the pipeline unchanged.
        fps_id: Frame rate used when exporting the video file.
        version: Accepted for interface compatibility; not used here.
        cond_aug: Accepted for interface compatibility; not used here (the
            pipeline is called with a fixed ``noise_aug_strength=0.1``).
        decoding_t: Forwarded to the pipeline as ``decode_chunk_size``.
        device: Accepted for interface compatibility; not used here.
        output_folder: Directory the MP4 is written to (created if missing).

    Returns:
        ``(video_path, seed)`` - the path of the written file and the seed
        that was actually used.

    Raises:
        gr.Error: If no image was supplied.
    """
    if image is None:
        # Happens when "Generate" is clicked before an image is uploaded.
        raise gr.Error("Please upload an image first.")

    # The original only handled RGBA; palette, grayscale, etc. need the same
    # normalisation before being handed to the pipeline.
    if image.mode != "RGB":
        image = image.convert("RGB")

    if randomize_seed or seed is None:
        seed = random.randint(0, max_64_bit_int)
    # UI sliders may deliver the value as a float; report and use an int.
    seed = int(seed)
    generator = torch.manual_seed(seed)

    os.makedirs(output_folder, exist_ok=True)
    base_count = len(glob(os.path.join(output_folder, "*.mp4")))
    video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
    # A bare file count collides with an existing name once any earlier video
    # has been removed; advance until the name is free so nothing is clobbered.
    while os.path.exists(video_path):
        base_count += 1
        video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")

    frames = pipe(
        image,
        decode_chunk_size=decoding_t,
        generator=generator,
        motion_bucket_id=motion_bucket_id,
        noise_aug_strength=0.1,
        num_frames=25,
    ).frames[0]
    export_to_video(frames, video_path, fps=fps_id)

    # Re-seed the global RNG so its state after the call depends only on `seed`.
    torch.manual_seed(seed)

    return video_path, seed


# Function to resize the uploaded image
def resize_image(image, output_size=(1024, 576)):
    """Scale ``image`` to cover ``output_size`` and centre-crop it to that size.

    The image is resized (LANCZOS) so that it fully covers the target box
    while keeping its aspect ratio, then the overflow is cropped equally from
    both sides of the longer dimension.

    Args:
        image: PIL image to resize; ``None`` is passed through unchanged.
        output_size: Target ``(width, height)`` in pixels.

    Returns:
        The cropped PIL image, or ``None`` if ``image`` was ``None``.
    """
    if image is None:
        return None

    target_width, target_height = output_size[0], output_size[1]
    target_aspect = target_width / target_height
    image_aspect = image.width / image.height

    if image_aspect > target_aspect:
        # Wider than the target: match the height, crop left and right.
        new_height = target_height
        new_width = int(new_height * image_aspect)
        resized_image = image.resize((new_width, new_height), Image.LANCZOS)
        left = (new_width - target_width) / 2
        top = 0
        right = (new_width + target_width) / 2
        bottom = target_height
    else:
        # Taller than (or equal to) the target: match the width, crop top and bottom.
        new_width = target_width
        new_height = int(new_width / image_aspect)
        resized_image = image.resize((new_width, new_height), Image.LANCZOS)
        left = 0
        top = (new_height - target_height) / 2
        right = target_width
        bottom = (new_height + target_height) / 2

    cropped_image = resized_image.crop((left, top, right, bottom))
    return cropped_image


# Dynamically load image files from the 'images' directory
def get_example_images():
    """Return the ``.png`` and ``.jpg`` paths found in ``images/``, sorted.

    Sorting keeps the example order stable between runs; glob order is
    filesystem-dependent, and cached example outputs are matched to examples
    by position.
    """
    image_dir = "images/"
    image_files = glob(os.path.join(image_dir, "*.png")) + glob(os.path.join(image_dir, "*.jpg"))
    return sorted(image_files)


# Gradio interface setup
with gr.Blocks() as demo:
    gr.Markdown('''# Stable Video Diffusion using Image 2 Video XT
#### Research release: generate `4s` vid from a single image at (`25 frames` at `6 fps`).''')
    with gr.Row():
        with gr.Column():
            image = gr.Image(label="Upload your image", type="pil")
            generate_btn = gr.Button("Generate")
        video = gr.Video()
    with gr.Accordion("Advanced options", open=False):
        seed = gr.Slider(label="Seed", value=42, randomize=True, minimum=0, maximum=max_64_bit_int, step=1)
        randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
        motion_bucket_id = gr.Slider(label="Motion bucket id", value=127, minimum=1, maximum=255)
        fps_id = gr.Slider(label="Frames per second", value=6, minimum=5, maximum=30)

    # Normalise every upload to the target resolution before generation.
    image.upload(fn=resize_image, inputs=image, outputs=image, queue=False)
    generate_btn.click(
        fn=sample,
        inputs=[image, seed, randomize_seed, motion_bucket_id, fps_id],
        outputs=[video, seed],
        api_name="video",
    )

    # Dynamically load examples from the filesystem
    example_images = get_example_images()
    gr.Examples(
        examples=example_images,
        inputs=image,
        outputs=[video, seed],
        fn=sample,
        cache_examples=True,
    )

if __name__ == "__main__":
    demo.queue(max_size=20)
    demo.launch(share=True)