"""Gradio demo: image-to-video with Stable Video Diffusion + AnimateLCM-SVD weights."""

import os
import random
from glob import glob
from pathlib import Path
from typing import Optional

import gradio as gr
import spaces
import torch
from diffusers.utils import load_image, export_to_video
from PIL import Image
from safetensors import safe_open

# import gradio.helpers
from lcm_scheduler import AnimateLCMSVDStochasticIterativeScheduler
from pipeline import StableVideoDiffusionPipeline


def get_safetensors_files():
    """Return the names of all ``.safetensors`` files found in ``./safetensors``."""
    models_dir = "./safetensors"
    return [
        name for name in os.listdir(models_dir) if name.endswith(".safetensors")
    ]


def model_select(selected_file):
    """Load UNet weights from ``./safetensors/<selected_file>`` into the global pipeline.

    The UNet is moved to the CPU, its state dict is replaced with the tensors
    read from the file (strict key matching, so a missing or unexpected key
    raises), and the UNet is then moved back to CUDA.

    Args:
        selected_file: File name (not a full path) inside ``./safetensors``.
    """
    print("load model weights", selected_file)
    pipe.unet.cpu()
    file_path = os.path.join("./safetensors", selected_file)
    with safe_open(file_path, framework="pt", device="cpu") as f:
        state_dict = {key: f.get_tensor(key) for key in f.keys()}
    # strict=True raises on any key mismatch, so the returned
    # (missing, unexpected) lists carry no extra information here.
    pipe.unet.load_state_dict(state_dict, strict=True)
    pipe.unet.cuda()
    # Drop the CPU copy of the weights as soon as they have been loaded.
    del state_dict


noise_scheduler = AnimateLCMSVDStochasticIterativeScheduler(
    num_train_timesteps=40,
    sigma_min=0.002,
    sigma_max=700.0,
    sigma_data=1.0,
    s_noise=1.0,
    rho=7,
    clip_denoised=False,
)
pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    scheduler=noise_scheduler,
    torch_dtype=torch.float16,
    variant="fp16",
)
# NOTE(review): the pipeline is moved to CUDA, then CPU offload is enabled, and
# model_select() finally calls unet.cuda() again — confirm this ordering gives
# the intended memory behaviour with the custom pipeline.
pipe.to("cuda")
pipe.enable_model_cpu_offload()  # for smaller cost
model_select("AnimateLCM-SVD-xt-1.1.safetensors")
# pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) # for faster inference

max_64_bit_int = 2**63 - 1


@spaces.GPU
def sample(
    image: Image.Image,
    seed: Optional[int] = 42,
    randomize_seed: bool = False,
    motion_bucket_id: int = 80,
    fps_id: int = 8,
    max_guidance_scale: float = 1.2,
    min_guidance_scale: float = 1,
    width: int = 1024,
    height: int = 576,
    num_inference_steps: int = 4,
    decoding_t: int = 4,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
    output_folder: str = "outputs_gradio",
):
    """Generate a video from a single image and write it to ``output_folder``.

    Args:
        image: Conditioning image; RGBA input is converted to RGB first.
        seed: Seed passed to ``torch.manual_seed``; ignored when
            ``randomize_seed`` is true.
        randomize_seed: Draw a fresh seed in ``[0, max_64_bit_int]``.
        motion_bucket_id: Forwarded to the pipeline (amount of motion).
        fps_id: Frame rate used when exporting the video file.
        max_guidance_scale: Forwarded to the pipeline.
        min_guidance_scale: Forwarded to the pipeline.
        width: Output width forwarded to the pipeline.
        height: Output height forwarded to the pipeline.
        num_inference_steps: Number of sampling steps forwarded to the pipeline.
        decoding_t: Passed to the pipeline as ``decode_chunk_size``.
        output_folder: Directory that receives the ``NNNNNN.mp4`` files;
            created if missing.

    Returns:
        A ``(video_path, seed)`` tuple: the path of the written MP4 and the
        seed that was actually used.

    Raises:
        gr.Error: If no image was supplied.
    """
    if image is None:
        # Without this guard a click with no upload fails on `image.mode`
        # with an opaque AttributeError.
        raise gr.Error("Please upload an image before generating.")

    if image.mode == "RGBA":
        image = image.convert("RGB")

    if randomize_seed:
        seed = random.randint(0, max_64_bit_int)
    generator = torch.manual_seed(seed)

    os.makedirs(output_folder, exist_ok=True)
    # The next file name is derived from the number of videos already present.
    base_count = len(glob(os.path.join(output_folder, "*.mp4")))
    video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")

    with torch.autocast("cuda"):
        frames = pipe(
            image,
            decode_chunk_size=decoding_t,
            generator=generator,
            motion_bucket_id=motion_bucket_id,
            height=height,
            width=width,
            num_inference_steps=num_inference_steps,
            min_guidance_scale=min_guidance_scale,
            max_guidance_scale=max_guidance_scale,
        ).frames[0]
    export_to_video(frames, video_path, fps=fps_id)
    # Re-seed the global RNG so its state after the call depends only on `seed`.
    torch.manual_seed(seed)

    return video_path, seed


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            image = gr.Image(label="Upload your image", type="pil")
            generate_btn = gr.Button("Generate")
        video = gr.Video()
    seed = gr.Slider(
        label="Seed",
        value=42,
        randomize=False,
        minimum=0,
        maximum=max_64_bit_int,
        step=1,
    )
    randomize_seed = gr.Checkbox(label="Randomize seed", value=False)
    motion_bucket_id = gr.Slider(
        label="Motion bucket id",
        info="Controls how much motion to add/remove from the image",
        value=80,
        minimum=1,
        maximum=255,
    )
    fps_id = gr.Slider(
        label="Frames per second",
        info="The length of your video in seconds will be 25/fps",
        value=8,
        minimum=5,
        maximum=30,
    )
    # note: we want something that is close to 16:9 (1.7777)
    # 576 / 320 = 1.8
    # 448 / 256 = 1.75
    width = gr.Slider(
        label="Width of input image",
        info="It should be divisible by 64",
        value=576,  # 256, 320, 384, 448
        minimum=256,
        maximum=2048,
        step=64,
    )
    height = gr.Slider(
        label="Height of input image",
        info="It should be divisible by 64",
        value=320,  # 256, 320, 384, 448
        minimum=256,
        maximum=1152,
        # Same 64-pixel grid as the width slider, so the UI cannot produce a
        # height that contradicts the "divisible by 64" hint.
        step=64,
    )
    max_guidance_scale = gr.Slider(
        label="Max guidance scale",
        info="classifier-free guidance strength",
        value=1.2,
        minimum=1,
        maximum=2,
    )
    min_guidance_scale = gr.Slider(
        label="Min guidance scale",
        info="classifier-free guidance strength",
        value=1,
        minimum=1,
        maximum=1.5,
    )
    num_inference_steps = gr.Slider(
        label="Num inference steps",
        info="steps for inference",
        value=4,
        minimum=1,
        maximum=20,
        step=1,
    )

    generate_btn.click(
        fn=sample,
        inputs=[
            image,
            seed,
            randomize_seed,
            motion_bucket_id,
            fps_id,
            max_guidance_scale,
            min_guidance_scale,
            width,
            height,
            num_inference_steps,
        ],
        outputs=[video, seed],
        api_name="video",
    )

if __name__ == "__main__":
    demo.queue()
    demo.launch(show_error=True)