"""Gradio app that animates a single image with Stable Video Diffusion.

The module loads the pipeline once at import time (caching the weights in
``./checkpoints``), exposes ``generate_video`` through a small Gradio UI and
an API endpoint named ``run``, and returns the generated clip as a base64
``data:`` URI.
"""
import torch._dynamo

# Must be set before anything triggers a dynamo compile.
torch._dynamo.config.suppress_errors = True

import base64
import os
import random
import uuid
from glob import glob
from pathlib import Path
from typing import Optional

import torch
import gradio as gr
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video
from PIL import Image
from huggingface_hub import login, hf_hub_download
import spaces

# Local cache of the fp16 weights; populated on first start-up.
model_directory = './checkpoints'

if os.path.exists(model_directory):
    pipe = StableVideoDiffusionPipeline.from_pretrained(
        model_directory,
        torch_dtype=torch.float16,
        variant="fp16",
    )
else:
    pipe = StableVideoDiffusionPipeline.from_pretrained(
        # Alternative repo id: "stabilityai/stable-video-diffusion-img2vid-xt-1-1"
        "vdo/stable-video-diffusion-img2vid-xt-1-1",
        torch_dtype=torch.float16,
        variant="fp16",
    )
    # Persist the download so later start-ups load from disk.
    pipe.save_pretrained(model_directory, variant="fp16")

pipe.to("cuda")

# Optional (currently disabled) compilation of the heavy sub-modules:
# pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
# pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead", fullgraph=True)

# Upper bound offered by the seed slider.
max_64_bit_int = 2**63 - 1


@spaces.GPU(duration=180)
def generate_video(
    image: Image.Image,
    seed: int,
    motion_bucket_id: int = 127,
    fps_id: int = 6,
    version: str = "svd_xt",
    cond_aug: float = 0.02,
    decoding_t: int = 3,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
    device: str = "cuda",
    output_folder: str = "outputs",
) -> str:
    """Generate a 25-frame video from ``image`` and return it as a data URI.

    Args:
        image: Input picture. It is resized/cropped by ``resize_image`` and
            converted to RGB before being passed to the pipeline.
        seed: Seed for the torch random generator used by the pipeline.
        motion_bucket_id: Forwarded to the pipeline as ``motion_bucket_id``.
        fps_id: Frame rate used when encoding the frames to MP4.
        version: Unused; kept so existing callers keep working.
        cond_aug: Unused; kept so existing callers keep working.
        decoding_t: Forwarded to the pipeline as ``decode_chunk_size``.
        device: Unused; the pipeline is moved to CUDA at import time.
        output_folder: Directory for the temporary MP4 file (created if
            missing).

    Returns:
        The encoded video as ``"data:video/mp4;base64,<payload>"``.

    Raises:
        gr.Error: If no image was supplied.
    """
    if image is None:
        raise gr.Error("Please upload an image before generating a video.")

    # Inputs are usually already 1024x576, but normalise anyway so other
    # sizes and aspect ratios do not reach the pipeline unchanged.
    image = resize_image(image)

    # Convert after resizing (same order as before for RGBA inputs); any
    # other non-RGB mode (L, LA, P, CMYK, ...) is normalised as well.
    if image.mode != "RGB":
        image = image.convert("RGB")

    generator = torch.manual_seed(seed)

    os.makedirs(output_folder, exist_ok=True)
    # A random name keeps concurrent requests from reading, overwriting or
    # deleting each other's file. A counter based on the number of existing
    # files is not unique because every file is removed after use.
    video_path = os.path.join(output_folder, f"{uuid.uuid4().hex}.mp4")

    try:
        # NOTE(review): noise_aug_strength is fixed at 0.1 and the cond_aug
        # parameter is ignored. Left as-is to keep outputs unchanged.
        frames = pipe(
            image,
            decode_chunk_size=decoding_t,
            generator=generator,
            motion_bucket_id=motion_bucket_id,
            noise_aug_strength=0.1,
            num_frames=25,
        ).frames[0]
        export_to_video(frames, video_path, fps=fps_id)

        # Read the encoded file back and wrap it in a base64 payload.
        with open(video_path, "rb") as video_file:
            video_base64 = base64.b64encode(video_file.read()).decode('utf-8')
    finally:
        # Always clean up, including when a step above fails, so a stale
        # video can never be served to a later request ("ghosting").
        Path(video_path).unlink(missing_ok=True)

    # Prepend the data URI header with the MIME type.
    return 'data:video/mp4;base64,' + video_base64


def resize_image(image, output_size=(1024, 576)):
    """Scale ``image`` to cover ``output_size`` and centre-crop to that size.

    The aspect ratio is preserved while scaling. The dimension that
    overshoots the target is then cropped equally on both sides.

    Args:
        image: A PIL image.
        output_size: Target ``(width, height)`` in pixels.

    Returns:
        The resized and cropped PIL image.
    """
    target_width, target_height = output_size
    target_aspect = target_width / target_height
    image_aspect = image.width / image.height

    if image_aspect > target_aspect:
        # Wider than the target: match the height, crop the surplus width.
        new_height = target_height
        # max() guards against float truncation producing a width that is
        # one pixel short of the target.
        new_width = max(target_width, int(new_height * image_aspect))
        left = (new_width - target_width) / 2
        right = (new_width + target_width) / 2
        top = 0
        bottom = target_height
    else:
        # Taller than (or same shape as) the target: match the width and
        # crop the surplus height.
        new_width = target_width
        new_height = max(target_height, int(new_width / image_aspect))
        left = 0
        right = target_width
        top = (new_height - target_height) / 2
        bottom = (new_height + target_height) / 2

    resized_image = image.resize((new_width, new_height), Image.LANCZOS)
    # The box may hold x.5 values when the surplus is odd; it is passed to
    # crop() unchanged, exactly as before.
    return resized_image.crop((left, top, right, bottom))


css = """
img, video {
    max-height: 400px;
    object-fit: contain;
}
"""

with gr.Blocks(css=css) as demo:
    image = gr.Image(label="Upload your image", type="pil")
    generate_btn = gr.Button("Generate")
    base64_out = gr.Textbox(label="Base64 Video")
    seed = gr.Slider(
        label="Seed",
        value=42,
        randomize=False,
        minimum=0,
        maximum=max_64_bit_int,
        step=1,
    )
    motion_bucket_id = gr.Slider(
        label="Motion bucket id",
        info="Controls how much motion to add/remove from the image",
        value=127,
        minimum=1,
        maximum=255,
    )
    fps_id = gr.Slider(
        label="Frames per second",
        info="The length of your video in seconds will be 25/fps",
        value=6,
        minimum=5,
        maximum=30,
    )

    # Exposed over the API as "run".
    generate_btn.click(
        fn=generate_video,
        inputs=[image, seed, motion_bucket_id, fps_id],
        outputs=base64_out,
        api_name="run",
    )

demo.launch()