import gradio as gr #import gradio.helpers import torch import os import base64 from glob import glob from pathlib import Path from typing import Optional from diffusers import StableVideoDiffusionPipeline from diffusers.utils import load_image, export_to_video from PIL import Image import uuid import random from huggingface_hub import login, hf_hub_download #gradio.helpers.CACHED_FOLDER = '/data/cache' SECRET_TOKEN = os.getenv('SECRET_TOKEN', 'default_secret') HF_API_KEY = os.getenv('HF_API_KEY', '') login(token=HF_API_KEY) pipe = StableVideoDiffusionPipeline.from_pretrained( "stabilityai/stable-video-diffusion-img2vid-xt-1-1", torch_dtype=torch.float16, variant="fp16" ) pipe.to("cuda") pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) #pipe.vae = torch.compile(pipe.vae, mode="reduce-overhead", fullgraph=True) max_64_bit_int = 2**63 - 1 def generate_video( secret_token: str, image: Image, seed: int, motion_bucket_id: int = 127, fps_id: int = 6, version: str = "svd_xt", cond_aug: float = 0.02, decoding_t: int = 3, # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary. device: str = "cuda", output_folder: str = "outputs", ): if secret_token != SECRET_TOKEN: raise gr.Error( f'Invalid secret token. Please fork the original space if you want to use it for yourself.') # note julian: normally we should resize input images, but normally they are already in 1024x576, so.. # also, I would like to experiment with vertical videos, and 1024x512 videos image = resize_image(image) if image.mode == "RGBA": image = image.convert("RGB") generator = torch.manual_seed(seed) os.makedirs(output_folder, exist_ok=True) base_count = len(glob(os.path.join(output_folder, "*.mp4"))) video_path = os.path.join(output_folder, f"{base_count:06d}.mp4") frames = pipe(image, decode_chunk_size=decoding_t, generator=generator, motion_bucket_id=motion_bucket_id, noise_aug_strength=0.1, num_frames=25).frames[0] export_to_video(frames, video_path, fps=fps_id) torch.manual_seed(seed) # Read the content of the video file and encode it to base64 with open(video_path, "rb") as video_file: video_base64 = base64.b64encode(video_file.read()).decode('utf-8') # Prepend the appropriate data URI header with MIME type video_data_uri = 'data:video/mp4;base64,' + video_base64 # clean-up (otherwise there is a risk of "ghosting", eg. someone seeing the previous generated video", # of one of the steps go wrong) os.remove(video_path) return video_data_uri def resize_image(image, output_size=(1024, 576)): # Calculate aspect ratios target_aspect = output_size[0] / output_size[1] # Aspect ratio of the desired size image_aspect = image.width / image.height # Aspect ratio of the original image # Resize then crop if the original image is larger if image_aspect > target_aspect: # Resize the image to match the target height, maintaining aspect ratio new_height = output_size[1] new_width = int(new_height * image_aspect) resized_image = image.resize((new_width, new_height), Image.LANCZOS) # Calculate coordinates for cropping left = (new_width - output_size[0]) / 2 top = 0 right = (new_width + output_size[0]) / 2 bottom = output_size[1] else: # Resize the image to match the target width, maintaining aspect ratio new_width = output_size[0] new_height = int(new_width / image_aspect) resized_image = image.resize((new_width, new_height), Image.LANCZOS) # Calculate coordinates for cropping left = 0 top = (new_height - output_size[1]) / 2 right = output_size[0] bottom = (new_height + output_size[1]) / 2 # Crop the image cropped_image = resized_image.crop((left, top, right, bottom)) return cropped_image with gr.Blocks() as demo: secret_token = gr.Text( label='Secret Token', max_lines=1, placeholder='Enter your secret token') gr.HTML("""
This space is a REST API to programmatically generate MP4 videos.
Interested in using it? Look no further than the original space!