Here's a complete, runnable Gradio application simulating OpenAI's Sora-2 model capabilities using open-source models (Stable Video Diffusion XT for image-to-video generation and SDXL-Turbo for the initial text-to-image step) with mandatory ZeroGPU Ahead-of-Time (AoT) compilation.

The application is structured into multiple files for clarity and maintainability.

**Important Notes:**

1. **Sora-2 Simulation**: OpenAI's Sora-2 is a proprietary model that is not publicly available. This application instead uses `stabilityai/stable-video-diffusion-img2vid-xt` (SVD-XT) and `stabilityai/sdxl-turbo`.
2. **Video Duration**: SVD-XT generates only short clips (at most 25 frames, roughly 3 seconds at 8 FPS). The UI includes a `num_frames` slider but states explicitly that the actual output duration is limited by SVD-XT; it cannot replicate Sora's announced long-video capabilities. This manages user expectations while still representing the input structure of a long-form generator.
3. **ZeroGPU AoT Compilation**: AoT compilation is implemented for both `pipe_svd.unet` and `pipe_t2i.unet` using `spaces.aoti_capture` and `torch.export.export` for optimal performance on Hugging Face Spaces (see the sketch after this list). Dynamic shapes are enabled for the `num_frames` dimension of SVD's UNet input.
4. **"Built with anycoder"**: Included in the main application markdown.
---
=== config.py ===

```python
import os

# Model IDs
SVD_MODEL_ID = "stabilityai/stable-video-diffusion-img2vid-xt"
T2I_MODEL_ID = "stabilityai/sdxl-turbo"

# Output Directory
OUTPUT_DIR = "generated_videos"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# SVD-XT Specifics
SVD_MAX_FRAMES = 25
SVD_DEFAULT_FPS = 8
SVD_MIN_MOTION_BUCKET_ID = 0
SVD_MAX_MOTION_BUCKET_ID = 255
SVD_DEFAULT_MOTION_BUCKET_ID = 127
SVD_MIN_NOISE_AUG_STRENGTH = 0.0
SVD_MAX_NOISE_AUG_STRENGTH = 0.1
SVD_DEFAULT_NOISE_AUG_STRENGTH = 0.02

# AoT Compilation Specifics (seconds)
AOT_DURATION_COMPILE = 1500  # Max duration for startup compilation
AOT_DURATION_INFERENCE = 120  # Max duration for inference

# T2I Specifics (SDXL-Turbo)
T2I_DEFAULT_HEIGHT = 512
T2I_DEFAULT_WIDTH = 512
```
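
The min/default/max triples above are meant to drive the UI controls directly. A hypothetical wiring in the Gradio layer (the slider names and labels here are illustrative, not part of the files shown):

```python
import gradio as gr

from config import (
    SVD_MAX_FRAMES, SVD_DEFAULT_FPS,
    SVD_MIN_MOTION_BUCKET_ID, SVD_MAX_MOTION_BUCKET_ID, SVD_DEFAULT_MOTION_BUCKET_ID,
)

# Motion intensity maps to SVD's motion_bucket_id conditioning value
motion_slider = gr.Slider(
    minimum=SVD_MIN_MOTION_BUCKET_ID, maximum=SVD_MAX_MOTION_BUCKET_ID,
    value=SVD_DEFAULT_MOTION_BUCKET_ID, step=1, label="Motion intensity",
)
# 25 frames at 8 FPS is roughly a 3-second clip (25 / 8 ≈ 3.1 s)
frames_slider = gr.Slider(
    minimum=2, maximum=SVD_MAX_FRAMES, value=SVD_MAX_FRAMES, step=1,
    label=f"Frames (max ≈ {SVD_MAX_FRAMES / SVD_DEFAULT_FPS:.1f}s at {SVD_DEFAULT_FPS} FPS)",
)
```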
=== models.py ===

```python
import spaces
import torch
from diffusers import StableVideoDiffusionPipeline, AutoPipelineForText2Image
from diffusers.utils import export_to_video
from PIL import Image
import numpy as np
import time
import math
from torch.utils._pytree import tree_map

from config import (
    SVD_MODEL_ID, T2I_MODEL_ID, OUTPUT_DIR, SVD_MAX_FRAMES,
    T2I_DEFAULT_HEIGHT, T2I_DEFAULT_WIDTH,
    AOT_DURATION_COMPILE, AOT_DURATION_INFERENCE
)

# --- Model Loading ---
print("Loading Stable Video Diffusion model...")
pipe_svd = StableVideoDiffusionPipeline.from_pretrained(
    SVD_MODEL_ID, torch_dtype=torch.float16, variant="fp16"
)
pipe_svd.to("cuda")
print("SVD model loaded.")

print("Loading SDXL-Turbo Text-to-Image model...")
pipe_t2i = AutoPipelineForText2Image.from_pretrained(
    T2I_MODEL_ID, torch_dtype=torch.float16, variant="fp16"
)
pipe_t2i.to("cuda")
print("SDXL-Turbo model loaded.")
# --- AoT Compilation Functions ---
@spaces.GPU(duration=AOT_DURATION_COMPILE)
def compile_svd_unet():
    """
    Compiles the Stable Video Diffusion (SVD) UNet with Ahead-of-Time (AoT) optimization.
    Enables dynamic shapes for the number of frames.
    """
    print("Compiling SVD UNet with AoT (dynamic shapes for num_frames)...")
    # Typical SVD-XT call parameters, used only to build example UNet inputs for capture
    height, width = 576, 1024  # Recommended resolution for SVD-XT
    num_frames_min, num_frames_max = 2, SVD_MAX_FRAMES  # min=2 avoids torch.export's 0/1 specialization
    fps = 8
    motion_bucket_id = 127
    noise_aug_strength = 0.02
    # Create a dummy input image for image encoding
    input_image = Image.new("RGB", (width, height), color="blue")
    # --- Reproduce the SVD pipeline's pre-processing to obtain realistic UNet inputs ---
    with torch.no_grad():
        # CLIP image embeddings act as encoder_hidden_states for the SVD UNet
        image_embeddings = pipe_svd._encode_image(
            input_image, device="cuda", num_videos_per_prompt=1, do_classifier_free_guidance=False
        )
        # SVD conditions on (fps, motion_bucket_id, noise_aug_strength) via added_time_ids
        added_time_ids = pipe_svd._get_add_time_ids(
            fps, motion_bucket_id, noise_aug_strength,
            pipe_svd.unet.dtype, batch_size=1, num_videos_per_prompt=1,
            do_classifier_free_guidance=False,
        ).to("cuda")
        pipe_svd.scheduler.set_timesteps(num_inference_steps=25, device="cuda")
        timestep = pipe_svd.scheduler.timesteps[0]  # The first timestep suffices for capture
        # Use a mid-range num_frames for the example latents; this dim is exported as dynamic
        example_num_frames = (num_frames_min + num_frames_max) // 2
        # SVD UNet input layout is (batch, num_frames, channels, height, width);
        # unet.config.in_channels already accounts for the concatenated image latents
        latents = torch.randn(
            1,
            example_num_frames,
            pipe_svd.unet.config.in_channels,
            height // pipe_svd.vae_scale_factor,
            width // pipe_svd.vae_scale_factor,
            device="cuda",
            dtype=pipe_svd.unet.dtype,
        )
    # Capture the UNet call; all inputs are passed as kwargs so dynamic_shapes
    # can be specified as a dict keyed by parameter name
    with spaces.aoti_capture(pipe_svd.unet) as call:
        _ = pipe_svd.unet(
            sample=latents,
            timestep=timestep,
            encoder_hidden_states=image_embeddings,
            added_time_ids=added_time_ids,
            return_dict=False,
        )
    # Mark the frames dimension (dim 1 of `sample`) as dynamic
    frames_dim = torch.export.Dim("num_frames", min=num_frames_min, max=num_frames_max)
    dynamic_shapes = tree_map(lambda v: None, call.kwargs)
    dynamic_shapes["sample"] = (None, frames_dim, None, None, None)  # (batch, num_frames, channels, h, w)
    exported_svd_unet = torch.export.export(
        pipe_svd.unet,
        args=call.args,
        kwargs=call.kwargs,
        dynamic_shapes=dynamic_shapes,
    )
    print("SVD UNet exported with dynamic shapes. Compiling...")
    return spaces.aoti_compile(exported_svd_unet)
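
# A minimal usage sketch (assumed startup wiring, mirroring the ZeroGPU AoT docs):
# the compiled artifact returned above is bound back onto the live pipeline with
# `spaces.aoti_apply`, e.g. at module import time:
#
#   compiled_svd_unet = compile_svd_unet()
#   spaces.aoti_apply(compiled_svd_unet, pipe_svd.unet)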
@spaces.GPU(duration=AOT_DURATION_COMPILE)
def compile_t2i_unet():
    """
    Compiles the SDXL-Turbo Text-to-Image UNet with Ahead-of-Time (AoT) optimization.
    """
    print("Compiling SDXL-Turbo UNet with AoT...")
    # Example prompt and fixed resolution for SDXL-Turbo
    prompt = "A dog wearing a hat, high quality"
    height, width = T2I_DEFAULT_HEIGHT, T2I_DEFAULT_WIDTH
    with torch.no_grad():
        # Encode the prompt to obtain text embeddings
        text_encoder_output = pipe_t2i.encode_prompt(
            prompt,
            device="cuda",
            num_images_per_prompt=1,