File size: 5,116 Bytes
bd63620
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import gradio as gr
from models import generate_t2v, generate_i2v
from utils import frames_to_video
import tempfile
import os

# Stable Video Diffusion XT supports max 32 frames.
MAX_FRAMES = 32 
# Upper bound for the FPS slider below.
MAX_FPS = 30
# NOTE(review): this is the duration of a MAX_FRAMES clip played at MAX_FPS,
# i.e. the *shortest* possible duration at the frame cap — lower FPS values
# produce longer clips. Used only for display in the description/accordion.
MAX_VIDEO_DURATION_SECONDS = MAX_FRAMES / MAX_FPS 

def handle_t2v(prompt: str, motion: int, frames: int, fps: int):
    """Handle text-to-video generation for the T2V tab.

    Args:
        prompt: Text description of the desired video.
        motion: Motion bucket ID controlling motion intensity (1-255).
        frames: Number of frames to generate.
        fps: Frames per second for the encoded output video.

    Returns:
        Filesystem path to the generated .mp4 file.

    Raises:
        gr.Error: If generation or encoding fails; the reserved temp
            file is removed before re-raising.
    """
    # Reserve a temp file path for the encoded video. delete=False so the
    # path survives after the handle closes and Gradio can serve the file.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
        temp_path = tmp_file.name

    try:
        frames_list = generate_t2v(prompt, motion, frames, fps)
        return frames_to_video(frames_list, fps, temp_path)
    except Exception as e:
        # Clean up the reserved temp file on error so failed runs don't
        # accumulate orphaned files on disk.
        if os.path.exists(temp_path):
            os.remove(temp_path)
        # Chain the cause so the original traceback is preserved in logs.
        raise gr.Error(f"Video generation failed: {e}") from e

def handle_i2v(input_image, motion: int, frames: int, fps: int):
    """Handle image-to-video generation for the I2V tab.

    Args:
        input_image: PIL image to animate (from the gr.Image component);
            may be None if the user did not upload anything.
        motion: Motion bucket ID controlling motion intensity (1-255).
        frames: Number of frames to generate.
        fps: Frames per second for the encoded output video.

    Returns:
        Filesystem path to the generated .mp4 file.

    Raises:
        gr.Error: If no image was provided, or if generation/encoding
            fails; any reserved temp file is removed before re-raising.
    """
    # Validate input BEFORE reserving a temp file — the original code
    # created the temp file first, leaking an orphaned .mp4 on every
    # missing-image error.
    if input_image is None:
        raise gr.Error("Please upload an image for Image-to-Video generation.")

    # Reserve a temp file path for the encoded video. delete=False so the
    # path survives after the handle closes and Gradio can serve the file.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
        temp_path = tmp_file.name

    try:
        frames_list = generate_i2v(input_image, motion, frames, fps)
        return frames_to_video(frames_list, fps, temp_path)
    except Exception as e:
        # Clean up the reserved temp file on error so failed runs don't
        # accumulate orphaned files on disk.
        if os.path.exists(temp_path):
            os.remove(temp_path)
        # Chain the cause so the original traceback is preserved in logs.
        raise gr.Error(f"Video generation failed: {e}") from e


# UI copy rendered as Markdown at the top of the app. These are runtime
# strings (title is also the browser tab title); the f-string interpolates
# the frame/duration constants so the docs stay in sync with the sliders.
title = "Sora-2 (Simulation) - Video Generation Demo"
description = f"""
# {title}
This application simulates the capabilities of large-scale video models like OpenAI's Sora, supporting Text-to-Video (T2V) and Image-to-Video (I2V) generation. We use cutting-edge open models Stable Video Diffusion (SVD-XT) and SDXL.

**🚨 IMPORTANT LIMITATION:** Due to the extreme computational demands and time constraints (2-minute videos are not feasible in this environment), we use **Stable Video Diffusion (SVD-XT)** which currently supports videos up to {MAX_FRAMES} frames (approx. {MAX_VIDEO_DURATION_SECONDS:.1f} seconds at max FPS).

## Modalities Implemented:
1. **Text-to-Video (T2V):** Uses Stable Diffusion XL (SDXL) to create a high-quality initial image, followed by Stable Video Diffusion (SVD) to add realistic motion.
2. **Image-to-Video (I2V):** Uses Stable Video Diffusion (SVD) to animate a static image.

Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
"""

# UI layout and event wiring. Component creation order determines on-screen
# order inside the Blocks context, so statements here are order-dependent.
with gr.Blocks(title=title) as demo:
    gr.Markdown(description)

    # Single shared output component; both tabs write their result here.
    output_video = gr.Video(label="Generated Video", height=400, autoplay=True)

    # Global Controls for both tabs
    with gr.Accordion(f"Settings (Max {MAX_VIDEO_DURATION_SECONDS:.1f}s)", open=True):
        # SVD's motion_bucket_id conditioning value (1-255).
        motion_slider = gr.Slider(
            minimum=1, 
            maximum=255, 
            value=127, 
            step=1, 
            label="Motion Intensity (Motion Bucket ID)", 
            info="Higher values produce more dynamic movement."
        )
        # Frame count, capped at the model's MAX_FRAMES limit.
        frames_slider = gr.Slider(
            minimum=14, 
            maximum=MAX_FRAMES, 
            value=14, 
            step=2, 
            label="Number of Frames",
            info=f"SVD-XT supports up to {MAX_FRAMES} frames."
        )
        # Playback FPS for the encoded video (does not add frames).
        fps_slider = gr.Slider(
            minimum=10, 
            maximum=MAX_FPS, 
            value=24, 
            step=1, 
            label="Frames Per Second (FPS)"
        )

    with gr.Tabs():
        with gr.TabItem("Text-to-Video (T2V)"):
            t2v_prompt = gr.Textbox(
                label="Prompt",
                placeholder="A majestic golden retriever wearing a tiny crown running through a field of glowing lavender.",
                value="A cozy cabin nestled in a snowy forest, steam rising from the chimney."
            )
            t2v_button = gr.Button("Generate T2V Video (Sora-2 / SDXL + SVD)", variant="primary")
            
            # Wire the button to the T2V handler; api_name exposes this
            # endpoint in the auto-generated API docs (show_api=True).
            t2v_button.click(
                handle_t2v,
                inputs=[t2v_prompt, motion_slider, frames_slider, fps_slider],
                outputs=output_video,
                api_name="t2v_generate"
            )

        with gr.TabItem("Image-to-Video (I2V)"):
            i2v_image = gr.Image(
                label="Input Image", 
                type="pil", 
                sources=["upload", "clipboard"],
                height=300
            )
            gr.Markdown("Note: SVD works best with 16:9 or 9:16 aspect ratio images (e.g., 1024x576). The image will be resized.")
            i2v_button = gr.Button("Generate I2V Video (Sora-2 / SVD)", variant="primary")

            # Wire the button to the I2V handler, sharing the same sliders
            # and output component as the T2V tab.
            i2v_button.click(
                handle_i2v,
                inputs=[i2v_image, motion_slider, frames_slider, fps_slider],
                outputs=output_video,
                api_name="i2v_generate"
            )

if __name__ == "__main__":
    # Enable request queuing (at most 20 pending jobs) before launching;
    # a single worker thread serializes the GPU-heavy generation calls.
    app = demo.queue(max_size=20)
    app.launch(max_threads=1, show_api=True)