File size: 5,116 Bytes
bd63620
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import gradio as gr
from models import generate_t2v, generate_i2v
from utils import frames_to_video
import tempfile
import os

# Stable Video Diffusion XT supports max 32 frames.
MAX_FRAMES = 32 
# Upper bound for the FPS slider below.
MAX_FPS = 30
# NOTE(review): this is the duration of a MAX_FRAMES clip played at MAX_FPS,
# i.e. the *shortest* possible duration at the frame cap — lower FPS values
# produce longer clips. Used only for display in the description/accordion.
MAX_VIDEO_DURATION_SECONDS = MAX_FRAMES / MAX_FPS 

def handle_t2v(prompt: str, motion: int, frames: int, fps: int):
    """Handle text-to-video generation for the T2V tab.

    Args:
        prompt: Text description of the desired video.
        motion: Motion bucket ID controlling motion intensity (1-255).
        frames: Number of frames to generate.
        fps: Frames per second for the encoded output video.

    Returns:
        Filesystem path to the generated .mp4 file.

    Raises:
        gr.Error: If generation or encoding fails; the reserved temp
            file is removed before re-raising.
    """
    # Reserve a temp file path for the encoded video. delete=False so the
    # path survives after the handle closes and Gradio can serve the file.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
        temp_path = tmp_file.name

    try:
        frames_list = generate_t2v(prompt, motion, frames, fps)
        return frames_to_video(frames_list, fps, temp_path)
    except Exception as e:
        # Clean up the reserved temp file on error so failed runs don't
        # accumulate orphaned files on disk.
        if os.path.exists(temp_path):
            os.remove(temp_path)
        # Chain the cause so the original traceback is preserved in logs.
        raise gr.Error(f"Video generation failed: {e}") from e

def handle_i2v(input_image, motion: int, frames: int, fps: int):
    """Handle image-to-video generation for the I2V tab.

    Args:
        input_image: PIL image to animate (from the gr.Image component);
            may be None if the user did not upload anything.
        motion: Motion bucket ID controlling motion intensity (1-255).
        frames: Number of frames to generate.
        fps: Frames per second for the encoded output video.

    Returns:
        Filesystem path to the generated .mp4 file.

    Raises:
        gr.Error: If no image was provided, or if generation/encoding
            fails; any reserved temp file is removed before re-raising.
    """
    # Validate input BEFORE reserving a temp file — the original code
    # created the temp file first, leaking an orphaned .mp4 on every
    # missing-image error.
    if input_image is None:
        raise gr.Error("Please upload an image for Image-to-Video generation.")

    # Reserve a temp file path for the encoded video. delete=False so the
    # path survives after the handle closes and Gradio can serve the file.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
        temp_path = tmp_file.name

    try:
        frames_list = generate_i2v(input_image, motion, frames, fps)
        return frames_to_video(frames_list, fps, temp_path)
    except Exception as e:
        # Clean up the reserved temp file on error so failed runs don't
        # accumulate orphaned files on disk.
        if os.path.exists(temp_path):
            os.remove(temp_path)
        # Chain the cause so the original traceback is preserved in logs.
        raise gr.Error(f"Video generation failed: {e}") from e


# UI copy rendered as Markdown at the top of the app. These are runtime
# strings (title is also the browser tab title); the f-string interpolates
# the frame/duration constants so the docs stay in sync with the sliders.
title = "Sora-2 (Simulation) - Video Generation Demo"
description = f"""
# {title}
This application simulates the capabilities of large-scale video models like OpenAI's Sora, supporting Text-to-Video (T2V) and Image-to-Video (I2V) generation. We use cutting-edge open models Stable Video Diffusion (SVD-XT) and SDXL.

**🚨 IMPORTANT LIMITATION:** Due to the extreme computational demands and time constraints (2-minute videos are not feasible in this environment), we use **Stable Video Diffusion (SVD-XT)** which currently supports videos up to {MAX_FRAMES} frames (approx. {MAX_VIDEO_DURATION_SECONDS:.1f} seconds at max FPS).

## Modalities Implemented:
1. **Text-to-Video (T2V):** Uses Stable Diffusion XL (SDXL) to create a high-quality initial image, followed by Stable Video Diffusion (SVD) to add realistic motion.
2. **Image-to-Video (I2V):** Uses Stable Video Diffusion (SVD) to animate a static image.

Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
"""

# UI layout and event wiring. Component creation order determines on-screen
# order inside the Blocks context, so statements here are order-dependent.
with gr.Blocks(title=title) as demo:
    gr.Markdown(description)

    # Single shared output component; both tabs write their result here.
    output_video = gr.Video(label="Generated Video", height=400, autoplay=True)

    # Global Controls for both tabs
    with gr.Accordion(f"Settings (Max {MAX_VIDEO_DURATION_SECONDS:.1f}s)", open=True):
        # SVD's motion_bucket_id conditioning value (1-255).
        motion_slider = gr.Slider(
            minimum=1, 
            maximum=255, 
            value=127, 
            step=1, 
            label="Motion Intensity (Motion Bucket ID)", 
            info="Higher values produce more dynamic movement."
        )
        # Frame count, capped at the model's MAX_FRAMES limit.
        frames_slider = gr.Slider(
            minimum=14, 
            maximum=MAX_FRAMES, 
            value=14, 
            step=2, 
            label="Number of Frames",
            info=f"SVD-XT supports up to {MAX_FRAMES} frames."
        )
        # Playback FPS for the encoded video (does not add frames).
        fps_slider = gr.Slider(
            minimum=10, 
            maximum=MAX_FPS, 
            value=24, 
            step=1, 
            label="Frames Per Second (FPS)"
        )

    with gr.Tabs():
        with gr.TabItem("Text-to-Video (T2V)"):
            t2v_prompt = gr.Textbox(
                label="Prompt",
                placeholder="A majestic golden retriever wearing a tiny crown running through a field of glowing lavender.",
                value="A cozy cabin nestled in a snowy forest, steam rising from the chimney."
            )
            t2v_button = gr.Button("Generate T2V Video (Sora-2 / SDXL + SVD)", variant="primary")
            
            # Wire the button to the T2V handler; api_name exposes this
            # endpoint in the auto-generated API docs (show_api=True).
            t2v_button.click(
                handle_t2v,
                inputs=[t2v_prompt, motion_slider, frames_slider, fps_slider],
                outputs=output_video,
                api_name="t2v_generate"
            )

        with gr.TabItem("Image-to-Video (I2V)"):
            i2v_image = gr.Image(
                label="Input Image", 
                type="pil", 
                sources=["upload", "clipboard"],
                height=300
            )
            gr.Markdown("Note: SVD works best with 16:9 or 9:16 aspect ratio images (e.g., 1024x576). The image will be resized.")
            i2v_button = gr.Button("Generate I2V Video (Sora-2 / SVD)", variant="primary")

            # Wire the button to the I2V handler, sharing the same sliders
            # and output component as the T2V tab.
            i2v_button.click(
                handle_i2v,
                inputs=[i2v_image, motion_slider, frames_slider, fps_slider],
                outputs=output_video,
                api_name="i2v_generate"
            )

if __name__ == "__main__":
    # Enable request queuing (at most 20 pending jobs) before launching;
    # a single worker thread serializes the GPU-heavy generation calls.
    app = demo.queue(max_size=20)
    app.launch(max_threads=1, show_api=True)