import spaces
import torch
import gradio as gr
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video
from PIL import Image

# ────────────────────────────────────────────────────────────
# 1. Load & optimize the CogVideoX pipeline with CPU offload
# ────────────────────────────────────────────────────────────
pipe = CogVideoXPipeline.from_pretrained(
    "THUDM/CogVideoX1.5-5B",
    torch_dtype=torch.bfloat16,
)
pipe.enable_model_cpu_offload()  # shuttle submodules between CPU and GPU on demand
pipe.vae.enable_slicing()        # decode the VAE in slices to cut peak VRAM


# ────────────────────────────────────────────────────────────
# 2. Resolution parsing & sanitization
# ────────────────────────────────────────────────────────────
def make_divisible_by_8(x: int) -> int:
    """Round *x* down to the nearest multiple of 8."""
    return x - (x % 8)


def parse_resolution(res_str: str):
    """Map a label such as ``"480p"`` to a ``(height, width)`` pair.

    The height is taken directly from the label; the width targets a
    ~16:9 aspect ratio. Both values are floored to multiples of 8 so
    downstream video codecs are happy.
    """
    height = int(res_str.rstrip("p"))
    width = (height * 16) // 9  # truncation matches int(h * 16 / 9) for positive h
    return make_divisible_by_8(height), make_divisible_by_8(width)


# ────────────────────────────────────────────────────────────
# 3. GPU-decorated video generation function
# ────────────────────────────────────────────────────────────
@spaces.GPU(duration=180)  # allow up to 180 s of GPU time per call
def generate_video(
    prompt: str,
    steps: int,
    frames: int,
    fps: int,
    resolution: str
) -> str:
    """Generate an MP4 video from a text prompt.

    Args:
        prompt: Text description of the desired video.
        steps: Number of diffusion inference steps.
        frames: Total number of frames to generate.
            NOTE(review): CogVideoX checkpoints typically expect frame
            counts of the form 8k+1 (e.g. 49, 81, 161); the slider allows
            values outside that family which may fail inside the
            pipeline — confirm against the model card.
        fps: Frame rate of the exported MP4.
        resolution: Label such as "480p"; frames are resized to the
            matching ~16:9 size after generation.

    Returns:
        Filesystem path of the exported MP4.
    """
    import tempfile  # stdlib, used only here

    # 3.1 Target size requested by the user. The model still renders at
    # its native resolution; we resize afterwards.
    target_h, target_w = parse_resolution(resolution)

    # 3.2 Run the diffusion pipeline at native resolution.
    output = pipe(
        prompt=prompt,
        num_inference_steps=steps,
        num_frames=frames,
    )
    video_frames = output.frames[0]  # list of PIL Images at native size

    # 3.3 Resize frames to the user-specified resolution, skipping the
    # no-op case where a frame already matches the target size.
    resized_frames = [
        frame if frame.size == (target_w, target_h)
        else frame.resize((target_w, target_h), Image.LANCZOS)
        for frame in video_frames
    ]

    # 3.4 Export to MP4 at a unique temp path so concurrent requests
    # don't clobber each other (previously a fixed "generated.mp4" in
    # the CWD was shared by every caller).
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        out_path = tmp.name
    return export_to_video(resized_frames, out_path, fps=fps)


# ────────────────────────────────────────────────────────────
# 4. Build the Gradio interface with interactive controls
# ────────────────────────────────────────────────────────────
with gr.Blocks(title="Textual Imagination: A text to video synthesis") as demo:
    gr.Markdown(
        """
        # 🎞️ Textual Imagination: A text to video synthesis
        Generate videos from text prompts.
        Adjust inference steps, frame count, fps, and resolution below.
        """
    )
    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(
                label="Prompt",
                lines=2
            )
            steps_slider = gr.Slider(
                minimum=1, maximum=100, step=1, value=50,
                label="Inference Steps"
            )
            frames_slider = gr.Slider(
                minimum=16, maximum=320, step=1, value=161,
                label="Total Frames"
            )
            fps_slider = gr.Slider(
                minimum=1, maximum=60, step=1, value=16,
                label="Frames per Second (FPS)"
            )
            res_dropdown = gr.Dropdown(
                choices=["360p", "480p", "720p", "1080p"],
                value="480p",
                label="Resolution"
            )
            gen_button = gr.Button("Generate Video")
        with gr.Column():
            video_output = gr.Video(
                label="Generated Video",
                format="mp4"
            )

    # Wire the button to the generator; inputs are passed positionally
    # in the same order as generate_video's signature.
    gen_button.click(
        fn=generate_video,
        inputs=[prompt_input, steps_slider, frames_slider, fps_slider, res_dropdown],
        outputs=video_output
    )

# ────────────────────────────────────────────────────────────
# 5. Launch: disable SSR so Gradio blocks and stays alive
# ────────────────────────────────────────────────────────────
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        ssr_mode=False
    )