import gradio as gr
import numpy as np
import torch

from video_diffusion.stable_diffusion_video.stable_diffusion_pipeline import StableDiffusionWalkPipeline
from video_diffusion.utils.model_list import stable_model_list


class StableDiffusionText2VideoGenerator:
    def __init__(self):
        self.pipe = None

    def load_model(
        self,
        model_path,
    ):
        # Lazily load the pipeline once and cache it; later calls reuse
        # the first model loaded, regardless of model_path.
        if self.pipe is None:
            self.pipe = StableDiffusionWalkPipeline.from_pretrained(
                model_path,
                torch_dtype=torch.float16,
                revision="fp16",
            )

        self.pipe.to("cuda")
        self.pipe.enable_xformers_memory_efficient_attention()
        self.pipe.enable_attention_slicing()
        return self.pipe

    def generate_video(
        self,
        model_path: str,
        first_prompts: str,
        second_prompts: str,
        negative_prompt: str,
        num_interpolation_steps: int,
        guidance_scale: float,
        num_inference_step: int,
        height: int,
        width: int,
        upsample: bool,
        fps: int,
    ):
        # Draw a fresh random seed per prompt so each run interpolates
        # along a different path in latent space.
        first_seed = np.random.randint(0, 100000)
        second_seed = np.random.randint(0, 100000)
        seeds = [first_seed, second_seed]
        prompts = [first_prompts, second_prompts]
        pipe = self.load_model(model_path=model_path)

        # Walk the latent space between the two prompts, rendering
        # num_interpolation_steps frames into a video file.
        output_video = pipe.walk(
            prompts=prompts,
            num_interpolation_steps=int(num_interpolation_steps),
            height=height,
            width=width,
            guidance_scale=guidance_scale,
            num_inference_steps=num_inference_step,
            negative_prompt=negative_prompt,
            seeds=seeds,
            upsample=upsample,
            fps=fps,
        )

        return output_video


def app():
    with gr.Blocks():
        with gr.Row():
            with gr.Column():
                stable_text2video_first_prompt = gr.Textbox(
                    lines=1,
                    placeholder="First Prompt",
                    show_label=False,
                )
                stable_text2video_second_prompt = gr.Textbox(
                    lines=1,
                    placeholder="Second Prompt",
                    show_label=False,
                )
                stable_text2video_negative_prompt = gr.Textbox(
                    lines=1,
                    placeholder="Negative Prompt",
                    show_label=False,
                )
                with gr.Row():
                    with gr.Column():
                        stable_text2video_model_path = gr.Dropdown(
                            choices=stable_model_list,
                            label="Stable Model List",
                            value=stable_model_list[0],
                        )
                        stable_text2video_guidance_scale = gr.Slider(
                            minimum=0,
                            maximum=15,
                            step=0.5,
                            value=8.5,
                            label="Guidance Scale",
                        )
                        stable_text2video_num_inference_steps = gr.Slider(
                            minimum=1,
                            maximum=100,
                            step=1,
                            value=30,
                            label="Number of Inference Steps",
                        )
                        stable_text2video_fps = gr.Slider(
                            minimum=1,
                            maximum=60,
                            step=1,
                            value=10,
                            label="FPS",
                        )
                    with gr.Row():
                        with gr.Column():
                            stable_text2video_num_interpolation_steps = gr.Number(
                                value=10,
                                label="Number of Interpolation Steps",
                            )
                            stable_text2video_height = gr.Slider(
                                minimum=1,
                                maximum=1000,
                                step=1,
                                value=512,
                                label="Height",
                            )
                            stable_text2video_width = gr.Slider(
                                minimum=1,
                                maximum=1000,
                                step=1,
                                value=512,
                                label="Width",
                            )
                            stable_text2video_upsample = gr.Checkbox(
                                label="Upsample",
                                value=False,
                            )

                text2video_generate = gr.Button(value="Generate")

            with gr.Column():
                text2video_output = gr.Video(label="Output")

        # Wire the button to the generator; input order must match the
        # parameter order of generate_video.
        text2video_generate.click(
            fn=StableDiffusionText2VideoGenerator().generate_video,
            inputs=[
                stable_text2video_model_path,
                stable_text2video_first_prompt,
                stable_text2video_second_prompt,
                stable_text2video_negative_prompt,
                stable_text2video_num_interpolation_steps,
                stable_text2video_guidance_scale,
                stable_text2video_num_inference_steps,
                stable_text2video_height,
                stable_text2video_width,
                stable_text2video_upsample,
                stable_text2video_fps,
            ],
            outputs=text2video_output,
        )
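
# --- Usage sketch (not part of the original module) --------------------------
# A minimal, hypothetical way to try this tab standalone: mount app() inside a
# parent gr.Blocks context and launch it. In the full WebUI this function is
# presumably mounted as one tab among several; the launcher below is an
# illustrative assumption, not the repo's entry point.
if __name__ == "__main__":
    with gr.Blocks() as demo:
        app()  # registers the text2video components inside this Blocks
    demo.launch()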