import os
import time
from pathlib import Path
from loguru import logger
from datetime import datetime
import gradio as gr
import random
import spaces
import torch
import PIL.Image
from typing import List
from hyvideo.utils.file_utils import save_videos_grid
from hyvideo.utils.preprocess_text_encoder_tokenizer_utils import preprocess_text_encoder_tokenizer
from hyvideo.config import parse_args
from hyvideo.inference import HunyuanVideoSampler
from hyvideo.constants import NEGATIVE_PROMPT
from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
from diffusers.utils import export_to_video
from huggingface_hub import snapshot_download

# One-time checkpoint download and text-encoder preprocessing, kept for reference:
# if torch.cuda.device_count() > 0:
#     snapshot_download(repo_id="tencent/HunyuanVideo", repo_type="model", local_dir="ckpts", force_download=False)
#     snapshot_download(repo_id="xtuner/llava-llama-3-8b-v1_1-transformers", repo_type="model", local_dir="ckpts/llava-llama-3-8b-v1_1-transformers", force_download=True)
#
#     class Args:
#         def __init__(self, input_dir, output_dir):
#             self.input_dir = input_dir
#             self.output_dir = output_dir
#
#     # Create the object
#     args = Args("ckpts/llava-llama-3-8b-v1_1-transformers", "ckpts/text_encoder")
#     preprocess_text_encoder_tokenizer(args)
#     snapshot_download(repo_id="openai/clip-vit-large-patch14", repo_type="model", local_dir="ckpts/text_encoder_2", force_download=True)


def initialize_model():
    # Load the HunyuanVideo transformer and pipeline from the Hugging Face Hub.
    model_id = "hunyuanvideo-community/HunyuanVideo"
    transformer = HunyuanVideoTransformer3DModel.from_pretrained(
        model_id, subfolder="transformer", torch_dtype=torch.bfloat16
    )
    model = HunyuanVideoPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch.float16)
    # Tiled VAE decoding lowers peak VRAM usage when decoding video latents.
    model.vae.enable_tiling()
    if torch.cuda.device_count() > 0:
        model.to("cuda")
    return model


model = initialize_model()


def generate_video(
    prompt,
    resolution,
    video_length,
    seed,
    num_inference_steps,
    guidance_scale,
    flow_shift,
    embedded_guidance_scale
):
    print("generate_video (prompt: " + prompt + ")")
    return generate_video_gpu(
        model,
        prompt,
        resolution,
        video_length,
        seed,
        num_inference_steps,
        guidance_scale,
        flow_shift,
        embedded_guidance_scale
    )


@spaces.GPU(duration=120)
def generate_video_gpu(
    model,
    prompt,
    resolution,
    video_length,
    seed,
    num_inference_steps,
    guidance_scale,
    flow_shift,
    embedded_guidance_scale
):
    print("generate_video_gpu (prompt: " + prompt + ")")
    if torch.cuda.device_count() == 0:
        gr.Warning("Set this space to GPU config to make it work.")
        return None

    # A seed of -1 means "random": leave the generator unset so the pipeline picks one.
    seed = None if seed == -1 else seed
    width, height = resolution.split("x")
    width, height = int(width), int(height)
    negative_prompt = ""  # not applicable in the inference

    print("Predicting video...")
    generator = None
    if seed is not None:
        generator = torch.manual_seed(seed)

    frames: List[PIL.Image.Image] = model(
        prompt=prompt,
        height=height,
        width=width,
        num_frames=video_length,
        generator=generator,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale
    ).frames[0]

    # export_to_video writes the frames to an .mp4 file and returns its path.
    video_path = export_to_video(frames, fps=15)
    return video_path


def create_demo(model_path):
    with gr.Blocks() as demo:
        if torch.cuda.device_count() == 0:
            with gr.Row():
                gr.HTML("""

⚠️ To use Hunyuan Video, duplicate this space and set a GPU with 80 GB VRAM. You can't use Hunyuan Video directly here because this space runs on a CPU, which is not powerful enough for Hunyuan Video. Please provide feedback if you have issues.

""") gr.Markdown("# Hunyuan Video Generation") with gr.Row(): with gr.Column(): prompt = gr.Textbox(label="Prompt", value="A cat walks on the grass, realistic style.") with gr.Row(): resolution = gr.Dropdown( choices=[ # 720p ("1280x720 (16:9, 720p)", "1280x720"), ("720x1280 (9:16, 720p)", "720x1280"), ("1104x832 (4:3, 720p)", "1104x832"), ("832x1104 (3:4, 720p)", "832x1104"), ("960x960 (1:1, 720p)", "960x960"), # 540p ("960x544 (16:9, 540p)", "960x544"), ("544x960 (9:16, 540p)", "544x960"), ("832x624 (4:3, 540p)", "832x624"), ("624x832 (3:4, 540p)", "624x832"), ("720x720 (1:1, 540p)", "720x720"), ], value="832x624", label="Resolution" ) video_length = gr.Dropdown( label="Video Length", choices=[ ("2s(65f)", 65), ("5s(129f)", 129), ], value=65, ) num_inference_steps = gr.Slider(1, 100, value=5, step=1, label="Number of Inference Steps") with gr.Accordion("Advanced Options", open=False): with gr.Column(): seed = gr.Slider(label="Seed (-1 for random)", value=-1, minimum=-1, maximum=2**63 - 1, step=1) guidance_scale = gr.Slider(1.0, 20.0, value=1.0, step=0.5, label="Guidance Scale") flow_shift = gr.Slider(0.0, 10.0, value=7.0, step=0.1, label="Flow Shift") embedded_guidance_scale = gr.Slider(1.0, 20.0, value=6.0, step=0.5, label="Embedded Guidance Scale") generate_btn = gr.Button(value = "🚀 Generate Video", variant = "primary") with gr.Row(): output = gr.Video(label = "Generated Video", autoplay = True) gr.Markdown(""" ## **Alternatives** If you can't use _Hunyuan Video_, you can use _[CogVideoX](https://huggingface.co/spaces/THUDM/CogVideoX-5B-Space)_ or _[LTX Video Playground](https://huggingface.co/spaces/Lightricks/LTX-Video-Playground)_ instead. """) generate_btn.click( fn=generate_video, inputs=[ prompt, resolution, video_length, seed, num_inference_steps, guidance_scale, flow_shift, embedded_guidance_scale ], outputs=output ) return demo if __name__ == "__main__": os.environ["GRADIO_ANALYTICS_ENABLED"] = "False" demo = create_demo("ckpts") demo.queue(10).launch()