# This is the Hugging Face Inference Endpoints handler file.
import numpy as np
import torchvision
from diffusers import AutoencoderKL, DDIMScheduler
from diffusers.utils.import_utils import is_xformers_available
from einops import rearrange
from omegaconf import OmegaConf
from transformers import CLIPTextModel, CLIPTokenizer

from animatediff.models.unet import UNet3DConditionModel
from animatediff.pipelines.pipeline_animation import AnimationPipeline
from animatediff.utils.util import load_weights


class EndpointHandler:
    def __init__(
        self,
        model_path: str = "models/StableDiffusion/",
        inference_config_path: str = "configs/inference/inference-v3.yaml",
        motion_module: str = "models/Motion_Module/mm_sd_v15.ckpt",
    ):
        inference_config = OmegaConf.load(inference_config_path)

        ### >>> create validation pipeline >>> ###
        tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer")
        text_encoder = CLIPTextModel.from_pretrained(model_path, subfolder="text_encoder")
        vae = AutoencoderKL.from_pretrained(model_path, subfolder="vae")
        unet = UNet3DConditionModel.from_pretrained_2d(
            model_path,
            subfolder="unet",
            unet_additional_kwargs=OmegaConf.to_container(inference_config.unet_additional_kwargs),
        )

        if is_xformers_available():
            unet.enable_xformers_memory_efficient_attention()
        else:
            raise RuntimeError("xformers is required for memory-efficient attention but is not available.")

        self.pipeline = AnimationPipeline(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            scheduler=DDIMScheduler(**OmegaConf.to_container(inference_config.noise_scheduler_kwargs)),
        ).to("cuda")

        self.pipeline = load_weights(
            self.pipeline,
            # motion module
            motion_module_path=motion_module,
            motion_module_lora_configs=[],
            # image layers
            dreambooth_model_path="",
            lora_model_path="",
            lora_alpha=0.8,
        ).to("cuda")

    def initialize(self, context):
        """Initialize the model. This will be called during model loading time."""

    def preprocess(self, data):
        """preprocess will be called once for each request."""

    def __call__(self, prompt, negative_prompt, steps, guidance_scale):
        """__call__ will be called once per request and runs inference."""
        vids = self.pipeline(
            prompt=prompt,
            negative_prompt=negative_prompt,
            num_inference_steps=steps,
            guidance_scale=guidance_scale,
            width=256,
            height=256,
            video_length=5,
        ).videos

        # (batch, channels, time, height, width) -> (time, batch, channels, height, width)
        videos = rearrange(vids, "b c t h w -> t b c h w")
        n_rows = 6
        rescale = False
        outputs = []
        for x in videos:
            # Tile this timestep's batch of frames into a single image grid.
            x = torchvision.utils.make_grid(x, nrow=n_rows)
            # (C, H, W) -> (H, W, C)
            x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
            if rescale:
                x = (x + 1.0) / 2.0  # -1,1 -> 0,1
            x = (x * 255).numpy().astype(np.uint8)
            outputs.append(x)

        # The frames could be encoded into a GIF here, e.g. with
        # imageio.mimsave(path, outputs, fps=1), but this handler
        # returns the raw frames as a list of uint8 arrays instead.
        return outputs
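

# A minimal local smoke test for the handler: a sketch, assuming the default
# model, config, and motion-module paths above exist on disk and a CUDA GPU is
# available. The prompt values below are illustrative, not part of the handler API.
if __name__ == "__main__":
    handler = EndpointHandler()
    frames = handler(
        prompt="a corgi running on the beach, highly detailed",
        negative_prompt="low quality, blurry",
        steps=25,
        guidance_scale=7.5,
    )
    # Each element of `frames` is an (H, W, 3) uint8 numpy array. To save a GIF,
    # imageio (assumed installed) can be used: imageio.mimsave("sample.gif", frames, fps=1)
    print(f"generated {len(frames)} frames of shape {frames[0].shape}")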