from inpaint_zoom.utils.zoom_in_utils import image_grid, shrink_and_paste_on_blank, dummy, write_video
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
from PIL import Image
import gradio as gr
import numpy as np
import torch
import os
# Expose only GPU 0 to CUDA for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# Inpainting checkpoints offered in the model dropdown; the first entry is
# the default selection.
stable_paint_model_list = [
    "stabilityai/stable-diffusion-2-inpainting",
    "runwayml/stable-diffusion-inpainting",
]

# Example prompts; the first entry pre-fills the prompt textbox.
stable_paint_prompt_list = [
    "children running in the forest , sunny, bright, by studio ghibli painting, superior quality, masterpiece,  traditional Japanese colors, by Grzegorz Rutkowski, concept art",
    "A beautiful landscape of a mountain range with a lake in the foreground",
]

# Example negative prompts; the first entry pre-fills the negative-prompt
# textbox.  ("lurry" was a typo for "blurry".)
stable_paint_negative_prompt_list = [
    "blurry, bad art, blurred, text, watermark",
]

class StableDiffusionZoomIn:
    """Render a zoom video by repeatedly outpainting with an inpainting pipeline.

    The instance caches one diffusers pipeline (``self.pipe``) together with
    the id it was loaded from (``self.model_id``) and a CUDA random generator
    (``self.g_cuda``).
    """

    def __init__(self):
        self.pipe = None      # cached diffusers pipeline, loaded lazily
        self.model_id = None  # model id that ``self.pipe`` was loaded from
        self.g_cuda = None    # CUDA torch.Generator, created with the pipeline

    def load_model(self, model_id):
        """Return a configured pipeline for ``model_id``, loading it if needed.

        The pipeline is cached per model id: it is (re)loaded only when no
        pipeline is cached yet or when a different ``model_id`` is requested.
        Previously the cache ignored ``model_id``, so switching models in the
        UI silently kept using the first one.

        Args:
            model_id: Hub id or path passed to ``DiffusionPipeline.from_pretrained``.

        Returns:
            The pipeline, moved to CUDA, using ``DPMSolverMultistepScheduler``,
            with attention slicing and xformers attention enabled.
        """
        if self.pipe is None or self.model_id != model_id:
            if self.pipe is not None:
                # Drop the previous pipeline first so two models do not have
                # to fit in GPU memory at the same time.
                self.pipe = None
                self.model_id = None
                torch.cuda.empty_cache()

            pipe = DiffusionPipeline.from_pretrained(
                model_id, torch_dtype=torch.float16, revision="fp16"
            )
            pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
            pipe = pipe.to("cuda")

            # NOTE(review): `dummy` comes from zoom_in_utils; presumably a
            # pass-through safety checker -- confirm against the helper.
            pipe.safety_checker = dummy
            pipe.enable_attention_slicing()
            pipe.enable_xformers_memory_efficient_attention()

            self.pipe = pipe
            self.model_id = model_id
            self.g_cuda = torch.Generator(device='cuda')

        return self.pipe

    @staticmethod
    def _mask_from_alpha(image):
        """Return an RGB mask that is the inverted alpha channel of ``image``."""
        alpha = np.array(image)[:, :, 3]
        return Image.fromarray(255 - alpha).convert("RGB")

    @staticmethod
    def _interpolate_frames(current_image, prev_image_fix, mask_width,
                            num_interpol_frames, width, height):
        """Build the in-between frames for one outpainting step.

        Each frame is a centre crop of ``current_image`` resized back to
        ``(width, height)``, with ``prev_image_fix`` pasted in the middle so
        the centre keeps the previous frame's resolution.
        """
        frames = []
        for j in range(num_interpol_frames - 1):
            interpol_width = round(
                (1 - (1 - 2 * mask_width / height) ** (1 - (j + 1) / num_interpol_frames))
                * height / 2
            )
            frame = current_image.crop((interpol_width,
                                        interpol_width,
                                        width - interpol_width,
                                        height - interpol_width))
            # PIL sizes are (width, height).
            frame = frame.resize((width, height))

            # Paste the higher-resolution previous image in the middle to
            # avoid the drop in quality caused by zooming.
            interpol_width2 = round(
                (1 - (height - 2 * mask_width) / (height - 2 * interpol_width)) / 2 * height
            )
            prev_crop = shrink_and_paste_on_blank(prev_image_fix, interpol_width2)
            frame.paste(prev_crop, mask=prev_crop)
            frames.append(frame)
        return frames

    def generate_video(
        self,
        model_id,
        prompt,
        negative_prompt,
        guidance_scale,
        num_inference_steps,
        *,
        num_outpainting_steps=20,
        mask_width=128,
        num_interpol_frames=30,
        ):
        """Generate the zoom video and return the path of the written file.

        An initial 512x512 image is produced by inpainting a fully masked
        canvas with a fixed seed.  Then, ``num_outpainting_steps`` times, the
        current image is passed through ``shrink_and_paste_on_blank`` with
        ``mask_width``, the region where the result's alpha is 0 is inpainted,
        and interpolated zoom frames are added between the two images.

        Args:
            model_id: Model to load via ``load_model``.
            prompt: Text prompt used for every pipeline call.
            negative_prompt: Negative prompt used for every pipeline call.
            guidance_scale: Guidance scale forwarded to the pipeline.
            num_inference_steps: Denoising steps per pipeline call.
            num_outpainting_steps: Number of outpainting iterations.
            mask_width: Border width, in pixels, passed to
                ``shrink_and_paste_on_blank`` at each step.
            num_interpol_frames: Frames per outpainting step (interpolated
                frames plus the outpainted image itself).

        Returns:
            ``"infinite_zoom_out.mp4"``, the path handed to ``write_video``.

        Raises:
            ValueError: If ``mask_width`` or ``num_interpol_frames`` is out of
                range.
        """
        num_init_images = 2
        seed = 9999
        height = 512
        width = height

        # Validate before the (expensive) model load.
        if not 0 < mask_width < height / 2:
            raise ValueError(
                f"mask_width must be between 0 and {height // 2} (exclusive), got {mask_width}"
            )
        if num_interpol_frames < 1:
            raise ValueError(
                f"num_interpol_frames must be at least 1, got {num_interpol_frames}"
            )
        # UI sliders may deliver whole numbers as floats.
        num_inference_steps = int(num_inference_steps)

        pipe = self.load_model(model_id)

        # Fully transparent canvas -> all-white mask, i.e. inpaint everything.
        blank = Image.new(mode="RGBA", size=(width, height))
        mask_image = self._mask_from_alpha(blank)
        current_image = blank.convert("RGB")

        init_images = pipe(
            prompt=[prompt] * num_init_images,
            negative_prompt=[negative_prompt] * num_init_images,
            image=current_image,
            guidance_scale=guidance_scale,
            height=height,
            width=width,
            generator=self.g_cuda.manual_seed(seed),
            mask_image=mask_image,
            num_inference_steps=num_inference_steps,
        )[0]

        # NOTE(review): the result is discarded; kept in case the helper has
        # side effects -- confirm and drop if it is a pure function.
        image_grid(init_images, rows=1, cols=num_init_images)

        # 1-based choice among the initial images, converted to an index.
        init_image_selected = 1
        selected_index = 0 if num_init_images == 1 else init_image_selected - 1

        current_image = init_images[selected_index]
        all_frames = [current_image]

        for step in range(num_outpainting_steps):
            print(f"Generating image: {step + 1} / {num_outpainting_steps}")

            prev_image_fix = current_image
            prev_image = shrink_and_paste_on_blank(current_image, mask_width)

            # Mask is white where prev_image is transparent, black elsewhere.
            mask_image = self._mask_from_alpha(prev_image)

            # Inpainting step.  No generator is passed on purpose: seeding
            # would make the run deterministic but the output less exciting.
            images = pipe(
                prompt=prompt,
                negative_prompt=negative_prompt,
                image=prev_image.convert("RGB"),
                guidance_scale=guidance_scale,
                height=height,
                width=width,
                mask_image=mask_image,
                num_inference_steps=num_inference_steps,
            )[0]
            current_image = images[0]
            # Put the previous image back over the result where it is opaque.
            current_image.paste(prev_image, mask=prev_image)

            # Interpolation steps between 2 inpainted images
            # (= sequential zoom and crop).
            all_frames.extend(
                self._interpolate_frames(
                    current_image, prev_image_fix, mask_width,
                    num_interpol_frames, width, height,
                )
            )
            all_frames.append(current_image)

        video_file_name = "infinite_zoom_out"
        fps = 30
        save_path = video_file_name + ".mp4"
        write_video(save_path, all_frames, fps)
        return save_path

    @staticmethod
    def app():
        """Build the Gradio widgets and wire the button to ``generate_video``.

        Declared as a static method so it can be called on the class or on an
        instance.  The ``gr.Blocks`` created here is not returned.
        """
        with gr.Blocks():
            with gr.Row():
                with gr.Column():
                    text2image_in_model_path = gr.Dropdown(
                        choices=stable_paint_model_list,
                        value=stable_paint_model_list[0],
                        label='Text-Image Model Id'
                    )

                    text2image_in_prompt = gr.Textbox(
                        lines=1,
                        value=stable_paint_prompt_list[0],
                        label='Prompt'
                    )

                    text2image_in_negative_prompt = gr.Textbox(
                        lines=1,
                        value=stable_paint_negative_prompt_list[0],
                        label='Negative Prompt'
                    )

                    with gr.Row():
                        with gr.Column():
                            text2image_in_guidance_scale = gr.Slider(
                                minimum=0.1,
                                maximum=15,
                                step=0.1,
                                value=7.5,
                                label='Guidance Scale'
                            )

                            text2image_in_num_inference_step = gr.Slider(
                                minimum=1,
                                maximum=100,
                                step=1,
                                value=50,
                                label='Num Inference Step'
                            )

                    text2image_in_predict = gr.Button(value='Generator')

                with gr.Column():
                    output_image = gr.Video(label='Output')

            # One instance backs the button so its pipeline cache is reused
            # across clicks.
            text2image_in_predict.click(
                fn=StableDiffusionZoomIn().generate_video,
                inputs=[
                    text2image_in_model_path,
                    text2image_in_prompt,
                    text2image_in_negative_prompt,
                    text2image_in_guidance_scale,
                    text2image_in_num_inference_step,
                ],
                outputs=output_image
            )