import gradio as gr
import clip
import numpy as np
import torch
import PIL.Image
from transformers import GPT2Tokenizer
from huggingface_hub import hf_hub_download

from model import ClipCaptionModel
from predict import generate2, generate_beam

CPU = torch.device('cpu')
device = "cpu"

# CLIP image encoder and GPT-2 tokenizer shared by all noise-level checkpoints
clip_model, preprocess = clip.load("RN50x4", device=device, jit=False)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Download one CapDec checkpoint per training noise level
model_0 = hf_hub_download('johko/capdec_0', 'model.pt')
model_001 = hf_hub_download('johko/capdec_001', 'model.pt')
model_005 = hf_hub_download('johko/capdec_005', 'model.pt')
model_015 = hf_hub_download('johko/capdec_015', 'model.pt')
model_025 = hf_hub_download('johko/capdec_025', 'model.pt')
model_05 = hf_hub_download('johko/capdec_05', 'model.pt')


def load_noise_level_model(noise_level: str) -> ClipCaptionModel:
    """Load the CapDec checkpoint trained with the selected noise level."""
    if noise_level == "0.0":
        model_path = model_0
    elif noise_level == "0.001":
        model_path = model_001
    elif noise_level == "0.005":
        model_path = model_005
    elif noise_level == "0.015":
        model_path = model_015
    elif noise_level == "0.025":
        model_path = model_025
    elif noise_level == "0.05":
        model_path = model_05
    else:
        raise ValueError("Unknown Noise Level")

    model = ClipCaptionModel()
    model.load_state_dict(torch.load(model_path, map_location=CPU))
    model = model.eval()
    model = model.to(device)
    return model


def infer(input_image: np.ndarray, noise_level: str):
    use_beam_search = True
    model = load_noise_level_model(noise_level)

    pil_image = PIL.Image.fromarray(input_image)
    image = preprocess(pil_image).unsqueeze(0).to(device)

    with torch.no_grad():
        # Encode the image with CLIP and project it to a prefix of GPT-2 embeddings
        prefix = clip_model.encode_image(image).to(device, dtype=torch.float32)
        prefix_embed = model.clip_project(prefix).reshape(1, 40, -1)

    if use_beam_search:
        generated_text_prefix = generate_beam(model, tokenizer, embed=prefix_embed)[0]
    else:
        generated_text_prefix = generate2(model, tokenizer, embed=prefix_embed)

    return input_image, generated_text_prefix


description = """This space is a demo for the paper [*Text-Only Training for Image Captioning using Noise-Injected CLIP*](https://arxiv.org/pdf/2211.00575.pdf) by David Nukrai, Ron Mokady and Amir Globerson.

The paper shows how to train an image captioning model using text data only, by injecting noise into the CLIP text embeddings during training. Each noise level corresponds to a separately trained checkpoint, which you can select and experiment with in this demo. The generated caption will change depending on the noise level you choose."""

dropdown = gr.components.Dropdown(
    ["0.0", "0.001", "0.005", "0.015", "0.025", "0.05"],
    value="0.015",
    label="Noise Level")
input_image = gr.components.Image(label="Input Image")
output_image = gr.components.Image(label="Image")
output_text = gr.components.Textbox(label="Generated Caption")

iface = gr.Interface(
    title="CapDec Image Captioning",
    description=description,
    fn=infer,
    inputs=[input_image, dropdown],
    outputs=[output_image, output_text],
    examples=[["examples/flickr_ex2.jpg", "0.015"],
              ["examples/web_ex3.jpeg", "0.015"]])

iface.launch()