import gradio as gr
import fasttext

from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

import numpy as np
import pandas as pd
import torch


# Class-index <-> label-name lookup tables handed to the classification heads.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {name: index for index, name in id2label.items()}


# Heading and introductory text passed to the Gradio interface.
title = "Movie Review Score Discriminator"
# Built from adjacent literals inside parentheses rather than backslash
# continuations, so source indentation does not leak into the text. The last
# literal is single-quoted so the double quotes around Default are real
# characters (a doubled "" in Python just ends one literal and starts another,
# which silently drops the quotes).
description = (
    "It is a program that classifies whether it is positive or negative by entering movie reviews. "
    "You can choose between the Korean version and the English version. "
    'It also provides a version called "Default", which determines whether it is Korean or English and predicts it.'
)


class LanguageIdentification:
    """Thin wrapper around a pretrained fastText language-identification model."""

    def __init__(self, pretrained_lang_model="./lid.176.ftz"):
        """Load the fastText model stored at *pretrained_lang_model*.

        Args:
            pretrained_lang_model: Path of the model file handed to
                ``fasttext.load_model``.
        """
        self.model = fasttext.load_model(pretrained_lang_model)

    def predict_lang(self, text, k=200):
        """Return the model's prediction for *text*.

        Args:
            text: The text to classify. Newlines are replaced by spaces
                because fastText's ``predict`` only accepts a single line and
                raises ``ValueError`` otherwise.
            k: Maximum number of candidate labels to request from the model.

        Returns:
            Whatever ``self.model.predict`` returns; callers in this file
            index it as ``(labels, probabilities)``.
        """
        single_line = text.replace("\n", " ")
        return self.model.predict(single_line, k=k)

# Shared language detector; constructing it loads the fastText model file at import time.
LANGUAGE = LanguageIdentification()


def tokenized_data(tokenizer, inputs, max_length=64):
    """Encode a single text as a one-element batch of PyTorch tensors.

    Args:
        tokenizer: Object exposing ``batch_encode_plus`` (the tokenizers
            created in this file come from ``AutoTokenizer``).
        inputs: The text to encode; it is wrapped in a list so the tokenizer
            sees a batch of size one.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 64, the value previously hard-coded here.

    Returns:
        The value returned by ``tokenizer.batch_encode_plus``; callers in this
        file read its ``'input_ids'`` and ``'attention_mask'`` entries.
    """
    return tokenizer.batch_encode_plus(
        [inputs],
        return_tensors="pt",
        padding="max_length",
        max_length=max_length,
        truncation=True,
    )


# Sample inputs offered in the UI, read from a tab-separated file.
examples = []
df = pd.read_csv('examples.csv', sep='\t', index_col='Unnamed: 0')

# Seed the global NumPy RNG so the same five row positions (out of the first
# fifty) are drawn on every start-up.
np.random.seed(100)
idx = np.random.choice(50, size=5, replace=False)


def _sample_rows(tag, column):
    """Pair *tag* with the value found in *column* for each sampled row."""
    return [[tag, df.iloc[row, column]] for row in idx]


# Column 0 is tagged 'Eng' and column 1 is tagged 'Kor'.
eng_examples = _sample_rows('Eng', 0)
kor_examples = _sample_rows('Kor', 1)
examples = [*eng_examples, *kor_examples]


# English sentiment classifier: base architecture/tokenizer from the hub,
# weights from a local fine-tuned checkpoint named "<model>-<step>.pt".
eng_model_name = "roberta-base"
eng_step = 1900
eng_tokenizer = AutoTokenizer.from_pretrained(eng_model_name)
eng_file_name = f"{eng_model_name}-{eng_step}.pt"
# map_location="cpu" lets a checkpoint saved from a GPU load on a CPU-only
# host; inference below runs on CPU tensors anyway.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
eng_state_dict = torch.load(eng_file_name, map_location="cpu")
eng_model = AutoModelForSequenceClassification.from_pretrained(
    eng_model_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
    state_dict=eng_state_dict,
)


# Korean sentiment classifier: base architecture/tokenizer from the hub,
# weights from a local fine-tuned checkpoint. The "/" in the hub name is
# replaced by "_" so the checkpoint name is a plain file name.
kor_model_name = "klue/roberta-small"
kor_step = 2400
kor_tokenizer = AutoTokenizer.from_pretrained(kor_model_name)
kor_file_name = f"{kor_model_name.replace('/', '_')}-{kor_step}.pt"
# map_location="cpu" lets a checkpoint saved from a GPU load on a CPU-only
# host; inference below runs on CPU tensors anyway.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
kor_state_dict = torch.load(kor_file_name, map_location="cpu")
kor_model = AutoModelForSequenceClassification.from_pretrained(
    kor_model_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
    state_dict=kor_state_dict,
)


def _detect_language(text):
    """Decide between Korean and English for *text* with the fastText detector.

    Returns:
        ``(lang, percent_kor, percent_eng)`` where ``lang`` is ``'Kor'`` or
        ``'Eng'`` (whichever label scored higher) and the two shares are the
        Korean/English probabilities normalised to sum to 1.

    Raises:
        ValueError: If neither the Korean nor the English label received any
            probability, so no share can be computed.
    """
    # fastText rejects input containing a newline, so flatten the text first.
    pred = LANGUAGE.predict_lang(text.replace('\n', ' '))
    scores = dict(zip(pred[0], pred[1]))
    # Either label may be missing from the returned candidates; treat a
    # missing label as probability 0 instead of leaving the name unbound.
    p_kor = float(scores.get('__label__ko', 0.0))
    p_eng = float(scores.get('__label__en', 0.0))
    total = p_kor + p_eng
    if total <= 0:
        raise ValueError("Could not identify the text as Korean or English.")
    lang = 'Kor' if p_kor > p_eng else 'Eng'
    return lang, p_kor / total, p_eng / total


def _class_probabilities(model, tokenizer, text):
    """Run *model* on *text* and return the softmax over its output logits."""
    encoded = tokenized_data(tokenizer, text)
    with torch.no_grad():
        logits = model(input_ids=encoded['input_ids'],
                       attention_mask=encoded['attention_mask']).logits
    return torch.softmax(logits, dim=1)


def _impact_tag(p_positive):
    """Map a positive-class probability to a highlight tag (or ``None``)."""
    if p_positive > 0.99:
        return '+++'
    if p_positive > 0.9:
        return '++'
    if p_positive > 0.8:
        return '+'
    if p_positive < 0.01:
        return '---'
    if p_positive < 0.1:
        return '--'
    if p_positive < 0.2:
        return '-'
    return None


def builder(Lang, Text):
    """Classify a movie review and explain the result word by word.

    Args:
        Lang: ``'Eng'``, ``'Kor'``, or ``'Default'``. ``'Default'`` (or
            ``None``, i.e. nothing selected) detects the language with
            fastText and picks the more probable of Korean and English.
        Text: The review text.

    Returns:
        A three-element list, in order:
        1. ``{'Kor': share, 'Eng': share}`` - the language shares (1/0 when
           the language was chosen explicitly).
        2. ``{'POSITIVE': p, 'NEGATIVE': p}`` - class probabilities for the
           whole review.
        3. A list of ``(word, tag)`` pairs, where ``tag`` is one of
           ``'+++'``, ``'++'``, ``'+'``, ``'---'``, ``'--'``, ``'-'`` or
           ``None``, obtained by classifying each space-separated word alone.

    Raises:
        ValueError: If ``Lang`` is not a supported choice, or if language
            detection finds neither Korean nor English.
    """
    if Text is None:
        Text = ''
    text_list = Text.split(' ')

    # [ output_1 ] language shares
    if Lang is None or Lang == 'Default':
        Lang, percent_kor, percent_eng = _detect_language(Text)
    elif Lang == 'Eng':
        percent_kor, percent_eng = 0, 1
    elif Lang == 'Kor':
        percent_kor, percent_eng = 1, 0
    else:
        raise ValueError("Unsupported language choice: {!r}".format(Lang))

    if Lang == 'Eng':
        model, tokenizer = eng_model, eng_tokenizer
    else:
        model, tokenizer = kor_model, kor_tokenizer

    # [ output_2 ] sentiment of the whole review
    model.eval()
    output = _class_probabilities(model, tokenizer, Text)

    # [ output_3 ] score every word on its own to show which ones lean
    # strongly positive or negative
    output_analysis = []
    for word in text_list:
        word_output = _class_probabilities(model, tokenizer, word)
        output_analysis.append((word, _impact_tag(word_output[0][1])))

    return [
        {'Kor': percent_kor, 'Eng': percent_eng},
        {id2label[1]: output[0][1].item(), id2label[0]: output[0][0].item()},
        output_analysis,
    ]


# demo3 = gr.Interface.load("models/mdj1412/movie_review_score_discriminator_eng", inputs="text", outputs="text", 
#                          title=title, theme="peach",
#                          allow_flagging="auto",
#                          description=description, examples=examples)


# Single-call Interface variant of the app: a language dropdown and a review
# text box feed `builder`, whose three results fill the three outputs.
_demo_color_map = {
    "+++": "#CF0000", "++": "#FF3232", "+": "#FFD4D4",
    "---": "#0004FE", "--": "#4C47FF", "-": "#BEBDFF",
}
_demo_inputs = [
    gr.inputs.Dropdown(['Default', 'Eng', 'Kor']),
    gr.Textbox(placeholder="리뷰를 입력하시오."),
]
_demo_outputs = [
    gr.Label(num_top_classes=3, label='Lang'),
    gr.Label(num_top_classes=2, label='Result'),
    gr.HighlightedText(label="Analysis", combine_adjacent=False).style(color_map=_demo_color_map),
]
demo = gr.Interface(
    builder,
    inputs=_demo_inputs,
    outputs=_demo_outputs,
    title=title,
    description=description,
    examples=examples,
)

def fn2(a, b):
    """Return a list of three ``None`` values; both arguments are ignored."""
    return [None] * 3

# Blocks variant of the app: explanatory text on top, then inputs on the left
# and the three outputs on the right.
with gr.Blocks() as demo1:
    gr.Markdown(
    """
    <h1 align="center">
    Movie Review Score Discriminator
    </h1>
    """)

    gr.Markdown(
    """
    해당 사이트에서는 영화 리뷰를 입력했을 때, 긍정적인 리뷰인지 부정적인 리뷰인지 판별 해준다.
    """)


    # Side-by-side explanations of the input and output panels.
    with gr.Row():
        gr.Markdown(
        """
        ### 입력(Lang, Text)에 대한 설명
        가운데를 기준으로 왼쪽에 있는 칸들은 입력에 해당한다.  
        * ‘Lang’은 자신이 입력할 언어를 선택하는 것이다. 기입이 없을 경우(Default)에는 한국어인지 영어인지 오른쪽 박스 ‘Lang’에서 판단 해준다.  
        * ‘Text’는 영화 리뷰를 입력하는 곳이다. 입력을 마치고 제출하기 버튼을 누르면 결과를 분석할 수 있다.
        """)
        gr.Markdown(
        """
        ### 출력(Lang, Result, Analysis)에 대한 설명
        가운데를 기준으로 오른쪽에 있는 칸들은 출력에 해당한다. 
        * ‘Lang’은 왼쪽 입력에서 언어를 선택할 때, 한국어를 선택했으면 100% Kor, 영어를 선택했다면 100% Eng 이다. 
            만약 Default를 선택했다면 한국어인지 영어인지 판단 해준다. 
            이 방법은 [해당 사이트](https://medium.com/@c.chaitanya/language-identification-in-python-using-fasttext-60359dc30ed0)에 있는 패키지를 사용했다.
        * ‘Result’는 두 모델(Kor, Eng)을 이용해서 긍정 리뷰 또는 부정 리뷰를 판단했다. (모델에 대해 더 알고 싶으면 밑에서 확인하길 바란다.)
        * ‘Analysis’는 입력한 리뷰의 긍정 또는 부정을 결정될 때, 어떤 단어에 의해서 결정 되었는지 강조 해준다. 
            즉, 영향을 준 단어에 대해서는 얼마나 긍정적인(또는 부정적인) 영향을 주었는지 강조되어 있다. 
            ( 긍정인 경우 붉은색(+++, ++, +), 부정인 경우 파란색(---,--,-) )
        """)

    # NOTE(review): this table lists klue/roberta-base and bert-base-uncased
    # with 2800/2000 steps, while this file loads klue/roberta-small (step
    # 2400) and roberta-base (step 1900) - confirm which is current.
    with gr.Accordion("모델에 대한 설명"):
        gr.Markdown(
        """
        ' | Kor | Eng 
        :---:|:---:|:---:
        Model Name | klue/roberta-base | bert-base-uncased
        Learning Rate | 3e-05 | 5e-5
        Batch Size Train | 64 | 64
        Steps | 2800 | 2000
        Batch Size Test | 4 | 4
        Validation Accuracy | 93.55% | 95.81%
        Test Accuracy | 94.0% | 92.8%
        """)

    with gr.Row():
        # Left column: language choice, review text, and the two buttons.
        with gr.Column():
            inputs_1 = gr.inputs.Dropdown(['Default', 'Eng', 'Kor'], label='Lang')
            inputs_2 = gr.Textbox(placeholder="리뷰를 입력하시오.", label='Text')
            with gr.Row():
                btn2 = gr.Button("클리어")
                btn = gr.Button("제출하기")
        # Right column: detected language, sentiment, per-word highlights.
        with gr.Column():
            output_1 = gr.Label(num_top_classes=3, label='Lang')
            output_2 = gr.Label(num_top_classes=2, label='Result')
            output_3 = gr.HighlightedText(label="Analysis", combine_adjacent=False) \
                .style(color_map={"+++": "#CF0000", "++": "#FF3232", "+": "#FFD4D4", "---": "#0004FE", "--": "#4C47FF", "-": "#BEBDFF"})

    # Clear button: fn2 takes two positional arguments and returns three
    # Nones, one per output component, which empties the result panels.
    btn2.click(fn=fn2, inputs=[inputs_1, inputs_2], outputs=[output_1, output_2, output_3])
    # Submit button: run the classifier and fill the three outputs.
    btn.click(fn=builder, inputs=[inputs_1, inputs_2], outputs=[output_1, output_2, output_3])
    gr.Examples(examples, inputs=[inputs_1, inputs_2])
    

if __name__ == "__main__":
    # Start the Blocks UI (demo1). The Interface variant `demo` is also built
    # in this file but is not launched; swap the call below to try it.
    # print(examples)
    # demo.launch()
    demo1.launch()