import streamlit as st
from transformers import AutoModelForSequenceClassification
import pandas as pd
import numpy as np
import torch
import pickle
import wikipedia

from preprocess import Preprocess
from utility import Utility

# ---------------------------------------------------------------------------
# Page header and user inputs
# ---------------------------------------------------------------------------
st.title("Movie Genre Predictor")

st.subheader("Enter the text you'd like to analyze.")
text = st.text_input('Enter plot of the movie')
wiki_url = st.text_input("Enter wikipedia url of the movie (Needed for fetching the cast information)")

# ---------------------------------------------------------------------------
# Model loading
# ---------------------------------------------------------------------------
# NOTE(review): everything below runs each time this script is executed;
# if the hosting framework re-runs the script per interaction, consider
# caching these loads -- confirm against the deployed Streamlit version.

# Fine-tuned sequence-classification checkpoint used as the plot-based model.
model = AutoModelForSequenceClassification.from_pretrained("./checkpoint-36819")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)


def _load_pickle(path):
    """Return the object unpickled from *path*, closing the file afterwards.

    SECURITY: ``pickle.load`` can execute arbitrary code while loading, so
    only files shipped with the application may be passed here.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Cast-based model, the binarizer that encodes cast ids for it, and the
# column names it was built with.
lr_model = _load_pickle("models/cast_plot_lr")
cast_mlb = _load_pickle("models/cast_mlb")
column_names = _load_pickle("models/column_names")
# NOTE(review): presumably the first 11 columns are non-actor columns and the
# remainder are actor ids -- confirm against the training pipeline.
top_actors = list(column_names)[11:]

# Model that combines the outputs of the two models above.
meta_model = _load_pickle("models/meta_model")

utility = Utility()
preprocess = Preprocess()

# Label order used when tokenizing for / reading the output of the
# plot-based classifier.
_MODEL_LABELS = ("Action", "Drama", "Romance", "Comedy", "Thriller")
# Label order used for the features fed to the meta model and for the
# final output.
_OUTPUT_LABELS = ("Action", "Comedy", "Drama", "Romance", "Thriller")


def _page_id_or_none(title):
    """Return the Wikipedia page id for *title*, or None if the lookup fails."""
    try:
        return wikipedia.page(title=title).pageid
    except Exception:
        # Best-effort lookup: missing/ambiguous pages and network errors are
        # all treated as "not found".
        return None


def _resolve_page_id(actor):
    """Return a page id for *actor*, trying the name and then two search hits.

    Returns None when neither the direct lookup nor either of the first two
    search results resolves to a page.
    """
    page_id = _page_id_or_none(actor)
    if page_id is not None:
        return page_id
    try:
        candidates = wikipedia.search(actor, results=2)
    except Exception:
        return None
    for candidate in candidates:
        page_id = _page_id_or_none(candidate)
        if page_id is not None:
            return page_id
    return None


def _fetch_cast_page_ids(movie_url, limit=5):
    """Return page ids for the first *limit* names in the movie's "Cast" section.

    The page title is taken from the last path segment of *movie_url*.
    Returns an empty list (after showing a warning) when the page cannot be
    loaded or has no "Cast" section; actors that cannot be resolved are
    skipped.
    """
    title = movie_url.strip().rstrip("/").split("/")[-1].replace("_", " ")
    try:
        cast_section = wikipedia.page(title=title, auto_suggest=False).section("Cast")
    except Exception as err:
        st.warning(f"Could not load the Wikipedia page for {title!r}: {err}")
        return []
    if not cast_section:
        # section() yields None when the page has no such heading.
        st.warning(f"No 'Cast' section found on the Wikipedia page for {title!r}.")
        return []

    # Each line is expected to look like "<actor> as <role>"; blank lines are
    # dropped so they do not use up the top-`limit` slots.
    actor_names = [
        line.split(" as ")[0].strip()
        for line in cast_section.split("\n")
        if line.strip()
    ]
    page_ids = []
    for actor in actor_names[:limit]:
        page_id = _resolve_page_id(actor)
        if page_id is not None:
            page_ids.append(page_id)
    return page_ids


if st.button("Predict"):
    # Cast information is optional: without a URL the cast feature list is empty.
    cast = _fetch_cast_page_ids(wiki_url) if wiki_url.strip() else []

    st.write("Wiki Ids of Top 5 Cast:", cast)
    st.write("Genre: ")

    clean_plot = preprocess.apply(text)

    # Base Model 1: DistilBERT
    # Only the tokenized plot is needed from the helper's four return values.
    _, _, _, tokenized_plot = utility.tokenize(clean_plot, list(_MODEL_LABELS))
    # Add a leading batch dimension and place the inputs on the same device
    # as the model; otherwise the forward pass fails on a CUDA host.
    input_ids = torch.as_tensor(
        np.asarray(tokenized_plot['input_ids'])[np.newaxis],
        dtype=torch.long,
        device=device,
    )
    attention_mask = torch.as_tensor(
        np.asarray(tokenized_plot['attention_mask'])[np.newaxis],
        dtype=torch.long,
        device=device,
    )

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        y_pred = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = y_pred['logits'][0]
    model_order_probs = torch.sigmoid(logits.squeeze().float()).cpu()

    # Base model 2: LR One Vs All
    # NOTE(review): membership is tested on the raw page id while the feature
    # is passed as str(id) -- confirm the element type of `top_actors`.
    cast_features = [str(actor) for actor in cast if actor in top_actors]
    lr_model_pred = np.asarray(
        lr_model.predict_proba(cast_mlb.transform([cast_features]))
    )[0]

    # Concatenating Outputs of base models
    # Reorder from _MODEL_LABELS order to _OUTPUT_LABELS order. Indexing with
    # a list copies the values; swapping through single-element tensor
    # indices would alias the tensor's storage and overwrite entries before
    # they are read.
    reorder = [_MODEL_LABELS.index(label) for label in _OUTPUT_LABELS]
    distilbert_pred = model_order_probs[reorder].numpy()
    concat_features = np.concatenate((lr_model_pred, distilbert_pred))

    # Meta model 3: LR One Vs All
    probs = meta_model.predict_proba([concat_features])

    # Preparing Output
    out = [[label, prob] for label, prob in zip(_OUTPUT_LABELS, probs[0])]
    st.write(out)