import gradio as gr
import json
from datasets import load_dataset  # NOTE(review): only referenced by commented-out code below
from sentence_transformers import SentenceTransformer, InputExample, util
import pandas as pd


def Main(Modelo, Umbral, File):
    """Gradio entry point.

    Loads the uploaded JSON file, scores every answer pair with the selected
    fine-tuned model and returns [error message, result DataFrame].

    Args:
        Modelo: suffix of the model name; expanded to 'jfarray/Model_<Modelo>_50_Epochs'.
        Umbral: threshold percentage forwarded to TestModel.
        File: Gradio file object; only its .name (path on disk) is used.
    """
    error = ""
    modelResult = ""
    try:
        data_test = ConvertJsonToList(File.name)
        modelResult = TestModel('jfarray/Model_'+ Modelo +'_50_Epochs', data_test, Umbral)
    except Exception as e:
        # Fix: return the message text, not the exception object —
        # the Label output component expects a string.
        error = str(e)
    return [error, modelResult]


def ConvertJsonToList(fileName):
    """Parse a file of newline-separated JSON objects into InputExamples.

    Each record supplies the student answer ('respuesta'), the human mark
    ('nota') and a reference answer assembled by concatenating every
    'minirespuesta' fragment under metadata.minipreguntas.
    """
    #subject_fileDataset = load_dataset("json", data_files=fileName)
    with open(fileName, encoding='utf-8') as fh:
        # The file holds one JSON object per line; splice commas in so the
        # whole content parses as a single JSON array.
        fileContent = fh.read().replace('}\n{', '}\n,{')
    subject_fileDataset = json.loads('[' + fileContent + ']')

    samples = []
    for record in subject_fileDataset:
        # "".join avoids the quadratic string += accumulation of the original.
        responseTeacher = "".join(
            mini['minirespuesta'] for mini in record['metadata']['minipreguntas']
        )
        samples.append(InputExample(
            guid=record['hashed_id'],
            texts=[responseTeacher, record['respuesta']],
            label=record['nota'],
        ))
    return samples


def TestModel(checkpoint, data, Umbral):
    """Encode teacher/student answer pairs and report cosine similarity.

    Umbral is a percentage: a row is flagged "SI" in 'Supera Umbral' when the
    gap between the human mark and the similarity score is at least
    mark * Umbral / 100.

    Returns:
        pandas.DataFrame with columns Hashed_id, Nota, Similitud Semántica,
        Supera Umbral.
    """
    model = SentenceTransformer(checkpoint)

    sentences1 = [sample.texts[0] for sample in data]  # teacher reference answers
    sentences2 = [sample.texts[1] for sample in data]  # student answers

    # Compute embeddings for both lists in one batch each.
    embeddings1 = model.encode(sentences1, convert_to_tensor=True)
    embeddings2 = model.encode(sentences2, convert_to_tensor=True)
    # Full pairwise cosine-similarity matrix; only the diagonal
    # (i-th teacher vs i-th student) is consumed below.
    cosine_scores = util.cos_sim(embeddings1, embeddings2)

    hashed_ids = []
    marks = []
    scores = []
    thresholds = []
    for i, sample in enumerate(data):
        # Hoisted: the original rounded the same diagonal value twice per row.
        score = round(cosine_scores[i][i].item(), 3)
        hashed_ids.append(sample.guid)
        marks.append(sample.label)
        scores.append(score)
        umbralDiferencia = sample.label * (Umbral / 100)
        if abs(round(sample.label, 3) - score) >= umbralDiferencia:
            thresholds.append("SI")
        else:
            thresholds.append("")

    # Build the result table in one construction instead of assigning
    # columns onto an empty DataFrame.
    return pd.DataFrame({
        "Hashed_id": hashed_ids,
        "Nota": marks,
        "Similitud Semántica": scores,
        "Supera Umbral": thresholds,
    })


# --- Gradio UI wiring (legacy gr.inputs/gr.outputs API, kept as-is) ---
Modelos = gr.inputs.Dropdown(["dccuchile_bert-base-spanish-wwm-uncased"
                              , "bert-base-multilingual-uncased"
                              , "all-distilroberta-v1"
                              , "paraphrase-multilingual-mpnet-base-v2"
                              , "paraphrase-multilingual-MiniLM-L12-v2"
                              , "distiluse-base-multilingual-cased-v1"])
Umbral = gr.inputs.Slider(minimum=0, maximum=100, step=None, default=15, label=None)
FileInput = gr.inputs.File(file_count="single", type="file", label="Fichero Json")
LabelOutput = gr.outputs.Label(num_top_classes=None, type="auto", label="")
DataFrameOutput = gr.outputs.Dataframe(headers=["Hashed_id", "Nota", "Similitud Semántica", "Supera Umbral"]
                                       , max_rows=20, max_cols=4, overflow_row_behaviour="paginate"
                                       , type="pandas", label="Resultado")

iface = gr.Interface(fn=Main
                     , inputs=[Modelos, Umbral, FileInput]
                     , outputs=[LabelOutput, DataFrameOutput]
                     , examples=[["dccuchile_bert-base-spanish-wwm-uncased", 10, "TestFile.json"]]
                     , title="Similitud Semántica de textos en Español de tamaño medio (200-250 palabras)"
                     )
iface.launch(share=False, enable_queue=True, show_error=True)