|
import gradio as gr |
|
import json |
|
from datasets import load_dataset |
|
from sentence_transformers import SentenceTransformer, InputExample, util |
|
import pandas as pd |
|
|
|
def Main(Modelo, Umbral, File):
    """Gradio entry point: score an uploaded JSON file with the chosen model.

    Parameters
    ----------
    Modelo : str
        Model-name suffix; expanded to the hub checkpoint
        'jfarray/Model_<Modelo>_50_Epochs'.
    Umbral : float
        Threshold percentage (0-100) forwarded to TestModel.
    File : file-like
        Uploaded file object from gr.inputs.File; only its .name (path) is used.

    Returns
    -------
    list
        [error, modelResult] — error is "" on success; modelResult is "" when
        processing failed.
    """
    error = ""
    modelResult = ""
    try:
        data_test = ConvertJsonToList(File.name)
        modelResult = TestModel('jfarray/Model_' + Modelo + '_50_Epochs', data_test, Umbral)
    except Exception as e:
        # Store the message as a string so the Label output renders it cleanly
        # (the original stored the exception object itself).
        error = str(e)
    return [error, modelResult]
|
|
|
def ConvertJsonToList(fileName):
    """Parse a file of newline-separated JSON objects into InputExample pairs.

    Each record yields one InputExample whose texts are
    [concatenated teacher mini-answers, student answer] and whose label is the
    human mark ('nota').

    Parameters
    ----------
    fileName : str
        Path to the uploaded JSON file.

    Returns
    -------
    list[InputExample]
    """
    # The file holds concatenated JSON objects separated by blank lines;
    # insert commas and wrap in brackets so it parses as one JSON array.
    with open(fileName, encoding='utf-8') as fh:
        fileContent = fh.read().replace('}\n{', '}\n,{')
    subject_fileDataset = json.loads('[' + fileContent + ']')

    samples = []
    for record in subject_fileDataset:
        # Teacher reference answer = concatenation of every mini-answer.
        responseTeacher = ''.join(
            pregunta['minirespuesta']
            for pregunta in record['metadata']['minipreguntas']
        )
        samples.append(
            InputExample(
                guid=record['hashed_id'],
                texts=[responseTeacher, record['respuesta']],
                label=record['nota'],
            )
        )
    return samples
|
|
|
def TestModel(checkpoint, data, Umbral):
    """Score teacher/student answer pairs with a SentenceTransformer model.

    Parameters
    ----------
    checkpoint : str
        Model path or Hugging Face hub id for SentenceTransformer.
    data : list[InputExample]
        Pairs from ConvertJsonToList: texts[0] is the teacher answer,
        texts[1] the student answer, label the human mark.
    Umbral : float
        Threshold as a percentage of the mark; rows where
        |mark - similarity| >= mark * Umbral/100 are flagged "SI".

    Returns
    -------
    pandas.DataFrame
        Columns: Hashed_id, Nota, Similitud Semántica, Supera Umbral.
    """
    model = SentenceTransformer(checkpoint)

    sentences1 = [example.texts[0] for example in data]
    sentences2 = [example.texts[1] for example in data]

    embeddings1 = model.encode(sentences1, convert_to_tensor=True)
    embeddings2 = model.encode(sentences2, convert_to_tensor=True)
    # Full pairwise similarity matrix; only the diagonal (i-th teacher vs
    # i-th student) is consumed below.
    cosine_scores = util.cos_sim(embeddings1, embeddings2)

    hashed_ids = []
    marks = []
    scores = []
    thresholds = []
    for i, example in enumerate(data):
        score = round(cosine_scores[i][i].item(), 3)
        hashed_ids.append(example.guid)
        marks.append(example.label)
        scores.append(score)
        # Flag rows whose mark/similarity gap reaches the mark-relative threshold.
        umbralDiferencia = example.label * (Umbral / 100)
        if abs(round(example.label, 3) - score) >= umbralDiferencia:
            thresholds.append("SI")
        else:
            thresholds.append("")

    df = pd.DataFrame({
        "Hashed_id": hashed_ids,
        "Nota": marks,
        "Similitud Semántica": scores,
        "Supera Umbral": thresholds,
    })
    return df
|
|
|
# ---- Gradio UI wiring (legacy gr.inputs / gr.outputs API — version-pinned;
# NOTE(review): this API was removed in Gradio 3.x, confirm installed version) ----

# Model-name suffixes; Main() expands the selection to the hub checkpoint
# 'jfarray/Model_<choice>_50_Epochs'.
Modelos = gr.inputs.Dropdown(["dccuchile_bert-base-spanish-wwm-uncased"
, "bert-base-multilingual-uncased"
, "all-distilroberta-v1"
, "paraphrase-multilingual-mpnet-base-v2"
, "paraphrase-multilingual-MiniLM-L12-v2"
, "distiluse-base-multilingual-cased-v1"])
# Threshold percentage (0-100, default 15) used by TestModel to flag rows
# where the mark/similarity gap is large.
Umbral = gr.inputs.Slider(minimum=0, maximum=100, step=None, default=15, label=None)
# JSON file of newline-separated records (see ConvertJsonToList for the schema).
FileInput = gr.inputs.File(file_count="single", type="file", label="Fichero Json")
# First output: error message ("" when processing succeeds).
LabelOutput = gr.outputs.Label(num_top_classes=None, type="auto", label="")
# Second output: per-answer results table produced by TestModel.
DataFrameOutput = gr.outputs.Dataframe(headers=["Hashed_id", "Nota", "Similitud Semántica", "Supera Umbral"]
, max_rows=20, max_cols=4, overflow_row_behaviour="paginate", type="pandas", label="Resultado")

iface = gr.Interface(fn=Main
, inputs=[Modelos, Umbral, FileInput]
, outputs=[LabelOutput, DataFrameOutput]
, examples=[["dccuchile_bert-base-spanish-wwm-uncased", 10, "TestFile.json"]]
, title = "Similitud Semántica de textos en Español de tamaño medio (200-250 palabras)"
)

# enable_queue serializes concurrent requests; show_error surfaces tracebacks
# in the UI instead of failing silently.
iface.launch(share = False,enable_queue=True, show_error =True)