import torch
import torch.nn.functional as F
import gradio as gr
from transformers import BertModel, BertTokenizerFast

# LaBSE produces language-agnostic sentence embeddings, so the similarity
# score also works across languages.
tokenizer = BertTokenizerFast.from_pretrained("setu4993/LaBSE")
model = BertModel.from_pretrained("setu4993/LaBSE")
model.eval()


def embed(text, tokenizer, model):
    """Return the pooled sentence embedding for `text`."""
    inputs = tokenizer(text, return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.pooler_output


def similarity(embeddings_1, embeddings_2):
    """Cosine similarity between two batches of embeddings."""
    normalized_embeddings_1 = F.normalize(embeddings_1, p=2)
    normalized_embeddings_2 = F.normalize(embeddings_2, p=2)
    return torch.matmul(
        normalized_embeddings_1, normalized_embeddings_2.transpose(0, 1)
    )


# STS-style labels for cosine similarity binned to the integers 0-5.
LABELS = [
    "Completely dissimilar",
    "Same general topic, but not equivalent",
    "Not equivalent, but share some details",
    "Roughly equivalent, important details differ or are missing",
    "Mostly equivalent, unimportant details differ",
    "Equivalent",
]


def semantic_sim(sentence1, sentence2):
    em1 = embed(sentence1, tokenizer, model)
    em2 = embed(sentence2, tokenizer, model)
    # Scale cosine similarity to 0-5 and clamp: cosine similarity can be
    # slightly negative, which the original if/elif chain silently mapped
    # to an empty string.
    sim = int(float(similarity(em1, em2)) * 5)
    sim = max(0, min(5, sim))
    return LABELS[sim]


iface = gr.Interface(fn=semantic_sim, inputs=["text", "text"], outputs=["text"])
iface.launch()
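# Quick sanity check without the web UI (a minimal sketch; the sentences are
# illustrative, and it must run before `iface.launch()`, which blocks):
#
#   print(semantic_sim("The cat sat on the mat.", "A cat is sitting on a mat."))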