# Hosting-page residue captured together with this file (not program code):
#   tyang's picture
#   Update app.py
#   55a6cc4
import torch
from scipy.spatial.distance import cosine
from transformers import AutoModel, AutoTokenizer
from thefuzz import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr
# Hugging Face Hub checkpoint identifiers for the two embedding models.
_SIMCSE_CHECKPOINT = "princeton-nlp/sup-simcse-bert-base-uncased"
_MPNET_CHECKPOINT = "sentence-transformers/stsb-mpnet-base-v2"

# Each tokenizer/model pair is loaded once, when the module is imported.
tokenizer_simcse = AutoTokenizer.from_pretrained(_SIMCSE_CHECKPOINT)
model_simcse = AutoModel.from_pretrained(_SIMCSE_CHECKPOINT)
tokenizer_mpnet = AutoTokenizer.from_pretrained(_MPNET_CHECKPOINT)
model_mpnet = AutoModel.from_pretrained(_MPNET_CHECKPOINT)

# Module-level TF-IDF vectorizer built with scikit-learn's default settings.
vectorizer = TfidfVectorizer()
def mean_pooling(model_output, attention_mask):
    """Average token embeddings, counting only positions the mask keeps.

    Args:
        model_output: Indexable model result whose first element holds the
            per-token embeddings (assumed (batch, seq, dim) -- confirm
            against the model in use).
        attention_mask: Mask with one fewer dimension than the embeddings;
            it is broadcast across the embedding dimension.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask total.
    """
    hidden_states = model_output[0]
    # Stretch the mask so it lines up element-wise with the embeddings.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * mask, 1)
    # Clamp so a fully masked row divides by a tiny number instead of zero.
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / mask_total
def thefuzz(text1, text2):
    """Score two texts with thefuzz's token-sort ratio.

    Returns:
        A single-entry dict mapping a fixed label to the ratio divided by 100.
    """
    ratio = fuzz.token_sort_ratio(text1, text2)
    label = 'levenshtein distance of sorted tokens'
    return {label: ratio / 100}
def tfidf(text1, text2):
    """Return the cosine similarity between the TF-IDF vectors of two texts.

    Both texts are vectorized over a vocabulary built from the pair, so the
    score is symmetric and words that occur in only one text lower it.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A single-entry dict mapping a fixed label to the similarity, rounded
        to two decimals and rendered as a string.
    """
    # Use a fresh vectorizer per call: fitting mutates the vectorizer, so a
    # shared instance could mix vocabularies between overlapping requests.
    pair_vectorizer = TfidfVectorizer()
    try:
        # Fit on BOTH texts. Fitting on text1 alone drops every token unique
        # to text2, which inflates the score and makes it order-dependent.
        weights = pair_vectorizer.fit_transform([text1, text2])
    except ValueError:
        # Raised by scikit-learn when no token survives tokenization (for
        # example both texts are empty); treat that as "nothing in common".
        similarity = 0.0
    else:
        similarity = cosine_similarity(weights[0:1], weights[1:2]).flatten()[0]
    return {'cosine similarity of tf-idf vectors': str(round(similarity, 2))}
def simcse(text1, text2):
    """Compare two texts by the cosine similarity of their SimCSE embeddings.

    Both texts are tokenized as one padded, truncated batch and the model's
    pooler output is used as the sentence embedding.

    Returns:
        A single-entry dict mapping a fixed label to the similarity, rounded
        to two decimals and rendered as a string.
    """
    batch = tokenizer_simcse(
        [text1, text2], padding=True, truncation=True, return_tensors="pt"
    )
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        result = model_simcse(**batch, output_hidden_states=True, return_dict=True)
        pooled = result.pooler_output
    first_vec, second_vec = pooled[0], pooled[1]
    # scipy's cosine() is a distance; one minus it is the similarity.
    similarity = 1 - cosine(first_vec, second_vec)
    return {"cosine similarity of simcse embeddings": str(round(similarity, 2))}
def mpnet(text1, text2):
    """Compare two texts by the cosine similarity of their stsb-mpnet embeddings.

    Both texts are tokenized as one padded, truncated batch; sentence
    embeddings come from mean_pooling() over the model output using the
    batch's attention mask.

    Returns:
        A single-entry dict mapping a fixed label to the similarity, rounded
        to two decimals and rendered as a string.
    """
    batch = tokenizer_mpnet(
        [text1, text2], padding=True, truncation=True, return_tensors='pt'
    )
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        raw_output = model_mpnet(**batch)
    pooled = mean_pooling(raw_output, batch['attention_mask'])
    first_vec, second_vec = pooled[0], pooled[1]
    # scipy's cosine() is a distance; one minus it is the similarity.
    similarity = 1 - cosine(first_vec, second_vec)
    return {"cosine similarity of stsb-mpnet embeddings": str(round(similarity, 2))}
def get_scores(text1, text2):
    """Run every similarity scorer on the two texts.

    Returns:
        A 4-tuple of result dicts ordered SimCSE, stsb-mpnet, token-sort
        ratio, TF-IDF.
    """
    # Scorers are invoked in this order; the return order below differs.
    scorers = (thefuzz, tfidf, simcse, mpnet)
    token_sort_res, tfidf_res, simcse_res, mpnet_res = (
        scorer(text1, text2) for scorer in scorers
    )
    return simcse_res, mpnet_res, token_sort_res, tfidf_res
# Two five-line text boxes, one for each text being compared.
inputs = [
    gr.Textbox(lines=5, label=box_label)
    for box_label in ("Input Text One", "Input Text Two")
]

# One Label component per similarity method.
outputs = [
    gr.Label(label=panel_label)
    for panel_label in (
        "Cosine similarity based on SimCSE embeddings",
        "Cosine similarity based on stsb-mpnet embeddings",
        "Token sort ratio using Levenshtein distance",
        "Cosine similarity based on tf-idf vectors",
    )
]
# Static page copy handed to the Gradio interface.
title = "SimCSE vs MPNet vs thefuzz vs TF-IDF"

description = (
    "Demo for comparing semantic text similarity methods. "
    "Princeton-NLP SimCSE, stsb-mpnet-base-v2 from sentence-transformers "
    "(MPnet from Microsoft as the backbone), thefuzz from SeatGeek, and TF-IDF. "
    "Interface by Troy Yang."
)

# Centered footer paragraph: three reference links separated by " | ".
article = (
    "<p style='text-align: center'>"
    "<a href='https://github.com/princeton-nlp/SimCSE'>"
    "SimCSE: Simple Contrastive Learning of Sentence Embeddings</a>"
    " | "
    "<a href='https://huggingface.co/sentence-transformers/stsb-mpnet-base-v2'>"
    "stsb-mpnet-base-v2 HuggingFace model card</a>"
    " | "
    "<a href='https://github.com/seatgeek/thefuzz'>"
    "thefuzz: Fuzzy String Matching in Python</a>"
    "</p>"
)
# Sample sentence pairs offered in the UI; each entry is [text one, text two].
examples = [
    [
        "There's a kid on a skateboard.",
        "A kid is skateboarding.",
    ],
    [
        'There is no boy standing in front of the blue building in the space reserved for handicapped people',
        'A boy is standing in front of the blue building in the space reserved for handicapped people',
    ],
    [
        'People wearing costumes are gathering in a forest and are looking in the same direction',
        'Masked people are looking in the same direction in a forest',
    ],
    [
        'Two large persons are sitting on a park bench and they have a bottle of soda between them',
        'Two large persons are standing near a park bench and they have nothing between them',
    ],
    [
        'A young man with brown hair and shades is sitting in front of some cans of soda',
        'A young man with brown hair and sunglasses is sitting in front of some cans of soda',
    ],
    [
        'A young lady with light brown hair is wearing a red necklace, a sweatshirt and earrings and is smiling',
        'There is no young lady with light brown hair wearing a red necklace, a sweatshirt and earrings and smiling',
    ],
    [
        'A woman wearing a blue and white uniform with a white and blue hat is keeping her mouth open and is near others dressed in the same fashion',
        'A woman wearing casual clothing is keeping her mouth closed and is near other people dressed differently',
    ],
    [
        'The man with brown hair is wearing sunglasses and is sitting listlessly at a table with cans of soda and other drinks',
        'The man with brown hair is wearing sunglasses and is sitting at a table with cans of soda and other drinks',
    ],
    [
        'There is no man wearing clothes that are covered with paint or is sitting outside in a busy area writing something',
        'A man is wearing clothes that are covered with paint and is sitting outside in a busy area writing something',
    ],
    [
        'The shirtless man in striped shorts and sunglasses is not standing near a man in a white shirt and sunglasses',
        'The shirtless man in striped shorts and sunglasses is standing near a man in a white shirt and sunglasses',
    ],
    [
        'The shirtless man in striped shorts and sunglasses is standing near a man in a white shirt and sunglasses',
        'The shirtless man in striped shorts and sunglasses is standing near a person in a white shirt and sunglasses',
    ],
    [
        'A young boy is wearing a blue patterned swim suit, a black and yellow swim cap and has blue swim goggles on her head',
        'A young girl is wearing a blue patterned swim suit, a black and yellow swim cap and has blue swimming goggles on her head',
    ],
]
# Assemble the demo around get_scores() and start serving it.
gr.Interface(
    get_scores,
    inputs,
    outputs,
    title=title,
    description=description,
    article=article,
    theme="darkdefault",
    examples=examples,
    # Choices offered to a user who flags a result. The last entry previously
    # read "stongly unrelated" (typo).
    flagging_options=[
        "strongly related",
        "related",
        "neutral",
        "unrelated",
        "strongly unrelated",
    ],
).launch()