jgrosjean's picture
Update app.py
f98ea7f verified
raw
history blame
3.86 kB
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
# Load the sentence-swissBERT model and its tokenizer once at module import;
# both objects are shared by every call to generate_sentence_embedding().
model_name = "jgrosjean-mathesis/sentence-swissbert"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
def generate_sentence_embedding(sentence, language):
    """Return the mean-pooled embedding of *sentence* for the given language.

    Args:
        sentence: Text to embed; it is passed straight to the tokenizer.
        language: Language code containing one of "de", "fr", "it" or "rm".
            It selects the default language set on the shared model
            (e.g. "de" -> "de_CH").

    Returns:
        A torch tensor holding the average of the model's last hidden states
        over the non-padding tokens (one row per tokenized input sequence).

    Raises:
        ValueError: If *language* is empty/None or matches no supported code.
            Without this check a missing language failed with an opaque
            TypeError, and an unknown one silently reused whatever language
            the previous call had left set on the shared model.
    """
    if not language:
        raise ValueError("A language must be selected (de, fr, it or rm).")

    # Substring matching is kept for backward compatibility; when several
    # codes match, the last one in this order wins, exactly as the original
    # chain of independent `if` statements behaved.
    language_adapters = (
        ("de", "de_CH"),
        ("fr", "fr_CH"),
        ("it", "it_CH"),
        ("rm", "rm_CH"),
    )
    matched = [adapter for code, adapter in language_adapters if code in language]
    if not matched:
        raise ValueError(
            f"Unsupported language {language!r}; expected one of de, fr, it, rm."
        )
    model.set_default_language(matched[-1])

    inputs = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt", max_length=512)

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)

    # Mean pooling: zero out padding positions, sum over the token axis and
    # divide by the number of real tokens. The clamp guards against a
    # division by zero should the mask be all zeros.
    token_embeddings = outputs.last_hidden_state
    attention_mask = inputs["attention_mask"].unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * attention_mask, 1)
    sum_mask = torch.clamp(attention_mask.sum(1), min=1e-9)
    return sum_embeddings / sum_mask
def calculate_cosine_similarities(source_sentence, source_language, target_sentence_1, target_language_1, target_sentence_2, target_language_2, target_sentence_3, target_language_3):
    """Rank up to three target sentences by cosine similarity to the source.

    Args:
        source_sentence: Sentence every target is compared against.
        source_language: Language code of the source sentence.
        target_sentence_1, target_sentence_2, target_sentence_3: Candidate
            sentences. Slots that are None or blank are skipped.
        target_language_1, target_language_2, target_language_3: Language
            code of the corresponding target sentence.

    Returns:
        A string with one "<sentence>: <score>" line per target, sorted by
        descending score, each line terminated by a newline. The best match
        (first line) is wrapped in "**". An empty string is returned when no
        target sentence was provided.
    """
    candidates = [
        (target_sentence_1, target_language_1),
        (target_sentence_2, target_language_2),
        (target_sentence_3, target_language_3),
    ]
    # The interface advertises "up to three" targets, so unused slots are
    # dropped rather than embedded (or crashing on their unset language).
    candidates = [
        (sentence, language)
        for sentence, language in candidates
        if sentence and sentence.strip()
    ]
    if not candidates:
        return ""

    source_embedding = generate_sentence_embedding(source_sentence, source_language)

    # A list of pairs (not a dict keyed by sentence) keeps identical target
    # sentences as separate entries instead of letting one overwrite another.
    scored = []
    for sentence, language in candidates:
        target_embedding = generate_sentence_embedding(sentence, language)
        similarity = cosine_similarity(source_embedding, target_embedding)[0][0]
        scored.append((sentence, similarity))

    # sorted() is stable, so equal scores keep their input order.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    lines = [sentence + ": " + str(score) for sentence, score in ranked]
    # Emphasise the best match by wrapping the whole first entry.
    lines[0] = "**" + lines[0] + "**"
    return "\n".join(lines) + "\n"
def main():
    """Assemble the Gradio similarity demo and launch it with share=True."""
    language_codes = ["de", "fr", "it", "rm"]

    # One (sentence textbox, language dropdown) pair per slot, in the same
    # order as the parameters of calculate_cosine_similarities.
    slots = (("Source", ""), ("Target", " 1"), ("Target", " 2"), ("Target", " 3"))
    input_components = []
    for role, suffix in slots:
        input_components.append(
            gr.Textbox(
                lines=1,
                placeholder=f"Enter {role.lower()} sentence{suffix}",
                label=f"{role} Sentence{suffix}",
            )
        )
        input_components.append(
            gr.Dropdown(list(language_codes), label=f"{role} Language{suffix}")
        )

    score_box = gr.Textbox(label="Cosine Similarity Scores", type="text", lines=3)

    # A single pre-filled example row covering all four supported languages.
    example_rows = [
        [
            "Der Zug fährt um 9 Uhr in Zürich ab.", "de",
            "Le train arrive à Lausanne à 11 heures.", "fr",
            "Alla stazione di Lugano ci sono diversi binari.", "it",
            "A Cuera van biars trens ellas muntognas.", "rm",
        ]
    ]

    demo = gr.Interface(
        fn=calculate_cosine_similarities,
        inputs=input_components,
        outputs=score_box,
        title="Sentence Similarity Calculator",
        description="Enter a source sentence and up to three target sentences to calculate their cosine similarity.",
        examples=example_rows,
    )
    demo.launch(share=True)
# Start the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()