Spaces:
Runtime error
Runtime error
import streamlit as st | |
# Library for Sentence Similarity | |
import pandas as pd | |
from sentence_transformers import SentenceTransformer | |
from sklearn.metrics.pairwise import cosine_similarity | |
# Library for Entailment | |
from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
import torch | |
# Library for keyword extraction | |
import yake | |
# Load models and tokenisers for both sentence transformers and text classification | |
sentence_transformer_model = SentenceTransformer('all-MiniLM-L6-v2') | |
tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli") | |
text_classification_model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli") | |
### Streamlit interface ### | |
st.title("Sentence Similarity") | |
sidebar_selectbox = st.sidebar.selectbox( | |
"What would you like to work with?", | |
("Compare two sentences", "Bulk upload and mark") | |
) | |
# Streamlit form elements (default to "Compare two sentences") | |
if sidebar_selectbox == "Compare two sentences": | |
st.subheader("Compare the similarity between two sentences") | |
with st.form("submission_form", clear_on_submit=False): | |
sentence_1 = st.text_input("Sentence 1 input") | |
sentence_2 = st.text_input("Sentence 2 input") | |
submit_button_compare = st.form_submit_button("Compare Sentences") | |
# If submit_button_compare clicked | |
if submit_button_compare: | |
print("Comparing sentences...") | |
### Compare Sentence Similarity ### | |
# Perform calculations | |
#Initialise sentences | |
sentences = [] | |
# Append input sentences to 'sentences' list | |
sentences.append(sentence_1) | |
sentences.append(sentence_2) | |
# Create embeddings for both sentences | |
sentence_embeddings = sentence_transformer_model.encode(sentences) | |
cos_sim = cosine_similarity(sentence_embeddings[0].reshape(1, -1), sentence_embeddings[1].reshape(1, -1))[0][0] | |
cos_sim = round(cos_sim * 100) # Convert to percentage and round-off | |
# st.write('Similarity between "{}" and "{}" is {}%'.format(sentence_1, | |
# sentence_2, cos_sim)) | |
st.subheader("Similarity") | |
st.write(f"Similarity between the two sentences is {cos_sim}%.") | |
### Text classification - entailment, neutral or contradiction ### | |
raw_inputs = [f"{sentence_1}</s></s>{sentence_2}"] | |
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt") | |
# print(inputs) | |
outputs = text_classification_model(**inputs) | |
outputs = torch.nn.functional.softmax(outputs.logits, dim = -1) | |
# print(outputs) | |
# argmax_index = torch.argmax(outputs).item() | |
print(text_classification_model.config.id2label[0], ":", round(outputs[0][0].item()*100,2),"%") | |
print(text_classification_model.config.id2label[1], ":", round(outputs[0][1].item()*100,2),"%") | |
print(text_classification_model.config.id2label[2], ":", round(outputs[0][2].item()*100,2),"%") | |
st.subheader("Text classification for both sentences:") | |
st.write(text_classification_model.config.id2label[1], ":", round(outputs[0][1].item()*100,2),"%") | |
st.write(text_classification_model.config.id2label[0], ":", round(outputs[0][0].item()*100,2),"%") | |
st.write(text_classification_model.config.id2label[2], ":", round(outputs[0][2].item()*100,2),"%") | |
### Extract keywords with YAKE ### (might make more sense with word cloud) | |
st.subheader("Keywords:") | |
kw_extractor = yake.KeywordExtractor(top=10, stopwords=None) | |
keywords = kw_extractor.extract_keywords(sentence_2) | |
# keywords_array = [] | |
for kw, v in keywords: | |
# print("Keyphrase: ", kw, ": score", v) | |
# keywords_array.append(kw) | |
st.write(kw) | |
if sidebar_selectbox == "Bulk upload and mark": | |
st.subheader("Bulk compare similarity of sentences") | |
sentence_reference = st.text_input("Reference sentence input") | |
# Only allow user to upload CSV files | |
data_file = st.file_uploader("Upload CSV",type=["csv"]) | |
if data_file is not None: | |
with st.spinner('Wait for it...'): | |
file_details = {"filename":data_file.name, "filetype":data_file.type, "filesize":data_file.size} | |
# st.write(file_details) | |
df = pd.read_csv(data_file) | |
# Get length of df.shape (might not need this) | |
#total_rows = df.shape[0] | |
similarity_scores = [] | |
for idx, row in df.iterrows(): | |
# st.write(idx, row['Sentences']) | |
# Create an empty sentence list | |
sentences = [] | |
# Compare the setences two by two | |
sentence_comparison = row['Sentences'] | |
sentences.append(sentence_reference) | |
sentences.append(sentence_comparison) | |
sentence_embeddings = sentence_transformer_model.encode(sentences) | |
cos_sim = cosine_similarity(sentence_embeddings[0].reshape(1, -1), sentence_embeddings[1].reshape(1, -1))[0][0] | |
cos_sim = round(cos_sim * 100) | |
similarity_scores.append(cos_sim) | |
# Append new column to dataframe | |
df['Similarity (%)'] = similarity_scores | |
st.dataframe(df) | |
st.success('Done!') | |
def convert_df(df): | |
return df.to_csv().encode('utf-8') | |
csv = convert_df(df) | |
st.download_button( | |
"Press to Download", | |
csv, | |
"marked assignment.csv", | |
"text/csv", | |
key='download-csv' | |
) |