# auto-grader / app.py
import streamlit as st

# Library for Sentence Similarity
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Library for Entailment
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Library for keyword extraction
import yake

# Load models and tokenisers for both sentence transformers and text classification
sentence_transformer_model = SentenceTransformer('all-MiniLM-L6-v2')
tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
text_classification_model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli")
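# Notes on the loaded models:
# - 'all-MiniLM-L6-v2' maps each sentence to a 384-dimensional embedding, used for
#   the cosine-similarity scoring below.
# - 'roberta-large-mnli' is a 3-way natural language inference classifier
#   (contradiction / neutral / entailment) applied to the sentence pair.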
### Streamlit interface ###

st.title("Sentence Similarity")

sidebar_selectbox = st.sidebar.selectbox(
    "What would you like to work with?",
    ("Compare two sentences", "Bulk upload and mark")
)
# Streamlit form elements (default to "Compare two sentences")
if sidebar_selectbox == "Compare two sentences":

    st.subheader("Compare the similarity between two sentences")

    with st.form("submission_form", clear_on_submit=False):

        sentence_1 = st.text_input("Sentence 1 input")
        sentence_2 = st.text_input("Sentence 2 input")

        submit_button_compare = st.form_submit_button("Compare Sentences")
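    # Note: st.form batches the widget inputs, so the comparison below only runs
    # when the submit button is pressed, not on every keystroke.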
    # If submit_button_compare clicked
    if submit_button_compare:

        print("Comparing sentences...")

        ### Compare Sentence Similarity ###

        # Perform calculations

        # Initialise sentences list
        sentences = []

        # Append input sentences to 'sentences' list
        sentences.append(sentence_1)
        sentences.append(sentence_2)

        # Create embeddings for both sentences
        sentence_embeddings = sentence_transformer_model.encode(sentences)

        cos_sim = cosine_similarity(sentence_embeddings[0].reshape(1, -1), sentence_embeddings[1].reshape(1, -1))[0][0]
        cos_sim = round(cos_sim * 100)  # Convert to a percentage and round off
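        # cosine_similarity returns dot(a, b) / (||a|| * ||b||), a value in [-1, 1];
        # multiplying by 100 turns it into a rough percentage score for display.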
        st.subheader("Similarity")
        st.write(f"Similarity between the two sentences is {cos_sim}%.")
        ### Text classification - entailment, neutral or contradiction ###

        raw_inputs = [f"{sentence_1}</s></s>{sentence_2}"]
        inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
        outputs = text_classification_model(**inputs)
        outputs = torch.nn.functional.softmax(outputs.logits, dim=-1)
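        # For roberta-large-mnli, id2label maps 0 -> CONTRADICTION, 1 -> NEUTRAL, 2 -> ENTAILMENT.
        # The manual "</s></s>" separator mimics the premise/hypothesis pair encoding;
        # tokenizer(sentence_1, sentence_2, ...) should produce an equivalent input.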
        print(text_classification_model.config.id2label[0], ":", round(outputs[0][0].item() * 100, 2), "%")
        print(text_classification_model.config.id2label[1], ":", round(outputs[0][1].item() * 100, 2), "%")
        print(text_classification_model.config.id2label[2], ":", round(outputs[0][2].item() * 100, 2), "%")

        st.subheader("Text classification for both sentences:")
        st.write(text_classification_model.config.id2label[1], ":", round(outputs[0][1].item() * 100, 2), "%")
        st.write(text_classification_model.config.id2label[0], ":", round(outputs[0][0].item() * 100, 2), "%")
        st.write(text_classification_model.config.id2label[2], ":", round(outputs[0][2].item() * 100, 2), "%")
        ### Extract keywords with YAKE ### (might make more sense with a word cloud)

        st.subheader("Keywords:")

        kw_extractor = yake.KeywordExtractor(top=10, stopwords=None)
        keywords = kw_extractor.extract_keywords(sentence_2)
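        # extract_keywords returns (keyphrase, score) tuples; lower YAKE scores indicate
        # more relevant keyphrases. Only the keyphrase text is displayed here.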
        for kw, score in keywords:
            st.write(kw)
if sidebar_selectbox == "Bulk upload and mark":

    st.subheader("Bulk compare similarity of sentences")

    sentence_reference = st.text_input("Reference sentence input")

    # Only allow user to upload CSV files
    data_file = st.file_uploader("Upload CSV", type=["csv"])

    if data_file is not None:

        with st.spinner('Wait for it...'):

            file_details = {"filename": data_file.name, "filetype": data_file.type, "filesize": data_file.size}
            # st.write(file_details)

            df = pd.read_csv(data_file)
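            # The uploaded CSV is expected to contain a 'Sentences' column; each row is
            # compared against the reference sentence entered above.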
            similarity_scores = []

            for idx, row in df.iterrows():

                # Compare the sentences two by two
                sentences = []
                sentence_comparison = row['Sentences']

                sentences.append(sentence_reference)
                sentences.append(sentence_comparison)

                sentence_embeddings = sentence_transformer_model.encode(sentences)

                cos_sim = cosine_similarity(sentence_embeddings[0].reshape(1, -1), sentence_embeddings[1].reshape(1, -1))[0][0]
                cos_sim = round(cos_sim * 100)

                similarity_scores.append(cos_sim)

            # Append new column to dataframe
            df['Similarity (%)'] = similarity_scores

            st.dataframe(df)

        st.success('Done!')
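        # st.cache memoises the CSV conversion; newer Streamlit releases replace this
        # decorator with st.cache_data.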
        @st.cache
        def convert_df(df):
            return df.to_csv().encode('utf-8')

        csv = convert_df(df)

        st.download_button(
            "Press to Download",
            csv,
            "marked assignment.csv",
            "text/csv",
            key='download-csv'
        )