Spaces:

Kuaaangwen
/

auto-grader

Runtime error

App Files Files Community

auto-grader / app.py

Kuaaangwen

Update app.py

917d2f9 about 2 years ago

raw history blame

No virus

6.75 kB

	import streamlit as st

	# Library for Sentence Similarity
	import pandas as pd
	from sentence_transformers import SentenceTransformer
	from sklearn.metrics.pairwise import cosine_similarity

	# Library for Entailment
	from transformers import AutoTokenizer, AutoModelForSequenceClassification
	import torch


	# Library for keyword extraction
	import yake


	# Load models and tokenisers for both sentence transformers and text classification.
	#
	# Streamlit re-executes this script from the top on every widget interaction,
	# so the loaders below are cached: the weights are read once per server
	# process instead of once per rerun. `allow_output_mutation=True` tells the
	# cache not to hash the returned objects on each call.

	@st.cache(allow_output_mutation=True)
	def _load_sentence_transformer():
	    """Return the 'all-MiniLM-L6-v2' SentenceTransformer used for similarity."""
	    return SentenceTransformer('all-MiniLM-L6-v2')


	@st.cache(allow_output_mutation=True)
	def _load_entailment_model():
	    """Return the (tokenizer, classifier) pair for "roberta-large-mnli"."""
	    mnli_tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
	    mnli_model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli")
	    return mnli_tokenizer, mnli_model


	sentence_transformer_model = _load_sentence_transformer()

	tokenizer, text_classification_model = _load_entailment_model()



	### Streamlit interface ###

	st.title("Sentence Similarity")

	# The sidebar drop-down chooses which of the two workflows is rendered
	# below; the first option is the one Streamlit pre-selects.
	_mode_choices = ("Compare two sentences", "Bulk upload and mark")
	sidebar_selectbox = st.sidebar.selectbox("What would you like to work with?", _mode_choices)

	# Streamlit form elements (default to "Compare two sentences")
	#
	# Workflow: the user enters two sentences; on submit the page shows
	#   1. their cosine similarity (sentence-transformer embeddings),
	#   2. the class probabilities from the sequence-classification model,
	#   3. up to ten YAKE keywords extracted from sentence 2.

	if sidebar_selectbox == "Compare two sentences":

	    st.subheader("Compare the similarity between two sentences")

	    with st.form("submission_form", clear_on_submit=False):

	        sentence_1 = st.text_input("Sentence 1 input")

	        sentence_2 = st.text_input("Sentence 2 input")

	        submit_button_compare = st.form_submit_button("Compare Sentences")

	        # NOTE(review): results are rendered inside the form container here;
	        # confirm this matches the intended layout.
	        if submit_button_compare:

	            print("Comparing sentences...")

	            ### Compare Sentence Similarity ###

	            # One encode call returns one embedding row per input sentence.
	            sentence_embeddings = sentence_transformer_model.encode([sentence_1, sentence_2])

	            # Slicing keeps each operand 2-D, as cosine_similarity expects.
	            cos_sim = cosine_similarity(sentence_embeddings[:1], sentence_embeddings[1:])[0][0]
	            cos_sim = round(cos_sim * 100)  # Convert to percentage and round-off

	            st.subheader("Similarity")
	            st.write(f"Similarity between the two sentences is {cos_sim}%.")

	            ### Text classification - entailment, neutral or contradiction ###

	            # Both sentences are joined into one input string.
	            # NOTE(review): "</s></s>" is presumably the separator this
	            # tokenizer expects between the two segments - confirm.
	            raw_inputs = [f"{sentence_1}</s></s>{sentence_2}"]

	            inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")

	            # Inference only: no gradients are needed, so skip autograd tracking.
	            with torch.no_grad():
	                logits = text_classification_model(**inputs).logits

	            # Probabilities for the single input row.
	            probabilities = torch.nn.functional.softmax(logits, dim=-1)[0]

	            id2label = text_classification_model.config.id2label

	            # Console log, in label-id order 0, 1, 2.
	            for label_id in (0, 1, 2):
	                print(id2label[label_id], ":", round(probabilities[label_id].item()*100, 2), "%")

	            st.subheader("Text classification for both sentences:")

	            # On-page order is 1, 0, 2 (deliberately not the console order).
	            for label_id in (1, 0, 2):
	                st.write(id2label[label_id], ":", round(probabilities[label_id].item()*100, 2), "%")

	            ### Extract keywords with YAKE ### (might make more sense with word cloud)

	            st.subheader("Keywords:")

	            # Keywords are taken from sentence 2 only.
	            kw_extractor = yake.KeywordExtractor(top=10, stopwords=None)
	            keywords = kw_extractor.extract_keywords(sentence_2)

	            # Each entry is a (keyphrase, score) pair; only the phrase is shown.
	            for kw, _score in keywords:
	                st.write(kw)





	# "Bulk upload and mark": score every row of an uploaded CSV against one
	# reference sentence and offer the annotated table as a download.
	# The CSV must provide the answers in a column named 'Sentences'.
	if sidebar_selectbox == "Bulk upload and mark":

	    st.subheader("Bulk compare similarity of sentences")

	    sentence_reference = st.text_input("Reference sentence input")

	    # Only allow user to upload CSV files
	    data_file = st.file_uploader("Upload CSV", type=["csv"])

	    if data_file is not None:
	        df = pd.read_csv(data_file)

	        if 'Sentences' not in df.columns:
	            # Report the problem on the page instead of failing with a KeyError.
	            st.error("The uploaded CSV must contain a 'Sentences' column.")
	        else:
	            with st.spinner('Wait for it...'):
	                # Blank cells are read as NaN; score them as empty text
	                # rather than handing a float to the encoder.
	                answers = df['Sentences'].fillna("").astype(str).tolist()

	                if answers:
	                    # Encode the reference and all answers in one batch, so
	                    # the reference is embedded once rather than once per row.
	                    embeddings = sentence_transformer_model.encode([sentence_reference] + answers)

	                    # Row 0 is the reference; the rest are the answers.
	                    # The result has shape (1, number of answers).
	                    scores = cosine_similarity(embeddings[:1], embeddings[1:])[0]

	                    # Convert to percentage and round-off
	                    similarity_scores = [round(score * 100) for score in scores]
	                else:
	                    # Header-only CSV: nothing to compare.
	                    similarity_scores = []

	                # Append new column to dataframe
	                df['Similarity (%)'] = similarity_scores

	                st.dataframe(df)
	                st.success('Done!')

	            @st.cache
	            def convert_df(df):
	                """Return *df* serialised as UTF-8 encoded CSV bytes.

	                Cached so the conversion is not repeated on every rerun
	                for an unchanged dataframe.
	                """
	                return df.to_csv().encode('utf-8')

	            csv = convert_df(df)

	            st.download_button(
	                label="Press to Download",
	                data=csv,
	                file_name="marked assignment.csv",
	                mime="text/csv",
	                key='download-csv',
	            )