Spaces:
Runtime error
Runtime error
File size: 6,750 Bytes
917d2f9 514343b 917d2f9 4fd42f1 917d2f9 4fd42f1 917d2f9 4fd42f1 917d2f9 843aeb0 917d2f9 44264ed 917d2f9 418bd7c 88993fe 44264ed e838b9b b3c3404 1716434 b3c3404 418bd7c b3c3404 20efea7 b3c3404 20efea7 b3c3404 56c44d6 15e7e9a 917d2f9 ea052a5 15e7e9a b0f213e 15e7e9a 917d2f9 b1d589a 15e7e9a 917d2f9 56c44d6 08a6457 534b3f6 411304b 82206e8 411304b 82206e8 411304b 82206e8 411304b 917d2f9 411304b 82206e8 411304b 82206e8 411304b c152654 0ddd1ea c152654 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 |
import streamlit as st
# Library for Sentence Similarity
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Library for Entailment
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# Library for keyword extraction
import yake
# Load models and tokenisers for both sentence transformers and text classification.
#
# Streamlit re-executes this whole script on every widget interaction, so the
# (large) models are loaded through a cached loader and built only once per
# server process instead of once per rerun.
#
# `st.cache_resource` is the current API for caching unhashable global
# resources such as ML models; older Streamlit releases only provide the
# legacy `st.cache`, where `allow_output_mutation=True` disables hashing of
# the returned objects. The `or` is lazy, so `st.cache` is only touched when
# `st.cache_resource` is unavailable.
_cache_models = getattr(st, "cache_resource", None) or st.cache(allow_output_mutation=True)


@_cache_models
def _load_models():
    """Load the sentence encoder, the NLI tokenizer and the NLI classifier.

    Returns:
        A 3-tuple ``(sentence_transformer_model, tokenizer,
        text_classification_model)``.
    """
    return (
        SentenceTransformer('all-MiniLM-L6-v2'),
        AutoTokenizer.from_pretrained("roberta-large-mnli"),
        AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli"),
    )


sentence_transformer_model, tokenizer, text_classification_model = _load_models()
### Streamlit interface ###
st.title("Sentence Similarity")

# Sidebar selector; its value decides which workflow the rest of the script
# renders. The first option is the default selection.
sidebar_selectbox = st.sidebar.selectbox(
    label="What would you like to work with?",
    options=("Compare two sentences", "Bulk upload and mark"),
)
# Streamlit form elements (default to "Compare two sentences")
if sidebar_selectbox == "Compare two sentences":
    st.subheader("Compare the similarity between two sentences")

    # Both inputs live in one form so the script only acts on them when the
    # submit button is pressed, not on every keystroke.
    with st.form("submission_form", clear_on_submit=False):
        sentence_1 = st.text_input("Sentence 1 input")
        sentence_2 = st.text_input("Sentence 2 input")
        submit_button_compare = st.form_submit_button("Compare Sentences")

    if submit_button_compare and not (sentence_1.strip() and sentence_2.strip()):
        # Scoring blank input is meaningless; ask for both sentences instead.
        st.warning("Please enter both sentences before comparing.")
    elif submit_button_compare:
        print("Comparing sentences...")

        ### Compare Sentence Similarity ###
        # Embed both sentences in one call, then take the cosine similarity
        # of the two embedding vectors.
        sentence_embeddings = sentence_transformer_model.encode([sentence_1, sentence_2])
        cos_sim = cosine_similarity(
            sentence_embeddings[0].reshape(1, -1),
            sentence_embeddings[1].reshape(1, -1),
        )[0][0]
        cos_sim = round(cos_sim * 100)  # Convert to percentage and round-off

        st.subheader("Similarity")
        st.write(f"Similarity between the two sentences is {cos_sim}%.")

        ### Text classification - entailment, neutral or contradiction ###
        # Pass the sentences as a pair so the tokenizer inserts the model's
        # own separator tokens, rather than splicing "</s></s>" into the
        # user-supplied text by hand.
        inputs = tokenizer(sentence_1, sentence_2, truncation=True, return_tensors="pt")

        # Inference only: no gradients are needed.
        with torch.no_grad():
            logits = text_classification_model(**inputs).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)[0]

        id2label = text_classification_model.config.id2label
        percentages = [round(probabilities[i].item() * 100, 2) for i in range(3)]

        # Log every class probability to the server console.
        for label_id in (0, 1, 2):
            print(id2label[label_id], ":", percentages[label_id], "%")

        # Display order (class index 1, then 0, then 2) is kept as it was.
        st.subheader("Text classification for both sentences:")
        for label_id in (1, 0, 2):
            st.write(id2label[label_id], ":", percentages[label_id], "%")

        ### Extract keywords with YAKE ### (might make more sense with word cloud)
        # NOTE: keywords are extracted from sentence 2 only.
        st.subheader("Keywords:")
        kw_extractor = yake.KeywordExtractor(top=10, stopwords=None)
        for keyword, _score in kw_extractor.extract_keywords(sentence_2):
            st.write(keyword)
def convert_df(df):
    """Serialise *df* to CSV and return it as UTF-8 encoded bytes.

    Not cached: the dataframe is rebuilt on every rerun anyway, and the
    legacy ``st.cache`` decorator is deprecated.
    """
    return df.to_csv().encode('utf-8')


if sidebar_selectbox == "Bulk upload and mark":
    st.subheader("Bulk compare similarity of sentences")
    sentence_reference = st.text_input("Reference sentence input")

    # Only allow user to upload CSV files
    data_file = st.file_uploader("Upload CSV", type=["csv"])

    if data_file is not None:
        df = pd.read_csv(data_file)

        if 'Sentences' not in df.columns:
            # Without this check a wrong file surfaces as a raw KeyError.
            st.error("The uploaded CSV must contain a 'Sentences' column.")
        elif not sentence_reference.strip():
            # Scoring against a blank reference is meaningless.
            st.warning("Please enter a reference sentence before marking.")
        else:
            with st.spinner('Wait for it...'):
                # Empty cells are read as NaN; coerce everything to text so
                # the encoder always receives strings.
                comparison_sentences = df['Sentences'].fillna("").astype(str).tolist()

                if comparison_sentences:
                    # Encode the reference once and all rows in one batch
                    # instead of re-encoding the reference for every row.
                    reference_embedding = sentence_transformer_model.encode([sentence_reference])
                    comparison_embeddings = sentence_transformer_model.encode(comparison_sentences)
                    # Result has one row (the reference) and one column per
                    # comparison sentence.
                    scores = cosine_similarity(reference_embedding, comparison_embeddings)[0]
                    similarity_scores = [round(score * 100) for score in scores]
                else:
                    # CSV with a header but no data rows.
                    similarity_scores = []

                # Append new column to dataframe
                df['Similarity (%)'] = similarity_scores
                st.dataframe(df)

            st.success('Done!')

            csv = convert_df(df)
            st.download_button(
                "Press to Download",
                csv,
                "marked assignment.csv",
                "text/csv",
                key='download-csv',
            )