import streamlit as st

# Library for Sentence Similarity
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Library for Entailment
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch


# Library for keyword extraction
import yake


# Load the models and tokenizer for sentence similarity and text classification

sentence_transformer_model = SentenceTransformer('all-MiniLM-L6-v2')

tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")

text_classification_model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli")
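
# Note: all-MiniLM-L6-v2 maps each sentence to a 384-dimensional embedding,
# and roberta-large-mnli is a three-way natural language inference classifier
# (contradiction / neutral / entailment).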



### Streamlit interface ###
      
st.title("Sentence Similarity")

sidebar_selectbox = st.sidebar.selectbox(
    "What would you like to work with?",
    ("Compare two sentences", "Bulk upload and mark")
)

# Streamlit form elements (default to "Compare two sentences")

if sidebar_selectbox == "Compare two sentences":

       st.subheader("Compare the similarity between two sentences")
       
       with st.form("submission_form", clear_on_submit=False):
       
              sentence_1 = st.text_input("Sentence 1 input")
              
              sentence_2 = st.text_input("Sentence 2 input")
              
              submit_button_compare = st.form_submit_button("Compare Sentences")
              
       # If submit_button_compare clicked
       if submit_button_compare:

              print("Comparing sentences...")

              ### Compare Sentence Similarity ###
       
              # Collect the two input sentences for encoding
              sentences = [sentence_1, sentence_2]
              
              # Create embeddings for both sentences
              sentence_embeddings = sentence_transformer_model.encode(sentences)
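
              # Cosine similarity of the two embeddings lies in [-1, 1];
              # multiplying by 100 below presents it as a rough percentage.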
              
              cos_sim = cosine_similarity(sentence_embeddings[0].reshape(1, -1), sentence_embeddings[1].reshape(1, -1))[0][0]
              cos_sim = round(cos_sim * 100) # Convert to a percentage and round off
             
                     
              # st.write('Similarity between "{}" and "{}" is {}%'.format(sentence_1,
              #        sentence_2, cos_sim))

              st.subheader("Similarity")
              st.write(f"Similarity between the two sentences is {cos_sim}%.")


              ### Text classification - entailment, neutral or contradiction ###
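
              # roberta-large-mnli encodes a premise/hypothesis pair with a "</s></s>"
              # separator; calling tokenizer(sentence_1, sentence_2, ...) would insert
              # the same special tokens automatically.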

              raw_inputs = [f"{sentence_1}</s></s>{sentence_2}"]

              inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")

              # print(inputs)

              # Run the classifier without tracking gradients (inference only)
              with torch.no_grad():
                     outputs = text_classification_model(**inputs)

              # Convert the raw logits into a probability distribution over the three labels
              probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

              st.subheader("Text classification for both sentences:")

              # id2label for roberta-large-mnli: 0 = CONTRADICTION, 1 = NEUTRAL, 2 = ENTAILMENT
              for label_id, label in text_classification_model.config.id2label.items():
                     st.write(label, ":", round(probs[0][label_id].item() * 100, 2), "%")


              ### Extract keywords with YAKE ### (a word cloud might present these better)

              st.subheader("Keywords:")

              # Extract the top 10 keyphrases from the second sentence
              # (YAKE is unsupervised; a lower score means a more relevant keyphrase)
              kw_extractor = yake.KeywordExtractor(top=10, stopwords=None)
              keywords = kw_extractor.extract_keywords(sentence_2)

              for kw, score in keywords:
                     st.write(kw)





if sidebar_selectbox == "Bulk upload and mark":

       st.subheader("Bulk compare similarity of sentences")
       
       sentence_reference = st.text_input("Reference sentence input")
       
       # Only allow user to upload CSV files
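       # The CSV is expected to contain a 'Sentences' column, one candidate
       # sentence per row (illustrative example):
       #     Sentences
       #     The cat sat on the mat.
       #     A feline rested on the rug.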
       data_file = st.file_uploader("Upload CSV",type=["csv"])
       
       if data_file is not None:
              with st.spinner('Wait for it...'):
                     file_details = {"filename":data_file.name, "filetype":data_file.type, "filesize":data_file.size}
                     # st.write(file_details)
                     df = pd.read_csv(data_file)
                     
                     
                     similarity_scores = []
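
                     # Note: the reference sentence is re-encoded on every iteration;
                     # encoding all rows in one batch would be faster for large files.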
                     
                     for _, row in df.iterrows():
                            # Pair the reference sentence with the sentence from this row
                            sentence_comparison = row['Sentences']
                            sentences = [sentence_reference, sentence_comparison]
                            
                            sentence_embeddings = sentence_transformer_model.encode(sentences)
                            
                            cos_sim = cosine_similarity(sentence_embeddings[0].reshape(1, -1), sentence_embeddings[1].reshape(1, -1))[0][0]
                            cos_sim = round(cos_sim * 100)
                            
                            similarity_scores.append(cos_sim)                    
                     
                     # Append new column to dataframe
                     
                     df['Similarity (%)'] = similarity_scores
                     
                     st.dataframe(df)
              st.success('Done!')  
              
              # Cache the CSV conversion so it is not recomputed on every rerun
              @st.cache
              def convert_df(df):
                     return df.to_csv().encode('utf-8')
                     
              csv = convert_df(df)
              
              st.download_button(
                 "Press to Download",
                 csv,
                 "marked assignment.csv",
                 "text/csv",
                 key='download-csv'
              )