Kuaaangwen committed
Commit 917d2f9
1 Parent(s): 411304b

Update app.py

Files changed (1): app.py (+74 -14)
app.py CHANGED
@@ -1,21 +1,30 @@
- ''' To-do
-
- Create a side bar to compare two or upload CSV
-
- In the second tab, allow them to compare all CSV files
-
- '''
-
- import streamlit as st
- import pandas as pd
- from sentence_transformers import SentenceTransformer
- from sklearn.metrics.pairwise import cosine_similarity
-
- model = SentenceTransformer('paraphrase-xlm-r-multilingual-v1')
-
- # Streamlit interface
+ import streamlit as st
+
+ # Library for Sentence Similarity
+ import pandas as pd
+ from sentence_transformers import SentenceTransformer
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ # Library for Entailment
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ import torch
+
+ # Library for keyword extraction
+ import yake
+
+ # Load models and tokenisers for both sentence transformers and text classification
+ sentence_transformer_model = SentenceTransformer('all-MiniLM-L6-v2')
+
+ tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
+ text_classification_model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli")
+
+ ### Streamlit interface ###
 
  st.title("Sentence Similarity")
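Note, not part of the commit: the indices 0, 1, 2 used further down with `text_classification_model.config.id2label[...]` depend on the label order of the `roberta-large-mnli` checkpoint. A minimal sketch to confirm that mapping before trusting the positional indexing (assumes the model files can be downloaded):

```python
from transformers import AutoModelForSequenceClassification

# Illustrative check only: print the label order that the positional
# indexing outputs[0][0], outputs[0][1], outputs[0][2] in the app relies on.
nli_model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli")
print(nli_model.config.id2label)
# Typically {0: 'CONTRADICTION', 1: 'NEUTRAL', 2: 'ENTAILMENT'} for this checkpoint.
```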
 
@@ -40,6 +49,10 @@ if sidebar_selectbox == "Compare two sentences":
 
  # If submit_button_compare clicked
  if submit_button_compare:
+
+ print("Comparing sentences...")
+
+ ### Compare Sentence Similarity ###
 
  # Perform calculations
 
@@ -51,14 +64,61 @@ if sidebar_selectbox == "Compare two sentences":
  sentences.append(sentence_2)
 
  # Create embeddings for both sentences
- sentence_embeddings = model.encode(sentences)
+ sentence_embeddings = sentence_transformer_model.encode(sentences)
 
  cos_sim = cosine_similarity(sentence_embeddings[0].reshape(1, -1), sentence_embeddings[1].reshape(1, -1))[0][0]
  cos_sim = round(cos_sim * 100) # Convert to percentage and round-off
 
- st.write('Similarity between {} and {} is {}%'.format(sentence_1,
-                                                        sentence_2, cos_sim))
+ # st.write('Similarity between "{}" and "{}" is {}%'.format(sentence_1,
+ #                                                            sentence_2, cos_sim))
+
+ st.subheader("Similarity")
+ st.write(f"Similarity between the two sentences is {cos_sim}%.")
+
+ ### Text classification - entailment, neutral or contradiction ###
+
+ raw_inputs = [f"{sentence_1}</s></s>{sentence_2}"]
+
+ inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
+
+ # print(inputs)
+
+ outputs = text_classification_model(**inputs)
+
+ outputs = torch.nn.functional.softmax(outputs.logits, dim=-1)
+ # print(outputs)
+
+ # argmax_index = torch.argmax(outputs).item()
+
+ print(text_classification_model.config.id2label[0], ":", round(outputs[0][0].item()*100, 2), "%")
+ print(text_classification_model.config.id2label[1], ":", round(outputs[0][1].item()*100, 2), "%")
+ print(text_classification_model.config.id2label[2], ":", round(outputs[0][2].item()*100, 2), "%")
+
+ st.subheader("Text classification for both sentences:")
+
+ st.write(text_classification_model.config.id2label[1], ":", round(outputs[0][1].item()*100, 2), "%")
+ st.write(text_classification_model.config.id2label[0], ":", round(outputs[0][0].item()*100, 2), "%")
+ st.write(text_classification_model.config.id2label[2], ":", round(outputs[0][2].item()*100, 2), "%")
+
+ ### Extract keywords with YAKE ### (might make more sense with word cloud)
+
+ st.subheader("Keywords:")
+
+ kw_extractor = yake.KeywordExtractor(top=10, stopwords=None)
+ keywords = kw_extractor.extract_keywords(sentence_2)
+
+ # keywords_array = []
+
+ for kw, v in keywords:
+     # print("Keyphrase: ", kw, ": score", v)
+     # keywords_array.append(kw)
+     st.write(kw)
 
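To try the pieces this hunk adds outside Streamlit, the same flow can be condensed into a standalone sketch (embedding similarity, MNLI entailment, YAKE keywords). The two hard-coded sentences and the variable names below are illustrative placeholders, not part of the commit:

```python
import torch
import yake
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSequenceClassification

sentence_1 = "The cat sat on the mat."      # placeholder input
sentence_2 = "A cat is sitting on a mat."   # placeholder input

# 1. Sentence similarity: embed both sentences, compare with cosine similarity
st_model = SentenceTransformer("all-MiniLM-L6-v2")
emb = st_model.encode([sentence_1, sentence_2])
cos_sim = cosine_similarity(emb[0].reshape(1, -1), emb[1].reshape(1, -1))[0][0]
print(f"Similarity: {round(cos_sim * 100)}%")

# 2. Entailment / neutral / contradiction with roberta-large-mnli
tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
nli_model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli")
inputs = tokenizer([f"{sentence_1}</s></s>{sentence_2}"],
                   padding=True, truncation=True, return_tensors="pt")
probs = torch.nn.functional.softmax(nli_model(**inputs).logits, dim=-1)
for i, label in nli_model.config.id2label.items():
    print(label, ":", round(probs[0][i].item() * 100, 2), "%")

# 3. Keyword extraction with YAKE (lower score = more relevant keyword)
kw_extractor = yake.KeywordExtractor(top=10, stopwords=None)
for kw, score in kw_extractor.extract_keywords(sentence_2):
    print("Keyword:", kw)
```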
 
@@ -93,7 +153,7 @@ if sidebar_selectbox == "Bulk upload and mark":
  sentences.append(sentence_reference)
  sentences.append(sentence_comparison)
 
- sentence_embeddings = model.encode(sentences)
+ sentence_embeddings = sentence_transformer_model.encode(sentences)
 
  cos_sim = cosine_similarity(sentence_embeddings[0].reshape(1, -1), sentence_embeddings[1].reshape(1, -1))[0][0]
  cos_sim = round(cos_sim * 100)
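The bulk path touched by this last hunk applies the same encode-and-compare step to each reference/comparison pair. A rough sketch of how that loop might run over an uploaded CSV; the file name and column names are assumptions for illustration, since the commit does not show them:

```python
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

sentence_transformer_model = SentenceTransformer("all-MiniLM-L6-v2")

# "answers.csv", "reference" and "answer" are illustrative names only.
df = pd.read_csv("answers.csv")
scores = []
for _, row in df.iterrows():
    emb = sentence_transformer_model.encode([row["reference"], row["answer"]])
    cos_sim = cosine_similarity(emb[0].reshape(1, -1), emb[1].reshape(1, -1))[0][0]
    scores.append(round(cos_sim * 100))  # percentage, as in the app

df["similarity_pct"] = scores
print(df.head())
```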