Commit 8821593
Shrikrishna committed
1 Parent(s): a4457d8

Upload 5 files

Files changed (5)
  1. app.py +21 -0
  2. cv.pkl +3 -0
  3. helper.py +314 -0
  4. model.pkl +3 -0
  5. stopwords.pkl +3 -0
app.py ADDED
@@ -0,0 +1,21 @@
+ import streamlit as st
+ import helper
+ import pickle
+
+ # Load the trained duplicate-question classifier
+ model = pickle.load(open('model.pkl', 'rb'))
+
+ st.header('Duplicate Question Pairs')
+
+ q1 = st.text_input('Enter question 1')
+ q2 = st.text_input('Enter question 2')
+
+ if st.button('Find'):
+     # Build the combined feature vector for the pair and classify it
+     query = helper.query_point_creator(q1, q2)
+     result = model.predict(query)[0]
+
+     if result:
+         st.header('Duplicate')
+     else:
+         st.header('Not Duplicate')
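The app expects model.pkl next to app.py and delegates all feature construction to helper.query_point_creator, so the same pipeline can be exercised without the Streamlit UI. A minimal sketch, assuming the LFS-tracked pickles below have been fetched as real files; the example questions are placeholders:

    import pickle

    import helper  # unpickles cv.pkl at import time

    model = pickle.load(open('model.pkl', 'rb'))
    query = helper.query_point_creator('How do I learn Python?',
                                       'What is the best way to learn Python?')
    print('Duplicate' if model.predict(query)[0] else 'Not Duplicate')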
cv.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b0dd431bc16109e00aec45975ec9fa7a4defd28476947fdd181141c49c12c9e2
+ size 901204
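helper.py calls cv.transform([...]).toarray() on each question, so cv.pkl holds a fitted text vectorizer, presumably a scikit-learn CountVectorizer given the name (an assumption; the commit itself does not say). A minimal sketch of that usage once the real file is fetched:

    import pickle

    cv = pickle.load(open('cv.pkl', 'rb'))  # fitted bag-of-words vectorizer
    bow = cv.transform(['how do i learn python']).toarray()
    print(bow.shape)  # (1, n_vocab); n_vocab is fixed by the fitted vocabulary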
helper.py ADDED
@@ -0,0 +1,314 @@
+ import re
+ from bs4 import BeautifulSoup
+ import distance
+ from fuzzywuzzy import fuzz
+ import pickle
+ import numpy as np
+
+ cv = pickle.load(open('cv.pkl', 'rb'))  # fitted bag-of-words vectorizer
+
+
+ def test_common_words(q1, q2):
+     w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
+     w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))
+     return len(w1 & w2)
+
+ def test_total_words(q1, q2):
+     w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
+     w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))
+     return (len(w1) + len(w2))
+
+
+ def test_fetch_token_features(q1, q2):
+     SAFE_DIV = 0.0001
+
+     STOP_WORDS = pickle.load(open('stopwords.pkl', 'rb'))
+
+     token_features = [0.0] * 8
+
+     # Split the sentences into tokens
+     q1_tokens = q1.split()
+     q2_tokens = q2.split()
+
+     if len(q1_tokens) == 0 or len(q2_tokens) == 0:
+         return token_features
+
+     # Get the non-stopwords in the questions
+     q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
+     q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])
+
+     # Get the stopwords in the questions
+     q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
+     q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])
+
+     # Count the non-stopwords common to the question pair
+     common_word_count = len(q1_words.intersection(q2_words))
+
+     # Count the stopwords common to the question pair
+     common_stop_count = len(q1_stops.intersection(q2_stops))
+
+     # Count the tokens common to the question pair
+     common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))
+
+     token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
+     token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
+     token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
+     token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
+     token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
+     token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
+
+     # Whether the last word of both questions is the same
+     token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
+
+     # Whether the first word of both questions is the same
+     token_features[7] = int(q1_tokens[0] == q2_tokens[0])
+
+     return token_features
+
+
+ def test_fetch_length_features(q1, q2):
+     length_features = [0.0] * 3
+
+     # Split the sentences into tokens
+     q1_tokens = q1.split()
+     q2_tokens = q2.split()
+
+     if len(q1_tokens) == 0 or len(q2_tokens) == 0:
+         return length_features
+
+     # Absolute difference in token counts
+     length_features[0] = abs(len(q1_tokens) - len(q2_tokens))
+
+     # Mean number of tokens across both questions
+     length_features[1] = (len(q1_tokens) + len(q2_tokens)) / 2
+
+     # Longest common substring ratio (guard against pairs with no common substring)
+     strs = list(distance.lcsubstrings(q1, q2))
+     length_features[2] = len(strs[0]) / (min(len(q1), len(q2)) + 1) if strs else 0.0
+
+     return length_features
+
+
+ def test_fetch_fuzzy_features(q1, q2):
+     fuzzy_features = [0.0] * 4
+
+     # fuzz_ratio
+     fuzzy_features[0] = fuzz.QRatio(q1, q2)
+
+     # fuzz_partial_ratio
+     fuzzy_features[1] = fuzz.partial_ratio(q1, q2)
+
+     # token_sort_ratio
+     fuzzy_features[2] = fuzz.token_sort_ratio(q1, q2)
+
+     # token_set_ratio
+     fuzzy_features[3] = fuzz.token_set_ratio(q1, q2)
+
+     return fuzzy_features
+
+
+ def preprocess(q):
+     q = str(q).lower().strip()
+
+     # Replace certain special characters with their string equivalents
+     q = q.replace('%', ' percent')
+     q = q.replace('$', ' dollar ')
+     q = q.replace('₹', ' rupee ')
+     q = q.replace('€', ' euro ')
+     q = q.replace('@', ' at ')
+
+     # The pattern '[math]' appears around 900 times in the whole dataset.
+     q = q.replace('[math]', '')
+
+     # Replace some numbers with string equivalents (not perfect; could cover more cases)
+     q = q.replace(',000,000,000 ', 'b ')
+     q = q.replace(',000,000 ', 'm ')
+     q = q.replace(',000 ', 'k ')
+     q = re.sub(r'([0-9]+)000000000', r'\1b', q)
+     q = re.sub(r'([0-9]+)000000', r'\1m', q)
+     q = re.sub(r'([0-9]+)000', r'\1k', q)
+
+     # Decontracting words
+     # https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
+     # https://stackoverflow.com/a/19794953
+     contractions = {
+         "ain't": "am not",
+         "aren't": "are not",
+         "can't": "can not",
+         "can't've": "can not have",
+         "'cause": "because",
+         "could've": "could have",
+         "couldn't": "could not",
+         "couldn't've": "could not have",
+         "didn't": "did not",
+         "doesn't": "does not",
+         "don't": "do not",
+         "hadn't": "had not",
+         "hadn't've": "had not have",
+         "hasn't": "has not",
+         "haven't": "have not",
+         "he'd": "he would",
+         "he'd've": "he would have",
+         "he'll": "he will",
+         "he'll've": "he will have",
+         "he's": "he is",
+         "how'd": "how did",
+         "how'd'y": "how do you",
+         "how'll": "how will",
+         "how's": "how is",
+         "i'd": "i would",
+         "i'd've": "i would have",
+         "i'll": "i will",
+         "i'll've": "i will have",
+         "i'm": "i am",
+         "i've": "i have",
+         "isn't": "is not",
+         "it'd": "it would",
+         "it'd've": "it would have",
+         "it'll": "it will",
+         "it'll've": "it will have",
+         "it's": "it is",
+         "let's": "let us",
+         "ma'am": "madam",
+         "mayn't": "may not",
+         "might've": "might have",
+         "mightn't": "might not",
+         "mightn't've": "might not have",
+         "must've": "must have",
+         "mustn't": "must not",
+         "mustn't've": "must not have",
+         "needn't": "need not",
+         "needn't've": "need not have",
+         "o'clock": "of the clock",
+         "oughtn't": "ought not",
+         "oughtn't've": "ought not have",
+         "shan't": "shall not",
+         "sha'n't": "shall not",
+         "shan't've": "shall not have",
+         "she'd": "she would",
+         "she'd've": "she would have",
+         "she'll": "she will",
+         "she'll've": "she will have",
+         "she's": "she is",
+         "should've": "should have",
+         "shouldn't": "should not",
+         "shouldn't've": "should not have",
+         "so've": "so have",
+         "so's": "so as",
+         "that'd": "that would",
+         "that'd've": "that would have",
+         "that's": "that is",
+         "there'd": "there would",
+         "there'd've": "there would have",
+         "there's": "there is",
+         "they'd": "they would",
+         "they'd've": "they would have",
+         "they'll": "they will",
+         "they'll've": "they will have",
+         "they're": "they are",
+         "they've": "they have",
+         "to've": "to have",
+         "wasn't": "was not",
+         "we'd": "we would",
+         "we'd've": "we would have",
+         "we'll": "we will",
+         "we'll've": "we will have",
+         "we're": "we are",
+         "we've": "we have",
+         "weren't": "were not",
+         "what'll": "what will",
+         "what'll've": "what will have",
+         "what're": "what are",
+         "what's": "what is",
+         "what've": "what have",
+         "when's": "when is",
+         "when've": "when have",
+         "where'd": "where did",
+         "where's": "where is",
+         "where've": "where have",
+         "who'll": "who will",
+         "who'll've": "who will have",
+         "who's": "who is",
+         "who've": "who have",
+         "why's": "why is",
+         "why've": "why have",
+         "will've": "will have",
+         "won't": "will not",
+         "won't've": "will not have",
+         "would've": "would have",
+         "wouldn't": "would not",
+         "wouldn't've": "would not have",
+         "y'all": "you all",
+         "y'all'd": "you all would",
+         "y'all'd've": "you all would have",
+         "y'all're": "you all are",
+         "y'all've": "you all have",
+         "you'd": "you would",
+         "you'd've": "you would have",
+         "you'll": "you will",
+         "you'll've": "you will have",
+         "you're": "you are",
+         "you've": "you have"
+     }
+
+     q_decontracted = []
+
+     for word in q.split():
+         if word in contractions:
+             word = contractions[word]
+
+         q_decontracted.append(word)
+
+     q = ' '.join(q_decontracted)
+     q = q.replace("'ve", " have")
+     q = q.replace("n't", " not")
+     q = q.replace("'re", " are")
+     q = q.replace("'ll", " will")
+
+     # Remove HTML tags
+     q = BeautifulSoup(q, 'html.parser')
+     q = q.get_text()
+
+     # Remove punctuation
+     pattern = re.compile(r'\W')
+     q = re.sub(pattern, ' ', q).strip()
+
+     return q
+
+
+ def query_point_creator(q1, q2):
+     input_query = []
+
+     # preprocess
+     q1 = preprocess(q1)
+     q2 = preprocess(q2)
+
+     # fetch basic features
+     input_query.append(len(q1))
+     input_query.append(len(q2))
+
+     input_query.append(len(q1.split(" ")))
+     input_query.append(len(q2.split(" ")))
+
+     input_query.append(test_common_words(q1, q2))
+     input_query.append(test_total_words(q1, q2))
+     input_query.append(round(test_common_words(q1, q2) / test_total_words(q1, q2), 2))
+
+     # fetch token features
+     token_features = test_fetch_token_features(q1, q2)
+     input_query.extend(token_features)
+
+     # fetch length based features
+     length_features = test_fetch_length_features(q1, q2)
+     input_query.extend(length_features)
+
+     # fetch fuzzy features
+     fuzzy_features = test_fetch_fuzzy_features(q1, q2)
+     input_query.extend(fuzzy_features)
+
+     # bow feature for q1
+     q1_bow = cv.transform([q1]).toarray()
+
+     # bow feature for q2
+     q2_bow = cv.transform([q2]).toarray()
+
+     return np.hstack((np.array(input_query).reshape(1, 22), q1_bow, q2_bow))
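query_point_creator concatenates 7 basic features, 8 token features, 3 length features, and 4 fuzzy features (7 + 8 + 3 + 4 = 22, which is why the handcrafted part is reshaped to (1, 22)), then stacks one bag-of-words vector per question after them. A minimal shape check, with n_vocab standing for the vocabulary size of cv.pkl, which is not recoverable from this commit:

    import helper

    q = helper.query_point_creator('Is this a duplicate question?',
                                   'Is this question a duplicate?')
    print(q.shape)  # (1, 22 + 2 * n_vocab): handcrafted features plus two BoW rows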
model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1e23f83ef4f692fbc43e33fe6f67e6241ce33c519e8a3baed061197220b51fd1
+ size 280451245
stopwords.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:787b4474df155880cb742e3b470c075f61e8e35588e2778022f9af6bd3232651
+ size 2018
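All three .pkl files in this commit are Git LFS pointers, so the text above is what a plain download returns; the pickle.load calls in app.py and helper.py need the real binaries, fetched with Git LFS (e.g. git lfs pull in a clone of the repo). A minimal sketch inspecting the fetched stopword container; its concrete type is an assumption here, since it only needs to support the `in` checks in test_fetch_token_features:

    import pickle

    STOP_WORDS = pickle.load(open('stopwords.pkl', 'rb'))
    print(type(STOP_WORDS), len(STOP_WORDS))  # e.g. a set or list of English stopwords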