tushargandhi77 committed
Commit
397bbcd
1 Parent(s): 00fb394

Upload 5 files

Files changed (5)
  1. app.py +19 -0
  2. cv.pkl +3 -0
  3. helper.py +315 -0
  4. model.pkl +3 -0
  5. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,19 @@
+import streamlit as st
+import helper
+import pickle
+
+model = pickle.load(open('model.pkl', 'rb'))
+
+st.header('Duplicate Question Pairs')
+
+q1 = st.text_input('Enter question 1')
+q2 = st.text_input('Enter question 2')
+
+if st.button('Find'):
+    query = helper.query_point_creator(q1, q2)
+    result = model.predict(query)[0]
+
+    if result:
+        st.header('Duplicate')
+    else:
+        st.header('Not Duplicate')
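app.py wires the model to a Streamlit UI: it unpickles the classifier, turns the two input questions into a feature vector via helper.query_point_creator, and reports the verdict. A minimal sketch of the same prediction path without the UI, assuming model.pkl and cv.pkl sit in the working directory (the example questions are purely illustrative):

import pickle
import helper

model = pickle.load(open('model.pkl', 'rb'))

# Hypothetical inputs, for illustration only
q1 = 'How do I learn Python?'
q2 = 'What is the best way to learn Python?'

query = helper.query_point_creator(q1, q2)
print(model.predict(query)[0])  # 1 -> duplicate, 0 -> not duplicate

The app itself would be launched with `streamlit run app.py`.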
cv.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8effcafa031d5404e4c3ca10bf0d5f32bacf196771c5b625df56923336a5a6a
+size 325457
helper.py ADDED
@@ -0,0 +1,315 @@
+import re
+from bs4 import BeautifulSoup
+import distance
+from fuzzywuzzy import fuzz
+import pickle
+import numpy as np
+from nltk.corpus import stopwords
+
+cv = pickle.load(open('cv.pkl', 'rb'))
+
+
+def test_common_words(q1, q2):
+    w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
+    w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))
+    return len(w1 & w2)
+
+
+def test_total_words(q1, q2):
+    w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
+    w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))
+    return len(w1) + len(w2)
+
+
+def test_fetch_token_features(q1, q2):
+    SAFE_DIV = 0.0001
+
+    STOP_WORDS = stopwords.words("english")
+
+    token_features = [0.0] * 8
+
+    # Convert the sentences into tokens
+    q1_tokens = q1.split()
+    q2_tokens = q2.split()
+
+    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
+        return token_features
+
+    # Non-stopwords in each question
+    q1_words = set(word for word in q1_tokens if word not in STOP_WORDS)
+    q2_words = set(word for word in q2_tokens if word not in STOP_WORDS)
+
+    # Stopwords in each question
+    q1_stops = set(word for word in q1_tokens if word in STOP_WORDS)
+    q2_stops = set(word for word in q2_tokens if word in STOP_WORDS)
+
+    # Non-stopwords common to the question pair
+    common_word_count = len(q1_words.intersection(q2_words))
+
+    # Stopwords common to the question pair
+    common_stop_count = len(q1_stops.intersection(q2_stops))
+
+    # Tokens common to the question pair
+    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))
+
+    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
+    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
+    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
+    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
+    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
+    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
+
+    # Whether the last words of both questions match
+    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
+
+    # Whether the first words of both questions match
+    token_features[7] = int(q1_tokens[0] == q2_tokens[0])
+
+    return token_features
+
+
+def test_fetch_length_features(q1, q2):
+    length_features = [0.0] * 3
+
+    # Convert the sentences into tokens
+    q1_tokens = q1.split()
+    q2_tokens = q2.split()
+
+    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
+        return length_features
+
+    # Absolute difference in token counts
+    length_features[0] = abs(len(q1_tokens) - len(q2_tokens))
+
+    # Average token count of both questions
+    length_features[1] = (len(q1_tokens) + len(q2_tokens)) / 2
+
+    # Longest common substring, relative to the shorter question
+    # (guard against the empty result distance.lcsubstrings returns
+    # when the strings share no substring)
+    strs = list(distance.lcsubstrings(q1, q2))
+    if strs:
+        length_features[2] = len(strs[0]) / (min(len(q1), len(q2)) + 1)
+
+    return length_features
+
+
+def test_fetch_fuzzy_features(q1, q2):
+    fuzzy_features = [0.0] * 4
+
+    # fuzz_ratio
+    fuzzy_features[0] = fuzz.QRatio(q1, q2)
+
+    # fuzz_partial_ratio
+    fuzzy_features[1] = fuzz.partial_ratio(q1, q2)
+
+    # token_sort_ratio
+    fuzzy_features[2] = fuzz.token_sort_ratio(q1, q2)
+
+    # token_set_ratio
+    fuzzy_features[3] = fuzz.token_set_ratio(q1, q2)
+
+    return fuzzy_features
+
+
+def preprocess(q):
+    q = str(q).lower().strip()
+
+    # Replace certain special characters with their string equivalents
+    q = q.replace('%', ' percent')
+    q = q.replace('$', ' dollar ')
+    q = q.replace('₹', ' rupee ')
+    q = q.replace('€', ' euro ')
+    q = q.replace('@', ' at ')
+
+    # The pattern '[math]' appears around 900 times in the whole dataset
+    q = q.replace('[math]', '')
+
+    # Replace some numbers with string equivalents (not perfect; could
+    # be extended to cover more cases)
+    q = q.replace(',000,000,000 ', 'b ')
+    q = q.replace(',000,000 ', 'm ')
+    q = q.replace(',000 ', 'k ')
+    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
+    q = re.sub(r'([0-9]+)000000', r'\1m', q)
+    q = re.sub(r'([0-9]+)000', r'\1k', q)
+
+    # Expanding contractions
+    # https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
+    # https://stackoverflow.com/a/19794953
+    contractions = {
+        "ain't": "am not",
+        "aren't": "are not",
+        "can't": "can not",
+        "can't've": "can not have",
+        "'cause": "because",
+        "could've": "could have",
+        "couldn't": "could not",
+        "couldn't've": "could not have",
+        "didn't": "did not",
+        "doesn't": "does not",
+        "don't": "do not",
+        "hadn't": "had not",
+        "hadn't've": "had not have",
+        "hasn't": "has not",
+        "haven't": "have not",
+        "he'd": "he would",
+        "he'd've": "he would have",
+        "he'll": "he will",
+        "he'll've": "he will have",
+        "he's": "he is",
+        "how'd": "how did",
+        "how'd'y": "how do you",
+        "how'll": "how will",
+        "how's": "how is",
+        "i'd": "i would",
+        "i'd've": "i would have",
+        "i'll": "i will",
+        "i'll've": "i will have",
+        "i'm": "i am",
+        "i've": "i have",
+        "isn't": "is not",
+        "it'd": "it would",
+        "it'd've": "it would have",
+        "it'll": "it will",
+        "it'll've": "it will have",
+        "it's": "it is",
+        "let's": "let us",
+        "ma'am": "madam",
+        "mayn't": "may not",
+        "might've": "might have",
+        "mightn't": "might not",
+        "mightn't've": "might not have",
+        "must've": "must have",
+        "mustn't": "must not",
+        "mustn't've": "must not have",
+        "needn't": "need not",
+        "needn't've": "need not have",
+        "o'clock": "of the clock",
+        "oughtn't": "ought not",
+        "oughtn't've": "ought not have",
+        "shan't": "shall not",
+        "sha'n't": "shall not",
+        "shan't've": "shall not have",
+        "she'd": "she would",
+        "she'd've": "she would have",
+        "she'll": "she will",
+        "she'll've": "she will have",
+        "she's": "she is",
+        "should've": "should have",
+        "shouldn't": "should not",
+        "shouldn't've": "should not have",
+        "so've": "so have",
+        "so's": "so as",
+        "that'd": "that would",
+        "that'd've": "that would have",
+        "that's": "that is",
+        "there'd": "there would",
+        "there'd've": "there would have",
+        "there's": "there is",
+        "they'd": "they would",
+        "they'd've": "they would have",
+        "they'll": "they will",
+        "they'll've": "they will have",
+        "they're": "they are",
+        "they've": "they have",
+        "to've": "to have",
+        "wasn't": "was not",
+        "we'd": "we would",
+        "we'd've": "we would have",
+        "we'll": "we will",
+        "we'll've": "we will have",
+        "we're": "we are",
+        "we've": "we have",
+        "weren't": "were not",
+        "what'll": "what will",
+        "what'll've": "what will have",
+        "what're": "what are",
+        "what's": "what is",
+        "what've": "what have",
+        "when's": "when is",
+        "when've": "when have",
+        "where'd": "where did",
+        "where's": "where is",
+        "where've": "where have",
+        "who'll": "who will",
+        "who'll've": "who will have",
+        "who's": "who is",
+        "who've": "who have",
+        "why's": "why is",
+        "why've": "why have",
+        "will've": "will have",
+        "won't": "will not",
+        "won't've": "will not have",
+        "would've": "would have",
+        "wouldn't": "would not",
+        "wouldn't've": "would not have",
+        "y'all": "you all",
+        "y'all'd": "you all would",
+        "y'all'd've": "you all would have",
+        "y'all're": "you all are",
+        "y'all've": "you all have",
+        "you'd": "you would",
+        "you'd've": "you would have",
+        "you'll": "you will",
+        "you'll've": "you will have",
+        "you're": "you are",
+        "you've": "you have"
+    }
+
+    q_decontracted = []
+
+    for word in q.split():
+        if word in contractions:
+            word = contractions[word]
+
+        q_decontracted.append(word)
+
+    q = ' '.join(q_decontracted)
+    q = q.replace("'ve", " have")
+    q = q.replace("n't", " not")
+    q = q.replace("'re", " are")
+    q = q.replace("'ll", " will")
+
+    # Remove HTML tags (an explicit parser avoids bs4's "no parser specified" warning)
+    q = BeautifulSoup(q, "html.parser")
+    q = q.get_text()
+
+    # Remove punctuation (raw string avoids an invalid-escape warning)
+    pattern = re.compile(r'\W')
+    q = re.sub(pattern, ' ', q).strip()
+
+    return q
+
+
+def query_point_creator(q1, q2):
+    input_query = []
+
+    # Preprocess
+    q1 = preprocess(q1)
+    q2 = preprocess(q2)
+
+    # Fetch basic features
+    input_query.append(len(q1))
+    input_query.append(len(q2))
+
+    input_query.append(len(q1.split(" ")))
+    input_query.append(len(q2.split(" ")))
+
+    input_query.append(test_common_words(q1, q2))
+    input_query.append(test_total_words(q1, q2))
+    input_query.append(round(test_common_words(q1, q2) / test_total_words(q1, q2), 2))
+
+    # Fetch token features
+    token_features = test_fetch_token_features(q1, q2)
+    input_query.extend(token_features)
+
+    # Fetch length-based features
+    length_features = test_fetch_length_features(q1, q2)
+    input_query.extend(length_features)
+
+    # Fetch fuzzy features
+    fuzzy_features = test_fetch_fuzzy_features(q1, q2)
+    input_query.extend(fuzzy_features)
+
+    # Bag-of-words feature for q1
+    q1_bow = cv.transform([q1]).toarray()
+
+    # Bag-of-words feature for q2
+    q2_bow = cv.transform([q2]).toarray()
+
+    # 22 hand-crafted features (7 basic + 8 token + 3 length + 4 fuzzy),
+    # followed by both bag-of-words vectors
+    return np.hstack((np.array(input_query).reshape(1, 22), q1_bow, q2_bow))
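query_point_creator concatenates 22 hand-crafted features (7 basic, 8 token, 3 length, 4 fuzzy) with the bag-of-words vectors of both preprocessed questions, which is why the reshape is hard-coded to (1, 22). A sketch of what a caller sees, assuming cv.pkl holds a fitted scikit-learn CountVectorizer (which the cv.transform calls imply):

import helper

# Illustrative question pair
vec = helper.query_point_creator('What is machine learning?', 'What is ML?')

# 22 hand-crafted columns, then one bag-of-words block per question
print(vec.shape)  # (1, 22 + 2 * vocab_size), where vocab_size = len(cv.vocabulary_)

Whatever classifier is unpickled from model.pkl must have been trained on exactly this column layout.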
model.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5411c4c7bddaefcba1bd64586f54dbe21dbea67a16326e200cf3a661571adb10
+size 97422637
requirements.txt ADDED
@@ -0,0 +1,5 @@
+streamlit
+scikit-learn
+fuzzywuzzy
+distance
+bs4
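Two imports in helper.py are not listed here: numpy, which typically arrives as a dependency of streamlit and scikit-learn, and nltk, which does not and would need to be installed explicitly. NLTK additionally needs its stopwords corpus fetched once. A one-time setup sketch, assuming a fresh environment:

import nltk

# helper.py calls stopwords.words("english") when building token features,
# which fails until this corpus has been downloaded
nltk.download('stopwords')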