dipannitaray committed on
Commit
09c0b7e
1 Parent(s): 886bf4c

Upload 5 files

Files changed (5)
  1. app.py +64 -0
  2. cv.pkl +3 -0
  3. helper.py +323 -0
  4. images.png +0 -0
  5. model.pkl +3 -0
app.py ADDED
@@ -0,0 +1,64 @@
+ import streamlit as st
+ import helper
+ import argparse
+ import pickle
+ import base64
+
+
+ def get_base64(bin_file):
+     with open(bin_file, 'rb') as f:
+         data = f.read()
+     return base64.b64encode(data).decode()
+
+ def set_background(png_file):
+     bin_str = get_base64(png_file)
+     page_bg_img = '''
+     <style>
+     .stApp {
+         background-image: url("data:image/png;base64,%s");
+         background-size: cover;
+     }
+     </style>
+     ''' % bin_str
+     st.markdown(page_bg_img, unsafe_allow_html=True)
+
+ set_background('quorabackgr.jpg')
+
+ # Initialize argparse (optional input/output file paths)
+ parser = argparse.ArgumentParser(description="Streamlit App with Command-Line Arguments")
+ parser.add_argument('--input_file', type=str, help='Path to the input file')
+ parser.add_argument('--output_file', type=str, help='Path to the output file')
+ args = parser.parse_args()
+
+ if args.input_file:
+     input_file = args.input_file
+ else:
+     input_file = "default_input.txt"
+
+ if args.output_file:
+     output_file = args.output_file
+ else:
+     output_file = "default_output.txt"
+
+ # Add an image
+ st.image("images.png")
+
+ # Load the trained duplicate-question classifier
+ with open('model.pkl', 'rb') as f:
+     model = pickle.load(f)
+
+ st.header('Predicting if the given Questions are duplicate or not')
+
+ q1 = st.text_input('Enter the First Question')
+ q2 = st.text_input('Enter the Second Question')
+ if st.button('Predict'):
+     query = helper.query_point_creator(q1, q2)
+     result = model.predict(query)[0]
+     if result:
+         st.header('The given Questions are Duplicate')
+     else:
+         st.header('The given Questions are Not Duplicate')
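Because the model and vectorizer are plain pickles, the prediction path can also be exercised outside Streamlit. The snippet below is a minimal smoke test, assuming model.pkl and cv.pkl have already been fetched from Git LFS and that it is run from the repository root; the two example questions are arbitrary. (When running the app itself, extra flags would go after a "--" separator, e.g. streamlit run app.py -- --input_file my.txt.)

    # Minimal smoke test of the prediction pipeline outside Streamlit
    # (assumes model.pkl and cv.pkl are present in the working directory).
    import pickle
    import helper

    with open('model.pkl', 'rb') as f:
        model = pickle.load(f)

    q1 = "How do I learn Python?"            # arbitrary example questions
    q2 = "What is the best way to learn Python?"

    query = helper.query_point_creator(q1, q2)
    print("Duplicate" if model.predict(query)[0] else "Not duplicate")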
cv.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5099c22544c6d7ee6565dcfd66c3c0b1de44f0464bcf664cbe23db76b71adc52
+ size 325457
helper.py ADDED
@@ -0,0 +1,323 @@
+ import re
+ from bs4 import BeautifulSoup
+ import distance
+ from fuzzywuzzy import fuzz
+ import pickle
+ import numpy as np
+ from nltk.corpus import stopwords  # requires the NLTK stopwords corpus: nltk.download('stopwords')
+
+ # Bag-of-words vectorizer fitted during training
+ with open('cv.pkl', 'rb') as f:
+     cv = pickle.load(f)
+
+ def preprocess(q):
+     q = str(q).lower().strip()
+
+     # Replace certain special characters with their string equivalents
+     q = q.replace('%', ' percent')
+     q = q.replace('$', ' dollar ')
+     q = q.replace('₹', ' rupee ')
+     q = q.replace('€', ' euro ')
+     q = q.replace('@', ' at ')
+
+     # The pattern '[math]' appears around 900 times in the whole dataset.
+     q = q.replace('[math]', '')
+
+     # Replace some large numbers with string equivalents (not perfect, could account for more cases)
+     q = q.replace(',000,000,000 ', 'b ')
+     q = q.replace(',000,000 ', 'm ')
+     q = q.replace(',000 ', 'k ')
+     q = re.sub(r'([0-9]+)000000000', r'\1b', q)
+     q = re.sub(r'([0-9]+)000000', r'\1m', q)
+     q = re.sub(r'([0-9]+)000', r'\1k', q)
+
+     # Decontracting words
+     # https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
+     # https://stackoverflow.com/a/19794953
+     contractions = {
+         "ain't": "am not",
+         "aren't": "are not",
+         "can't": "can not",
+         "can't've": "can not have",
+         "'cause": "because",
+         "could've": "could have",
+         "couldn't": "could not",
+         "couldn't've": "could not have",
+         "didn't": "did not",
+         "doesn't": "does not",
+         "don't": "do not",
+         "hadn't": "had not",
+         "hadn't've": "had not have",
+         "hasn't": "has not",
+         "haven't": "have not",
+         "he'd": "he would",
+         "he'd've": "he would have",
+         "he'll": "he will",
+         "he'll've": "he will have",
+         "he's": "he is",
+         "how'd": "how did",
+         "how'd'y": "how do you",
+         "how'll": "how will",
+         "how's": "how is",
+         "i'd": "i would",
+         "i'd've": "i would have",
+         "i'll": "i will",
+         "i'll've": "i will have",
+         "i'm": "i am",
+         "i've": "i have",
+         "isn't": "is not",
+         "it'd": "it would",
+         "it'd've": "it would have",
+         "it'll": "it will",
+         "it'll've": "it will have",
+         "it's": "it is",
+         "let's": "let us",
+         "ma'am": "madam",
+         "mayn't": "may not",
+         "might've": "might have",
+         "mightn't": "might not",
+         "mightn't've": "might not have",
+         "must've": "must have",
+         "mustn't": "must not",
+         "mustn't've": "must not have",
+         "needn't": "need not",
+         "needn't've": "need not have",
+         "o'clock": "of the clock",
+         "oughtn't": "ought not",
+         "oughtn't've": "ought not have",
+         "shan't": "shall not",
+         "sha'n't": "shall not",
+         "shan't've": "shall not have",
+         "she'd": "she would",
+         "she'd've": "she would have",
+         "she'll": "she will",
+         "she'll've": "she will have",
+         "she's": "she is",
+         "should've": "should have",
+         "shouldn't": "should not",
+         "shouldn't've": "should not have",
+         "so've": "so have",
+         "so's": "so as",
+         "that'd": "that would",
+         "that'd've": "that would have",
+         "that's": "that is",
+         "there'd": "there would",
+         "there'd've": "there would have",
+         "there's": "there is",
+         "they'd": "they would",
+         "they'd've": "they would have",
+         "they'll": "they will",
+         "they'll've": "they will have",
+         "they're": "they are",
+         "they've": "they have",
+         "to've": "to have",
+         "wasn't": "was not",
+         "we'd": "we would",
+         "we'd've": "we would have",
+         "we'll": "we will",
+         "we'll've": "we will have",
+         "we're": "we are",
+         "we've": "we have",
+         "weren't": "were not",
+         "what'll": "what will",
+         "what'll've": "what will have",
+         "what're": "what are",
+         "what's": "what is",
+         "what've": "what have",
+         "when's": "when is",
+         "when've": "when have",
+         "where'd": "where did",
+         "where's": "where is",
+         "where've": "where have",
+         "who'll": "who will",
+         "who'll've": "who will have",
+         "who's": "who is",
+         "who've": "who have",
+         "why's": "why is",
+         "why've": "why have",
+         "will've": "will have",
+         "won't": "will not",
+         "won't've": "will not have",
+         "would've": "would have",
+         "wouldn't": "would not",
+         "wouldn't've": "would not have",
+         "y'all": "you all",
+         "y'all'd": "you all would",
+         "y'all'd've": "you all would have",
+         "y'all're": "you all are",
+         "y'all've": "you all have",
+         "you'd": "you would",
+         "you'd've": "you would have",
+         "you'll": "you will",
+         "you'll've": "you will have",
+         "you're": "you are",
+         "you've": "you have"
+     }
+
+     q_decontracted = []
+
+     for word in q.split():
+         if word in contractions:
+             word = contractions[word]
+
+         q_decontracted.append(word)
+
+     q = ' '.join(q_decontracted)
+     q = q.replace("'ve", " have")
+     q = q.replace("n't", " not")
+     q = q.replace("'re", " are")
+     q = q.replace("'ll", " will")
+
+     # Removing HTML tags
+     q = BeautifulSoup(q, "html.parser").get_text()
+
+     # Remove punctuation (every non-word character becomes a space)
+     pattern = re.compile(r'\W')
+     q = re.sub(pattern, ' ', q).strip()
+
+     return q
+
+ def common_words(row):
+     w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
+     w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))
+     return len(w1 & w2)
+
+ def test_common_words(q1, q2):
+     w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
+     w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))
+     return len(w1 & w2)
+
+ def test_total_words(q1, q2):
+     w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
+     w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))
+     return (len(w1) + len(w2))
+
+
+ def test_fetch_token_features(q1, q2):
+     SAFE_DIV = 0.0001
+
+     stop_words = stopwords.words("english")
+
+     token_features = [0.0] * 8
+
+     # Converting the sentences into tokens
+     q1_tokens = q1.split()
+     q2_tokens = q2.split()
+
+     if len(q1_tokens) == 0 or len(q2_tokens) == 0:
+         return token_features
+
+     # Get the non-stopwords in both questions
+     q1_words = set([word for word in q1_tokens if word not in stop_words])
+     q2_words = set([word for word in q2_tokens if word not in stop_words])
+
+     # Get the stopwords in both questions
+     q1_stops = set([word for word in q1_tokens if word in stop_words])
+     q2_stops = set([word for word in q2_tokens if word in stop_words])
+
+     # Get the common non-stopwords from the question pair
+     common_word_count = len(q1_words.intersection(q2_words))
+
+     # Get the common stopwords from the question pair
+     common_stop_count = len(q1_stops.intersection(q2_stops))
+
+     # Get the common tokens from the question pair
+     common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))
+
+     token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
+     token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
+     token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
+     token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
+     token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
+     token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
+
+     # Whether the last word of both questions is the same
+     token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
+
+     # Whether the first word of both questions is the same
+     token_features[7] = int(q1_tokens[0] == q2_tokens[0])
+
+     return token_features
+
+
+ def test_fetch_length_features(q1, q2):
+     length_features = [0.0] * 3
+
+     # Converting the sentences into tokens
+     q1_tokens = q1.split()
+     q2_tokens = q2.split()
+
+     if len(q1_tokens) == 0 or len(q2_tokens) == 0:
+         return length_features
+
+     # Absolute difference in token counts
+     length_features[0] = abs(len(q1_tokens) - len(q2_tokens))
+
+     # Average token length of both questions
+     length_features[1] = (len(q1_tokens) + len(q2_tokens)) / 2
+
+     # Ratio of the longest common substring to the shorter question
+     strs = list(distance.lcsubstrings(q1, q2))
+     if strs:
+         length_features[2] = len(strs[0]) / (min(len(q1), len(q2)) + 1)
+
+     return length_features
+
+
+
+ def test_fetch_fuzzy_features(q1, q2):
+     fuzzy_features = [0.0] * 4
+
+     # fuzz_ratio
+     fuzzy_features[0] = fuzz.QRatio(q1, q2)
+
+     # fuzz_partial_ratio
+     fuzzy_features[1] = fuzz.partial_ratio(q1, q2)
+
+     # token_sort_ratio
+     fuzzy_features[2] = fuzz.token_sort_ratio(q1, q2)
+
+     # token_set_ratio
+     fuzzy_features[3] = fuzz.token_set_ratio(q1, q2)
+
+     return fuzzy_features
+
+
+
+ def query_point_creator(q1, q2):
+     input_query = []
+
+     # preprocess
+     q1 = preprocess(q1)
+     q2 = preprocess(q2)
+
+     # fetch basic features
+     input_query.append(len(q1))
+     input_query.append(len(q2))
+
+     input_query.append(len(q1.split(" ")))
+     input_query.append(len(q2.split(" ")))
+
+     input_query.append(test_common_words(q1, q2))
+     input_query.append(test_total_words(q1, q2))
+     input_query.append(round(test_common_words(q1, q2) / test_total_words(q1, q2), 2))
+
+     # fetch token features
+     token_features = test_fetch_token_features(q1, q2)
+     input_query.extend(token_features)
+
+     # fetch length based features
+     length_features = test_fetch_length_features(q1, q2)
+     input_query.extend(length_features)
+
+     # fetch fuzzy features
+     fuzzy_features = test_fetch_fuzzy_features(q1, q2)
+     input_query.extend(fuzzy_features)
+
+     # bag-of-words features for q1 and q2
+     q1_bow = cv.transform([q1]).toarray()
+     q2_bow = cv.transform([q2]).toarray()
+
+     # 22 handcrafted features followed by the two bag-of-words vectors
+     return np.hstack((np.array(input_query).reshape(1, 22), q1_bow, q2_bow))
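query_point_creator stacks 22 handcrafted columns (two character lengths, two word counts, common-word count, total-word count and their ratio, eight token features, three length features, four fuzzy features) in front of the two bag-of-words vectors produced by cv. A quick way to confirm that layout is sketched below, assuming cv.pkl has been fetched from Git LFS and using arbitrary example questions:

    # Sanity check of the feature-vector layout assembled by query_point_creator
    # (22 handcrafted columns followed by two bag-of-words blocks).
    from helper import query_point_creator, cv

    vec = query_point_creator("how do i learn python", "what is the best way to learn python")
    vocab_size = cv.transform(["placeholder"]).shape[1]   # bag-of-words width per question
    assert vec.shape == (1, 22 + 2 * vocab_size)
    print(vec.shape)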
images.png ADDED
model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:283f469c588633a1cc940454c30551b5e7ea67585e3af3931f88b2473fa7063c
+ size 97213057
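Both cv.pkl and model.pkl are committed as Git LFS pointers, so a clone made without LFS support leaves small text stubs behind that pickle cannot load. An optional guard, using the sizes recorded in the pointers above, could look like this:

    # Optional check: verify the LFS-tracked pickles were actually downloaded
    # before app.py tries to unpickle them (sizes taken from the LFS pointers).
    import os

    for path, expected_size in [('cv.pkl', 325457), ('model.pkl', 97213057)]:
        actual = os.path.getsize(path)
        if actual != expected_size:
            print(f"{path}: {actual} bytes - looks like an LFS pointer stub; run `git lfs pull`")
        else:
            print(f"{path}: ok")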