animay620 committed
Commit
f923391
1 Parent(s): 6055971

Upload 5 files

Files changed (5)
  1. app.py +20 -0
  2. cv.pkl +3 -0
  3. helper.py +266 -0
  4. model.pkl +3 -0
  5. requirements.txt +0 -0
app.py ADDED
@@ -0,0 +1,20 @@
+ import streamlit as st
+ import helper
+ import pickle
+ import sklearn
+ from sklearn.feature_extraction.text import CountVectorizer
+
+ model = pickle.load(open('model.pkl', 'rb'))
+ st.header("Duplicate Question Pairs")
+
+ q1 = st.text_input("Enter question 1")
+ q2 = st.text_input("Enter question 2")
+
+ if st.button("Find"):
+     query = helper.query_point_creator(q1, q2)
+     result = model.predict(query)[0]
+
+     if result:
+         st.header("Duplicate")
+     else:
+         st.header("Not Duplicate")
cv.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7990f044f2698296a6d944f70f0d10f55296eeb9544011d474748428deb2692c
+ size 543885
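
This is a Git LFS pointer file: the repository tracks only the SHA-256 and byte size, while the actual CountVectorizer pickle (543,885 bytes) lives in LFS storage. model.pkl below is stored the same way.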
helper.py ADDED
@@ -0,0 +1,266 @@
+ import pandas as pd
+ import numpy as np
+ from bs4 import BeautifulSoup
+ import re
+
+ import nltk
+ from nltk.corpus import stopwords
+ nltk.download('stopwords')
+
+ import distance
+ from fuzzywuzzy import fuzz
+ import pickle
+
+ import sklearn
+ from sklearn.feature_extraction.text import CountVectorizer
+
+ # Bag-of-words vectorizer fitted during training, shipped as cv.pkl
+ cv = pickle.load(open('cv.pkl', 'rb'))
+
+
+ def preprocess(q):
+     # Lowercase, expand symbols and contractions, strip HTML and punctuation
+     q = str(q).lower().strip()
+
+     q = q.replace('%', ' percent')
+     q = q.replace('$', ' dollar ')
+     q = q.replace('₹', ' rupee ')
+     q = q.replace('€', ' euro ')
+     q = q.replace('@', ' at ')
+
+     contractions = {
+         "ain't": "am not",
+         "aren't": "are not",
+         "can't": "can not",
+         "can't've": "can not have",
+         "'cause": "because",
+         "could've": "could have",
+         "couldn't": "could not",
+         "couldn't've": "could not have",
+         "didn't": "did not",
+         "doesn't": "does not",
+         "don't": "do not",
+         "hadn't": "had not",
+         "hadn't've": "had not have",
+         "hasn't": "has not",
+         "haven't": "have not",
+         "he'd": "he would",
+         "he'd've": "he would have",
+         "he'll": "he will",
+         "he'll've": "he will have",
+         "he's": "he is",
+         "how'd": "how did",
+         "how'd'y": "how do you",
+         "how'll": "how will",
+         "how's": "how is",
+         "i'd": "i would",
+         "i'd've": "i would have",
+         "i'll": "i will",
+         "i'll've": "i will have",
+         "i'm": "i am",
+         "i've": "i have",
+         "isn't": "is not",
+         "it'd": "it would",
+         "it'd've": "it would have",
+         "it'll": "it will",
+         "it'll've": "it will have",
+         "it's": "it is",
+         "let's": "let us",
+         "ma'am": "madam",
+         "mayn't": "may not",
+         "might've": "might have",
+         "mightn't": "might not",
+         "mightn't've": "might not have",
+         "must've": "must have",
+         "mustn't": "must not",
+         "mustn't've": "must not have",
+         "needn't": "need not",
+         "needn't've": "need not have",
+         "o'clock": "of the clock",
+         "oughtn't": "ought not",
+         "oughtn't've": "ought not have",
+         "shan't": "shall not",
+         "sha'n't": "shall not",
+         "shan't've": "shall not have",
+         "she'd": "she would",
+         "she'd've": "she would have",
+         "she'll": "she will",
+         "she'll've": "she will have",
+         "she's": "she is",
+         "should've": "should have",
+         "shouldn't": "should not",
+         "shouldn't've": "should not have",
+         "so've": "so have",
+         "so's": "so as",
+         "that'd": "that would",
+         "that'd've": "that would have",
+         "that's": "that is",
+         "there'd": "there would",
+         "there'd've": "there would have",
+         "there's": "there is",
+         "they'd": "they would",
+         "they'd've": "they would have",
+         "they'll": "they will",
+         "they'll've": "they will have",
+         "they're": "they are",
+         "they've": "they have",
+         "to've": "to have",
+         "wasn't": "was not",
+         "we'd": "we would",
+         "we'd've": "we would have",
+         "we'll": "we will",
+         "we'll've": "we will have",
+         "we're": "we are",
+         "we've": "we have",
+         "weren't": "were not",
+         "what'll": "what will",
+         "what'll've": "what will have",
+         "what're": "what are",
+         "what's": "what is",
+         "what've": "what have",
+         "when's": "when is",
+         "when've": "when have",
+         "where'd": "where did",
+         "where's": "where is",
+         "where've": "where have",
+         "who'll": "who will",
+         "who'll've": "who will have",
+         "who's": "who is",
+         "who've": "who have",
+         "why's": "why is",
+         "why've": "why have",
+         "will've": "will have",
+         "won't": "will not",
+         "won't've": "will not have",
+         "would've": "would have",
+         "wouldn't": "would not",
+         "wouldn't've": "would not have",
+         "y'all": "you all",
+         "y'all'd": "you all would",
+         "y'all'd've": "you all would have",
+         "y'all're": "you all are",
+         "y'all've": "you all have",
+         "you'd": "you would",
+         "you'd've": "you would have",
+         "you'll": "you will",
+         "you'll've": "you will have",
+         "you're": "you are",
+         "you've": "you have"
+     }
+
+     q_decontracted = []
+
+     for word in q.split():
+         if word in contractions:
+             word = contractions[word]
+
+         q_decontracted.append(word)
+
+     q = ' '.join(q_decontracted)
+     q = q.replace("'ve", " have")
+     q = q.replace("n't", " not")
+     q = q.replace("'re", " are")
+     q = q.replace("'ll", " will")
+
+     # Removing HTML tags (an explicit parser keeps BeautifulSoup
+     # deterministic and silences its "no parser specified" warning)
+     q = BeautifulSoup(q, 'html.parser')
+     q = q.get_text()
+
+     # Remove punctuation (raw string avoids the invalid escape sequence \W)
+     pattern = re.compile(r'\W')
+     q = re.sub(pattern, ' ', q).strip()
+
+     return q
+
+
+ def test_common_words(q1, q2):
+     # Number of unique words shared by the two questions
+     w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
+     w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))
+     return len(w1 & w2)
+
+
+ def test_total_words(q1, q2):
+     # Combined count of unique words across the two questions
+     w1 = set(map(lambda word: word.strip(), q1.split(" ")))
+     w2 = set(map(lambda word: word.strip(), q2.split(" ")))
+     return (len(w1) + len(w2))
+
+
+ def fetch_test_features(q1, q2):
+     # Eleven token-level features; SAFE_DIV guards against division by zero
+     SAFE_DIV = 0.00001
+     STOP_WORDS = stopwords.words('english')
+     token_features = [0.0] * 11
+
+     q1_tokens = q1.split()
+     q2_tokens = q2.split()
+
+     if len(q1_tokens) == 0 or len(q2_tokens) == 0:
+         return token_features
+
+     q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
+     q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])
+
+     q1_stop = set([word for word in q1_tokens if word in STOP_WORDS])
+     q2_stop = set([word for word in q2_tokens if word in STOP_WORDS])
+
+     common_word_cnt = len(q1_words.intersection(q2_words))
+     common_stop_cnt = len(q1_stop.intersection(q2_stop))
+     common_token_cnt = len(set(q1_tokens).intersection(set(q2_tokens)))
+
+     # Common non-stopword ratios against the smaller and larger word set
+     token_features[0] = round(common_word_cnt / (min(len(q1_words), len(q2_words)) + SAFE_DIV), 2)
+     token_features[1] = round(common_word_cnt / (max(len(q1_words), len(q2_words)) + SAFE_DIV), 2)
+
+     # Common stopword ratios
+     token_features[2] = round(common_stop_cnt / (min(len(q1_stop), len(q2_stop)) + SAFE_DIV), 2)
+     token_features[3] = round(common_stop_cnt / (max(len(q1_stop), len(q2_stop)) + SAFE_DIV), 2)
+
+     # Common token ratios
+     token_features[4] = round(common_token_cnt / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV), 2)
+     token_features[5] = round(common_token_cnt / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV), 2)
+
+     # Whether the questions end and start with the same token
+     token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
+     token_features[7] = int(q1_tokens[0] == q2_tokens[0])
+
+     # Absolute and half the signed token-count difference
+     token_features[8] = abs(len(q1_tokens) - len(q2_tokens))
+     token_features[9] = round((len(q1_tokens) - len(q2_tokens)) / 2, 2)
+
+     # Longest common substring relative to the shorter question;
+     # lcsubstrings can return an empty set, so guard the indexing
+     strs = list(distance.lcsubstrings(q1, q2))
+     if strs:
+         token_features[10] = round(len(strs[0]) / (min(len(q1), len(q2)) + 1), 2)
+     return token_features
+
+
+ def fetch_test_fuzzy_features(q1, q2):
+     fuzzy_features = [0.0] * 4
+
+     # fuzz_ratio
+     fuzzy_features[0] = fuzz.QRatio(q1, q2)
+
+     # fuzz_partial_ratio
+     fuzzy_features[1] = fuzz.partial_ratio(q1, q2)
+
+     # token_sort_ratio
+     fuzzy_features[2] = fuzz.token_sort_ratio(q1, q2)
+
+     # token_set_ratio
+     fuzzy_features[3] = fuzz.token_set_ratio(q1, q2)
+
+     return fuzzy_features
+
+
+ def query_point_creator(q1, q2):
+     # Build the (1, 22 + 2 * vocab_size) feature row the model expects
+     input_query = []
+     q1 = preprocess(q1)
+     q2 = preprocess(q2)
+
+     # Character lengths
+     input_query.append(len(q1))
+     input_query.append(len(q2))
+
+     # Word counts
+     input_query.append(len(q1.split(" ")))
+     input_query.append(len(q2.split(" ")))
+
+     # Shared words, total words, and their ratio (word share)
+     input_query.append(test_common_words(q1, q2))
+     input_query.append(test_total_words(q1, q2))
+     input_query.append(round(input_query[4] / input_query[5], 2))
+
+     token_features = fetch_test_features(q1, q2)
+     input_query.extend(token_features)
+
+     fuzzy_features = fetch_test_fuzzy_features(q1, q2)
+     input_query.extend(fuzzy_features)
+
+     # Bag-of-words vectors for both questions
+     q1_arr = cv.transform([q1]).toarray()
+     q2_arr = cv.transform([q2]).toarray()
+
+     return np.hstack((np.array(input_query).reshape(1, 22), q1_arr, q2_arr))
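
For reference, the 22-column handcrafted block that query_point_creator prepends to the two bag-of-words vectors is laid out as follows. The names below are descriptive labels chosen here for readability, not identifiers from the training code:

# Column order of the 22 handcrafted features; labels are assumed,
# not taken from the source.
FEATURE_NAMES = [
    'q1_len', 'q2_len',                  # character lengths after preprocess
    'q1_num_words', 'q2_num_words',      # whitespace word counts
    'word_common', 'word_total', 'word_share',
    'cwc_min', 'cwc_max',                # common non-stopword ratios
    'csc_min', 'csc_max',                # common stopword ratios
    'ctc_min', 'ctc_max',                # common token ratios
    'last_word_eq', 'first_word_eq',     # same last / first token
    'abs_len_diff', 'half_len_diff',     # token-count differences
    'longest_substr_ratio',
    'fuzz_ratio', 'fuzz_partial_ratio',  # fuzzywuzzy scores
    'token_sort_ratio', 'token_set_ratio',
]
assert len(FEATURE_NAMES) == 22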
model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3e09f34acdafdfb0dbe60ee9c92b82478dfdb3316dc5acfc4268f067985063b5
+ size 88637720
requirements.txt ADDED
Binary file (172 Bytes).
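
The requirements file renders as binary in this view, so its exact contents are not recoverable here. Judging from the imports in app.py and helper.py, it presumably pins at least streamlit, scikit-learn, nltk, beautifulsoup4, distance, fuzzywuzzy, numpy, and pandas.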