Spaces:
Sleeping
Sleeping
first commit
Browse files- .ipynb_checkpoints/Spam_Filter-checkpoint.ipynb +307 -0
- Spam_Filter.ipynb +307 -0
- app.py +189 -0
- requirements.txt +2 -0
- spam_db.csv +0 -0
- test.py +86 -0
- testing.py +86 -0
.ipynb_checkpoints/Spam_Filter-checkpoint.ipynb
ADDED
@@ -0,0 +1,307 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 40,
|
6 |
+
"id": "b9bb9dcd",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [
|
9 |
+
{
|
10 |
+
"name": "stderr",
|
11 |
+
"output_type": "stream",
|
12 |
+
"text": [
|
13 |
+
"[nltk_data] Downloading package averaged_perceptron_tagger to\n",
|
14 |
+
"[nltk_data] C:\\Users\\shiva\\AppData\\Roaming\\nltk_data...\n",
|
15 |
+
"[nltk_data] Package averaged_perceptron_tagger is already up-to-\n",
|
16 |
+
"[nltk_data] date!\n",
|
17 |
+
"[nltk_data] Downloading package punkt to\n",
|
18 |
+
"[nltk_data] C:\\Users\\shiva\\AppData\\Roaming\\nltk_data...\n",
|
19 |
+
"[nltk_data] Package punkt is already up-to-date!\n"
|
20 |
+
]
|
21 |
+
}
|
22 |
+
],
|
23 |
+
"source": [
|
24 |
+
"import nltk\n",
|
25 |
+
"nltk.download(\"averaged_perceptron_tagger\")\n",
|
26 |
+
"nltk.download(\"punkt\")\n",
|
27 |
+
"from nltk.tokenize import word_tokenize\n",
|
28 |
+
"import pandas as pd\n",
|
29 |
+
"import csv\n",
|
30 |
+
"import numpy as np\n",
|
31 |
+
"from sklearn import preprocessing , svm , model_selection, metrics\n",
|
32 |
+
"from sklearn.preprocessing import MinMaxScaler"
|
33 |
+
]
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"cell_type": "code",
|
37 |
+
"execution_count": 26,
|
38 |
+
"id": "c1e93bc2",
|
39 |
+
"metadata": {},
|
40 |
+
"outputs": [
|
41 |
+
{
|
42 |
+
"name": "stdout",
|
43 |
+
"output_type": "stream",
|
44 |
+
"text": [
|
45 |
+
" msg label label_no \\\n",
|
46 |
+
"0 Go until jurong point, crazy.. Available only ... ham 0 \n",
|
47 |
+
"1 Ok lar... Joking wif u oni... ham 0 \n",
|
48 |
+
"2 Free entry in 2 a wkly comp to win FA Cup fina... spam 1 \n",
|
49 |
+
"3 U dun say so early hor... U c already then say... ham 0 \n",
|
50 |
+
"4 Nah I don't think he goes to usf, he lives aro... ham 0 \n",
|
51 |
+
"\n",
|
52 |
+
" NNP IN JJ NN , RB : ... \\\n",
|
53 |
+
"0 0.071429 0.1250 0.142857 0.152174 0.076923 0.176471 0.054054 ... \n",
|
54 |
+
"1 0.047619 0.0000 0.047619 0.043478 0.000000 0.000000 0.054054 ... \n",
|
55 |
+
"2 0.142857 0.0625 0.190476 0.152174 0.000000 0.000000 0.000000 ... \n",
|
56 |
+
"3 0.023810 0.0000 0.095238 0.021739 0.000000 0.176471 0.054054 ... \n",
|
57 |
+
"4 0.023810 0.0625 0.000000 0.000000 0.076923 0.176471 0.000000 ... \n",
|
58 |
+
"\n",
|
59 |
+
" MD PRP$ JJR JJS UH RP WP WDT # '' \n",
|
60 |
+
"0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
61 |
+
"1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
62 |
+
"2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
63 |
+
"3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
64 |
+
"4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
65 |
+
"\n",
|
66 |
+
"[5 rows x 38 columns]\n"
|
67 |
+
]
|
68 |
+
}
|
69 |
+
],
|
70 |
+
"source": [
|
71 |
+
"tok_dict={}\n",
|
72 |
+
"\n",
|
73 |
+
"lst=['NNP', 'IN', 'JJ', 'NN', ',', 'RB', ':', 'EX', 'VBD', 'WRB', 'CD', 'DT', 'TO', 'VB', '.',\n",
|
74 |
+
" '(', ')', 'CC', 'POS', 'VBP', 'NNS', 'PRP', 'VBZ', 'VBG', 'VBN', 'MD', 'PRP$', 'JJR', 'JJS', 'UH', 'RP', 'WP', 'WDT', '#', \"''\"]\n",
|
75 |
+
"\n",
|
76 |
+
"pd_dict={'msg':[],'label':[],'label_no':[],'NNP':[], 'IN':[], 'JJ':[], 'NN':[], ',':[], 'RB':[], ':':[], 'EX':[], 'VBD':[], \n",
|
77 |
+
"'WRB':[], 'CD':[], 'DT':[], 'TO':[], 'VB':[], '.':[], '(':[], ')':[], 'CC':[],\n",
|
78 |
+
"'POS':[], 'VBP':[], 'NNS':[], 'PRP':[], 'VBZ':[], 'VBG':[], 'VBN':[], 'MD':[], \n",
|
79 |
+
"'PRP$':[], 'JJR':[], 'JJS':[], 'UH':[], 'RP':[], 'WP':[], 'WDT':[], '#':[], \"''\":[]}\n",
|
80 |
+
"\n",
|
81 |
+
"with open(\"spam_db.csv\", 'r') as file:\n",
|
82 |
+
" csvreader = csv.reader(file)\n",
|
83 |
+
" j=0\n",
|
84 |
+
" k=0\n",
|
85 |
+
" for row in csvreader:\n",
|
86 |
+
" if j==0:\n",
|
87 |
+
" j=1\n",
|
88 |
+
" continue\n",
|
89 |
+
" pd_dict['msg'].append(row[1])\n",
|
90 |
+
" pd_dict['label'].append(row[0])\n",
|
91 |
+
" if row[0]=='spam':\n",
|
92 |
+
" pd_dict['label_no'].append(1)\n",
|
93 |
+
" else:\n",
|
94 |
+
" pd_dict['label_no'].append(0)\n",
|
95 |
+
" for label in lst:\n",
|
96 |
+
" pd_dict[label].append(0)\n",
|
97 |
+
" text=row[1]\n",
|
98 |
+
" tokens=word_tokenize(text)\n",
|
99 |
+
" tokens_tagged=nltk.pos_tag(tokens)\n",
|
100 |
+
" for i in tokens_tagged:\n",
|
101 |
+
" if i[1] in tok_dict:\n",
|
102 |
+
" tok_dict[i[1]].append(i[0])\n",
|
103 |
+
" else:\n",
|
104 |
+
" tok_dict[i[1]]=[i[0]]\n",
|
105 |
+
" if i[1] in pd_dict:\n",
|
106 |
+
" pd_dict[i[1]][k]+=1\n",
|
107 |
+
" k+=1\n",
|
108 |
+
" \n",
|
109 |
+
"tok_dict1={}\n",
|
110 |
+
"for i in tok_dict:\n",
|
111 |
+
" tok_dict1[i]=len(tok_dict[i])\n",
|
112 |
+
"\n",
|
113 |
+
"del_lst=[]\n",
|
114 |
+
"for i in tok_dict1:\n",
|
115 |
+
" if tok_dict1[i]<100:\n",
|
116 |
+
" del_lst.append(i)\n",
|
117 |
+
"\n",
|
118 |
+
"for i in del_lst:\n",
|
119 |
+
" tok_dict1.pop(i)\n",
|
120 |
+
"\n",
|
121 |
+
"lst=[]\n",
|
122 |
+
"for i in tok_dict1:\n",
|
123 |
+
" lst.append(i)\n",
|
124 |
+
"\n",
|
125 |
+
"df=pd.DataFrame(pd_dict)\n",
|
126 |
+
"numeric_columns = df.drop(['msg', 'label', 'label_no'], axis=1).columns\n",
|
127 |
+
"\n",
|
128 |
+
"# Create the MinMaxScaler object\n",
|
129 |
+
"scaler = MinMaxScaler()\n",
|
130 |
+
"\n",
|
131 |
+
"# Normalize the numeric columns using min-max normalization\n",
|
132 |
+
"df[numeric_columns] = scaler.fit_transform(df[numeric_columns])\n",
|
133 |
+
"\n",
|
134 |
+
"print(df.head())"
|
135 |
+
]
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"cell_type": "code",
|
139 |
+
"execution_count": 27,
|
140 |
+
"id": "35824c58",
|
141 |
+
"metadata": {},
|
142 |
+
"outputs": [
|
143 |
+
{
|
144 |
+
"name": "stderr",
|
145 |
+
"output_type": "stream",
|
146 |
+
"text": [
|
147 |
+
"C:\\Users\\shiva\\AppData\\Local\\Temp\\ipykernel_9568\\3238635716.py:1: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.\n",
|
148 |
+
" X=np.array(df.drop(['msg','label','label_no'],1))\n"
|
149 |
+
]
|
150 |
+
}
|
151 |
+
],
|
152 |
+
"source": [
|
153 |
+
"X=np.array(df.drop(['msg','label','label_no'],1))\n",
|
154 |
+
"y=np.array(df['label_no'])"
|
155 |
+
]
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"cell_type": "code",
|
159 |
+
"execution_count": 32,
|
160 |
+
"id": "aec84e0c",
|
161 |
+
"metadata": {},
|
162 |
+
"outputs": [
|
163 |
+
{
|
164 |
+
"name": "stdout",
|
165 |
+
"output_type": "stream",
|
166 |
+
"text": [
|
167 |
+
"0.9676956209619526\n"
|
168 |
+
]
|
169 |
+
}
|
170 |
+
],
|
171 |
+
"source": [
|
172 |
+
"X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25)\n",
|
173 |
+
"clf=svm.SVC(kernel='poly')\n",
|
174 |
+
"clf.fit(X_train, y_train)\n",
|
175 |
+
"accuracy = clf.score(X_test, y_test)\n",
|
176 |
+
"print(accuracy)"
|
177 |
+
]
|
178 |
+
},
|
179 |
+
{
|
180 |
+
"cell_type": "code",
|
181 |
+
"execution_count": 36,
|
182 |
+
"id": "62e97e65",
|
183 |
+
"metadata": {},
|
184 |
+
"outputs": [
|
185 |
+
{
|
186 |
+
"name": "stdout",
|
187 |
+
"output_type": "stream",
|
188 |
+
"text": [
|
189 |
+
"Precision: 0.9669448190530422\n",
|
190 |
+
"Recall: 0.9676956209619526\n",
|
191 |
+
"F1 score: 0.9667034979766862\n"
|
192 |
+
]
|
193 |
+
},
|
194 |
+
{
|
195 |
+
"data": {
|
196 |
+
"text/plain": [
|
197 |
+
"array([[1208, 11],\n",
|
198 |
+
" [ 34, 140]], dtype=int64)"
|
199 |
+
]
|
200 |
+
},
|
201 |
+
"execution_count": 36,
|
202 |
+
"metadata": {},
|
203 |
+
"output_type": "execute_result"
|
204 |
+
}
|
205 |
+
],
|
206 |
+
"source": [
|
207 |
+
"y_pred = clf.predict(X_test)\n",
|
208 |
+
"\n",
|
209 |
+
"precision = metrics.precision_score(y_test, y_pred, average='weighted')\n",
|
210 |
+
"recall = metrics.recall_score(y_test, y_pred, average='weighted')\n",
|
211 |
+
"f1 = metrics.f1_score(y_test, y_pred, average='weighted')\n",
|
212 |
+
"\n",
|
213 |
+
"print(\"Precision:\", precision)\n",
|
214 |
+
"print(\"Recall:\", recall)\n",
|
215 |
+
"print(\"F1 score:\", f1)\n",
|
216 |
+
"\n",
|
217 |
+
"confusion_mat = metrics.confusion_matrix(y_test, y_pred)\n",
|
218 |
+
"confusion_mat"
|
219 |
+
]
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"cell_type": "code",
|
223 |
+
"execution_count": 33,
|
224 |
+
"id": "ccce58e6",
|
225 |
+
"metadata": {},
|
226 |
+
"outputs": [],
|
227 |
+
"source": [
|
228 |
+
"text='''WINNER!! As a valued network customer you have been selected to receivea \n",
|
229 |
+
"å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'''"
|
230 |
+
]
|
231 |
+
},
|
232 |
+
{
|
233 |
+
"cell_type": "code",
|
234 |
+
"execution_count": 34,
|
235 |
+
"id": "f53b1187",
|
236 |
+
"metadata": {},
|
237 |
+
"outputs": [],
|
238 |
+
"source": [
|
239 |
+
"tokens=word_tokenize(text)\n",
|
240 |
+
"tokens_tagged=nltk.pos_tag(tokens)\n",
|
241 |
+
"x=[]\n",
|
242 |
+
"for i in range(35):\n",
|
243 |
+
" x.append(0)\n",
|
244 |
+
"pos_dict={'NNP':[0], 'IN':[1], 'JJ':[2], 'NN':[3], ',':[4], 'RB':[5], ':':[6], 'EX':[7], 'VBD':[8], \n",
|
245 |
+
"'WRB':[9], 'CD':[10], 'DT':[11], 'TO':[12], 'VB':[13], '.':[14], '(':[15], ')':[16], 'CC':[17],\n",
|
246 |
+
"'POS':[18], 'VBP':[19], 'NNS':[20], 'PRP':[21], 'VBZ':[22], 'VBG':[23], 'VBN':[24], 'MD':[25], \n",
|
247 |
+
"'PRP$':[26], 'JJR':[27], 'JJS':[28], 'UH':[29], 'RP':[30], 'WP':[31], 'WDT':[32], '#':[33], \"''\":[34]}\n",
|
248 |
+
"for i in tokens_tagged:\n",
|
249 |
+
" x[pos_dict[i[1]][0]]+=1\n",
|
250 |
+
"x=np.array(x)\n",
|
251 |
+
"x=x.reshape(1,-1)\n",
|
252 |
+
"# x"
|
253 |
+
]
|
254 |
+
},
|
255 |
+
{
|
256 |
+
"cell_type": "code",
|
257 |
+
"execution_count": 35,
|
258 |
+
"id": "1d0066d6",
|
259 |
+
"metadata": {},
|
260 |
+
"outputs": [
|
261 |
+
{
|
262 |
+
"name": "stdout",
|
263 |
+
"output_type": "stream",
|
264 |
+
"text": [
|
265 |
+
"SPAM\n"
|
266 |
+
]
|
267 |
+
}
|
268 |
+
],
|
269 |
+
"source": [
|
270 |
+
"pred=clf.predict(x)\n",
|
271 |
+
"if pred==0:\n",
|
272 |
+
" print(\"NOT SPAM\")\n",
|
273 |
+
"else:\n",
|
274 |
+
" print(\"SPAM\")"
|
275 |
+
]
|
276 |
+
},
|
277 |
+
{
|
278 |
+
"cell_type": "code",
|
279 |
+
"execution_count": null,
|
280 |
+
"id": "7440777a",
|
281 |
+
"metadata": {},
|
282 |
+
"outputs": [],
|
283 |
+
"source": []
|
284 |
+
}
|
285 |
+
],
|
286 |
+
"metadata": {
|
287 |
+
"kernelspec": {
|
288 |
+
"display_name": "Python 3 (ipykernel)",
|
289 |
+
"language": "python",
|
290 |
+
"name": "python3"
|
291 |
+
},
|
292 |
+
"language_info": {
|
293 |
+
"codemirror_mode": {
|
294 |
+
"name": "ipython",
|
295 |
+
"version": 3
|
296 |
+
},
|
297 |
+
"file_extension": ".py",
|
298 |
+
"mimetype": "text/x-python",
|
299 |
+
"name": "python",
|
300 |
+
"nbconvert_exporter": "python",
|
301 |
+
"pygments_lexer": "ipython3",
|
302 |
+
"version": "3.10.4"
|
303 |
+
}
|
304 |
+
},
|
305 |
+
"nbformat": 4,
|
306 |
+
"nbformat_minor": 5
|
307 |
+
}
|
Spam_Filter.ipynb
ADDED
@@ -0,0 +1,307 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 40,
|
6 |
+
"id": "b9bb9dcd",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [
|
9 |
+
{
|
10 |
+
"name": "stderr",
|
11 |
+
"output_type": "stream",
|
12 |
+
"text": [
|
13 |
+
"[nltk_data] Downloading package averaged_perceptron_tagger to\n",
|
14 |
+
"[nltk_data] C:\\Users\\shiva\\AppData\\Roaming\\nltk_data...\n",
|
15 |
+
"[nltk_data] Package averaged_perceptron_tagger is already up-to-\n",
|
16 |
+
"[nltk_data] date!\n",
|
17 |
+
"[nltk_data] Downloading package punkt to\n",
|
18 |
+
"[nltk_data] C:\\Users\\shiva\\AppData\\Roaming\\nltk_data...\n",
|
19 |
+
"[nltk_data] Package punkt is already up-to-date!\n"
|
20 |
+
]
|
21 |
+
}
|
22 |
+
],
|
23 |
+
"source": [
|
24 |
+
"import nltk\n",
|
25 |
+
"nltk.download(\"averaged_perceptron_tagger\")\n",
|
26 |
+
"nltk.download(\"punkt\")\n",
|
27 |
+
"from nltk.tokenize import word_tokenize\n",
|
28 |
+
"import pandas as pd\n",
|
29 |
+
"import csv\n",
|
30 |
+
"import numpy as np\n",
|
31 |
+
"from sklearn import preprocessing , svm , model_selection, metrics\n",
|
32 |
+
"from sklearn.preprocessing import MinMaxScaler"
|
33 |
+
]
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"cell_type": "code",
|
37 |
+
"execution_count": 26,
|
38 |
+
"id": "c1e93bc2",
|
39 |
+
"metadata": {},
|
40 |
+
"outputs": [
|
41 |
+
{
|
42 |
+
"name": "stdout",
|
43 |
+
"output_type": "stream",
|
44 |
+
"text": [
|
45 |
+
" msg label label_no \\\n",
|
46 |
+
"0 Go until jurong point, crazy.. Available only ... ham 0 \n",
|
47 |
+
"1 Ok lar... Joking wif u oni... ham 0 \n",
|
48 |
+
"2 Free entry in 2 a wkly comp to win FA Cup fina... spam 1 \n",
|
49 |
+
"3 U dun say so early hor... U c already then say... ham 0 \n",
|
50 |
+
"4 Nah I don't think he goes to usf, he lives aro... ham 0 \n",
|
51 |
+
"\n",
|
52 |
+
" NNP IN JJ NN , RB : ... \\\n",
|
53 |
+
"0 0.071429 0.1250 0.142857 0.152174 0.076923 0.176471 0.054054 ... \n",
|
54 |
+
"1 0.047619 0.0000 0.047619 0.043478 0.000000 0.000000 0.054054 ... \n",
|
55 |
+
"2 0.142857 0.0625 0.190476 0.152174 0.000000 0.000000 0.000000 ... \n",
|
56 |
+
"3 0.023810 0.0000 0.095238 0.021739 0.000000 0.176471 0.054054 ... \n",
|
57 |
+
"4 0.023810 0.0625 0.000000 0.000000 0.076923 0.176471 0.000000 ... \n",
|
58 |
+
"\n",
|
59 |
+
" MD PRP$ JJR JJS UH RP WP WDT # '' \n",
|
60 |
+
"0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
61 |
+
"1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
62 |
+
"2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
63 |
+
"3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
64 |
+
"4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
65 |
+
"\n",
|
66 |
+
"[5 rows x 38 columns]\n"
|
67 |
+
]
|
68 |
+
}
|
69 |
+
],
|
70 |
+
"source": [
|
71 |
+
"tok_dict={}\n",
|
72 |
+
"\n",
|
73 |
+
"lst=['NNP', 'IN', 'JJ', 'NN', ',', 'RB', ':', 'EX', 'VBD', 'WRB', 'CD', 'DT', 'TO', 'VB', '.',\n",
|
74 |
+
" '(', ')', 'CC', 'POS', 'VBP', 'NNS', 'PRP', 'VBZ', 'VBG', 'VBN', 'MD', 'PRP$', 'JJR', 'JJS', 'UH', 'RP', 'WP', 'WDT', '#', \"''\"]\n",
|
75 |
+
"\n",
|
76 |
+
"pd_dict={'msg':[],'label':[],'label_no':[],'NNP':[], 'IN':[], 'JJ':[], 'NN':[], ',':[], 'RB':[], ':':[], 'EX':[], 'VBD':[], \n",
|
77 |
+
"'WRB':[], 'CD':[], 'DT':[], 'TO':[], 'VB':[], '.':[], '(':[], ')':[], 'CC':[],\n",
|
78 |
+
"'POS':[], 'VBP':[], 'NNS':[], 'PRP':[], 'VBZ':[], 'VBG':[], 'VBN':[], 'MD':[], \n",
|
79 |
+
"'PRP$':[], 'JJR':[], 'JJS':[], 'UH':[], 'RP':[], 'WP':[], 'WDT':[], '#':[], \"''\":[]}\n",
|
80 |
+
"\n",
|
81 |
+
"with open(\"spam_db.csv\", 'r') as file:\n",
|
82 |
+
" csvreader = csv.reader(file)\n",
|
83 |
+
" j=0\n",
|
84 |
+
" k=0\n",
|
85 |
+
" for row in csvreader:\n",
|
86 |
+
" if j==0:\n",
|
87 |
+
" j=1\n",
|
88 |
+
" continue\n",
|
89 |
+
" pd_dict['msg'].append(row[1])\n",
|
90 |
+
" pd_dict['label'].append(row[0])\n",
|
91 |
+
" if row[0]=='spam':\n",
|
92 |
+
" pd_dict['label_no'].append(1)\n",
|
93 |
+
" else:\n",
|
94 |
+
" pd_dict['label_no'].append(0)\n",
|
95 |
+
" for label in lst:\n",
|
96 |
+
" pd_dict[label].append(0)\n",
|
97 |
+
" text=row[1]\n",
|
98 |
+
" tokens=word_tokenize(text)\n",
|
99 |
+
" tokens_tagged=nltk.pos_tag(tokens)\n",
|
100 |
+
" for i in tokens_tagged:\n",
|
101 |
+
" if i[1] in tok_dict:\n",
|
102 |
+
" tok_dict[i[1]].append(i[0])\n",
|
103 |
+
" else:\n",
|
104 |
+
" tok_dict[i[1]]=[i[0]]\n",
|
105 |
+
" if i[1] in pd_dict:\n",
|
106 |
+
" pd_dict[i[1]][k]+=1\n",
|
107 |
+
" k+=1\n",
|
108 |
+
" \n",
|
109 |
+
"tok_dict1={}\n",
|
110 |
+
"for i in tok_dict:\n",
|
111 |
+
" tok_dict1[i]=len(tok_dict[i])\n",
|
112 |
+
"\n",
|
113 |
+
"del_lst=[]\n",
|
114 |
+
"for i in tok_dict1:\n",
|
115 |
+
" if tok_dict1[i]<100:\n",
|
116 |
+
" del_lst.append(i)\n",
|
117 |
+
"\n",
|
118 |
+
"for i in del_lst:\n",
|
119 |
+
" tok_dict1.pop(i)\n",
|
120 |
+
"\n",
|
121 |
+
"lst=[]\n",
|
122 |
+
"for i in tok_dict1:\n",
|
123 |
+
" lst.append(i)\n",
|
124 |
+
"\n",
|
125 |
+
"df=pd.DataFrame(pd_dict)\n",
|
126 |
+
"numeric_columns = df.drop(['msg', 'label', 'label_no'], axis=1).columns\n",
|
127 |
+
"\n",
|
128 |
+
"# Create the MinMaxScaler object\n",
|
129 |
+
"scaler = MinMaxScaler()\n",
|
130 |
+
"\n",
|
131 |
+
"# Normalize the numeric columns using min-max normalization\n",
|
132 |
+
"df[numeric_columns] = scaler.fit_transform(df[numeric_columns])\n",
|
133 |
+
"\n",
|
134 |
+
"print(df.head())"
|
135 |
+
]
|
136 |
+
},
|
137 |
+
{
|
138 |
+
"cell_type": "code",
|
139 |
+
"execution_count": 27,
|
140 |
+
"id": "35824c58",
|
141 |
+
"metadata": {},
|
142 |
+
"outputs": [
|
143 |
+
{
|
144 |
+
"name": "stderr",
|
145 |
+
"output_type": "stream",
|
146 |
+
"text": [
|
147 |
+
"C:\\Users\\shiva\\AppData\\Local\\Temp\\ipykernel_9568\\3238635716.py:1: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.\n",
|
148 |
+
" X=np.array(df.drop(['msg','label','label_no'],1))\n"
|
149 |
+
]
|
150 |
+
}
|
151 |
+
],
|
152 |
+
"source": [
|
153 |
+
"X=np.array(df.drop(['msg','label','label_no'],1))\n",
|
154 |
+
"y=np.array(df['label_no'])"
|
155 |
+
]
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"cell_type": "code",
|
159 |
+
"execution_count": 32,
|
160 |
+
"id": "aec84e0c",
|
161 |
+
"metadata": {},
|
162 |
+
"outputs": [
|
163 |
+
{
|
164 |
+
"name": "stdout",
|
165 |
+
"output_type": "stream",
|
166 |
+
"text": [
|
167 |
+
"0.9676956209619526\n"
|
168 |
+
]
|
169 |
+
}
|
170 |
+
],
|
171 |
+
"source": [
|
172 |
+
"X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25)\n",
|
173 |
+
"clf=svm.SVC(kernel='poly')\n",
|
174 |
+
"clf.fit(X_train, y_train)\n",
|
175 |
+
"accuracy = clf.score(X_test, y_test)\n",
|
176 |
+
"print(accuracy)"
|
177 |
+
]
|
178 |
+
},
|
179 |
+
{
|
180 |
+
"cell_type": "code",
|
181 |
+
"execution_count": 36,
|
182 |
+
"id": "62e97e65",
|
183 |
+
"metadata": {},
|
184 |
+
"outputs": [
|
185 |
+
{
|
186 |
+
"name": "stdout",
|
187 |
+
"output_type": "stream",
|
188 |
+
"text": [
|
189 |
+
"Precision: 0.9669448190530422\n",
|
190 |
+
"Recall: 0.9676956209619526\n",
|
191 |
+
"F1 score: 0.9667034979766862\n"
|
192 |
+
]
|
193 |
+
},
|
194 |
+
{
|
195 |
+
"data": {
|
196 |
+
"text/plain": [
|
197 |
+
"array([[1208, 11],\n",
|
198 |
+
" [ 34, 140]], dtype=int64)"
|
199 |
+
]
|
200 |
+
},
|
201 |
+
"execution_count": 36,
|
202 |
+
"metadata": {},
|
203 |
+
"output_type": "execute_result"
|
204 |
+
}
|
205 |
+
],
|
206 |
+
"source": [
|
207 |
+
"y_pred = clf.predict(X_test)\n",
|
208 |
+
"\n",
|
209 |
+
"precision = metrics.precision_score(y_test, y_pred, average='weighted')\n",
|
210 |
+
"recall = metrics.recall_score(y_test, y_pred, average='weighted')\n",
|
211 |
+
"f1 = metrics.f1_score(y_test, y_pred, average='weighted')\n",
|
212 |
+
"\n",
|
213 |
+
"print(\"Precision:\", precision)\n",
|
214 |
+
"print(\"Recall:\", recall)\n",
|
215 |
+
"print(\"F1 score:\", f1)\n",
|
216 |
+
"\n",
|
217 |
+
"confusion_mat = metrics.confusion_matrix(y_test, y_pred)\n",
|
218 |
+
"confusion_mat"
|
219 |
+
]
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"cell_type": "code",
|
223 |
+
"execution_count": 33,
|
224 |
+
"id": "ccce58e6",
|
225 |
+
"metadata": {},
|
226 |
+
"outputs": [],
|
227 |
+
"source": [
|
228 |
+
"text='''WINNER!! As a valued network customer you have been selected to receivea \n",
|
229 |
+
"å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'''"
|
230 |
+
]
|
231 |
+
},
|
232 |
+
{
|
233 |
+
"cell_type": "code",
|
234 |
+
"execution_count": 34,
|
235 |
+
"id": "f53b1187",
|
236 |
+
"metadata": {},
|
237 |
+
"outputs": [],
|
238 |
+
"source": [
|
239 |
+
"tokens=word_tokenize(text)\n",
|
240 |
+
"tokens_tagged=nltk.pos_tag(tokens)\n",
|
241 |
+
"x=[]\n",
|
242 |
+
"for i in range(35):\n",
|
243 |
+
" x.append(0)\n",
|
244 |
+
"pos_dict={'NNP':[0], 'IN':[1], 'JJ':[2], 'NN':[3], ',':[4], 'RB':[5], ':':[6], 'EX':[7], 'VBD':[8], \n",
|
245 |
+
"'WRB':[9], 'CD':[10], 'DT':[11], 'TO':[12], 'VB':[13], '.':[14], '(':[15], ')':[16], 'CC':[17],\n",
|
246 |
+
"'POS':[18], 'VBP':[19], 'NNS':[20], 'PRP':[21], 'VBZ':[22], 'VBG':[23], 'VBN':[24], 'MD':[25], \n",
|
247 |
+
"'PRP$':[26], 'JJR':[27], 'JJS':[28], 'UH':[29], 'RP':[30], 'WP':[31], 'WDT':[32], '#':[33], \"''\":[34]}\n",
|
248 |
+
"for i in tokens_tagged:\n",
|
249 |
+
" x[pos_dict[i[1]][0]]+=1\n",
|
250 |
+
"x=np.array(x)\n",
|
251 |
+
"x=x.reshape(1,-1)\n",
|
252 |
+
"# x"
|
253 |
+
]
|
254 |
+
},
|
255 |
+
{
|
256 |
+
"cell_type": "code",
|
257 |
+
"execution_count": 35,
|
258 |
+
"id": "1d0066d6",
|
259 |
+
"metadata": {},
|
260 |
+
"outputs": [
|
261 |
+
{
|
262 |
+
"name": "stdout",
|
263 |
+
"output_type": "stream",
|
264 |
+
"text": [
|
265 |
+
"SPAM\n"
|
266 |
+
]
|
267 |
+
}
|
268 |
+
],
|
269 |
+
"source": [
|
270 |
+
"pred=clf.predict(x)\n",
|
271 |
+
"if pred==0:\n",
|
272 |
+
" print(\"NOT SPAM\")\n",
|
273 |
+
"else:\n",
|
274 |
+
" print(\"SPAM\")"
|
275 |
+
]
|
276 |
+
},
|
277 |
+
{
|
278 |
+
"cell_type": "code",
|
279 |
+
"execution_count": null,
|
280 |
+
"id": "7440777a",
|
281 |
+
"metadata": {},
|
282 |
+
"outputs": [],
|
283 |
+
"source": []
|
284 |
+
}
|
285 |
+
],
|
286 |
+
"metadata": {
|
287 |
+
"kernelspec": {
|
288 |
+
"display_name": "Python 3 (ipykernel)",
|
289 |
+
"language": "python",
|
290 |
+
"name": "python3"
|
291 |
+
},
|
292 |
+
"language_info": {
|
293 |
+
"codemirror_mode": {
|
294 |
+
"name": "ipython",
|
295 |
+
"version": 3
|
296 |
+
},
|
297 |
+
"file_extension": ".py",
|
298 |
+
"mimetype": "text/x-python",
|
299 |
+
"name": "python",
|
300 |
+
"nbconvert_exporter": "python",
|
301 |
+
"pygments_lexer": "ipython3",
|
302 |
+
"version": "3.10.4"
|
303 |
+
}
|
304 |
+
},
|
305 |
+
"nbformat": 4,
|
306 |
+
"nbformat_minor": 5
|
307 |
+
}
|
app.py
ADDED
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# coding: utf-8
|
3 |
+
|
4 |
+
# In[25]:
|
5 |
+
|
6 |
+
|
7 |
+
import nltk
|
8 |
+
nltk.download("averaged_perceptron_tagger")
|
9 |
+
nltk.download("punkt")
|
10 |
+
from nltk.tokenize import word_tokenize
|
11 |
+
import pandas as pd
|
12 |
+
import csv
|
13 |
+
import numpy as np
|
14 |
+
from sklearn import preprocessing , svm , model_selection, metrics
|
15 |
+
from sklearn.preprocessing import MinMaxScaler
|
16 |
+
import gradio as gr
|
17 |
+
|
18 |
+
|
19 |
+
# In[26]:
|
20 |
+
|
21 |
+
|
22 |
+
# Build the training table: one row per SMS message, one numeric column per
# POS (part-of-speech) tag, holding that tag's count within the message.
tok_dict={}  # maps POS tag -> list of every word seen with that tag (corpus-wide)

# The 35 POS tags used as feature columns (Penn Treebank tag names).
lst=['NNP', 'IN', 'JJ', 'NN', ',', 'RB', ':', 'EX', 'VBD', 'WRB', 'CD', 'DT', 'TO', 'VB', '.',
 '(', ')', 'CC', 'POS', 'VBP', 'NNS', 'PRP', 'VBZ', 'VBG', 'VBN', 'MD', 'PRP$', 'JJR', 'JJS', 'UH', 'RP', 'WP', 'WDT', '#', "''"]

# Column-oriented accumulator later handed to pd.DataFrame: message text,
# string label, numeric label, then one count column per tag in `lst`.
pd_dict={'msg':[],'label':[],'label_no':[],'NNP':[], 'IN':[], 'JJ':[], 'NN':[], ',':[], 'RB':[], ':':[], 'EX':[], 'VBD':[],
'WRB':[], 'CD':[], 'DT':[], 'TO':[], 'VB':[], '.':[], '(':[], ')':[], 'CC':[],
'POS':[], 'VBP':[], 'NNS':[], 'PRP':[], 'VBZ':[], 'VBG':[], 'VBN':[], 'MD':[],
'PRP$':[], 'JJR':[], 'JJS':[], 'UH':[], 'RP':[], 'WP':[], 'WDT':[], '#':[], "''":[]}

# assumes spam_db.csv rows are (label, message) with a header row — the
# j==0 guard below skips that header.
with open("spam_db.csv", 'r') as file:
    csvreader = csv.reader(file)
    j=0  # header-skip flag
    k=0  # index of the current data row; used to bump this row's tag counts
    for row in csvreader:
        if j==0:
            j=1
            continue
        pd_dict['msg'].append(row[1])
        pd_dict['label'].append(row[0])
        if row[0]=='spam':
            pd_dict['label_no'].append(1)
        else:
            pd_dict['label_no'].append(0)
        # Seed every tag column with 0 for this row, then increment below.
        for label in lst:
            pd_dict[label].append(0)
        text=row[1]
        tokens=word_tokenize(text)
        tokens_tagged=nltk.pos_tag(tokens)  # list of (word, tag) pairs
        for i in tokens_tagged:
            # Record the word under its tag for corpus-wide frequency stats.
            if i[1] in tok_dict:
                tok_dict[i[1]].append(i[0])
            else:
                tok_dict[i[1]]=[i[0]]
            # Tags outside the 35 feature columns are counted in tok_dict
            # but silently excluded from the feature table.
            if i[1] in pd_dict:
                pd_dict[i[1]][k]+=1
        k+=1

# Corpus-wide occurrence count per tag.
tok_dict1={}
for i in tok_dict:
    tok_dict1[i]=len(tok_dict[i])

# Drop rare tags (< 100 occurrences) from the frequency summary.
del_lst=[]
for i in tok_dict1:
    if tok_dict1[i]<100:
        del_lst.append(i)

for i in del_lst:
    tok_dict1.pop(i)

# NOTE(review): `lst` is rebuilt here from the frequent tags but is not
# used again below — the DataFrame keeps all 35 original columns.
lst=[]
for i in tok_dict1:
    lst.append(i)

df=pd.DataFrame(pd_dict)
numeric_columns = df.drop(['msg', 'label', 'label_no'], axis=1).columns

# Create the MinMaxScaler object
scaler = MinMaxScaler()

# Normalize the numeric columns using min-max normalization
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

print(df.head())
|
86 |
+
|
87 |
+
|
88 |
+
# In[27]:
|
89 |
+
|
90 |
+
|
91 |
+
# Features: every POS-count column; target: the 0/1 spam label.
feature_frame = df.drop(['msg', 'label', 'label_no'], axis=1)
X = np.array(feature_frame)
y = np.array(df['label_no'])
|
93 |
+
|
94 |
+
|
95 |
+
# In[32]:
|
96 |
+
|
97 |
+
|
98 |
+
# Train a polynomial-kernel SVM on a 75/25 train/test split.
# NOTE(review): no random_state is given, so the split — and the printed
# accuracy — changes on every run; pass random_state for reproducibility.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25)
clf=svm.SVC(kernel='poly')
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)  # mean accuracy on the held-out 25%
print(accuracy)
|
103 |
+
|
104 |
+
|
105 |
+
# In[36]:
|
106 |
+
|
107 |
+
|
108 |
+
# Evaluate the trained classifier on the held-out test split.
y_pred = clf.predict(X_test)

# 'weighted' averaging weighs each class by its support, which matters
# here because ham heavily outnumbers spam.
precision = metrics.precision_score(y_test, y_pred, average='weighted')
recall = metrics.recall_score(y_test, y_pred, average='weighted')
f1 = metrics.f1_score(y_test, y_pred, average='weighted')

print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)

confusion_mat = metrics.confusion_matrix(y_test, y_pred)
# Bug fix: the original ended with the bare expression `confusion_mat`,
# which displays in a notebook but is a no-op in a plain script — print
# it so the evaluation output is actually visible.
print("Confusion matrix:")
print(confusion_mat)
|
120 |
+
|
121 |
+
|
122 |
+
# In[33]:
|
123 |
+
|
124 |
+
|
125 |
+
text='''WINNER!! As a valued network customer you have been selected to receivea
|
126 |
+
å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'''
|
127 |
+
|
128 |
+
|
129 |
+
# In[34]:
|
130 |
+
|
131 |
+
|
132 |
+
# In[34] / In[35]: vectorize the sample message into the 35 POS-tag
# counts the model was trained on, then classify it.
tokens = word_tokenize(text)
tokens_tagged = nltk.pos_tag(tokens)

# Same column order as the training features; each tag maps to [index].
tag_order = ['NNP', 'IN', 'JJ', 'NN', ',', 'RB', ':', 'EX', 'VBD',
             'WRB', 'CD', 'DT', 'TO', 'VB', '.', '(', ')', 'CC',
             'POS', 'VBP', 'NNS', 'PRP', 'VBZ', 'VBG', 'VBN', 'MD',
             'PRP$', 'JJR', 'JJS', 'UH', 'RP', 'WP', 'WDT', '#', "''"]
pos_dict = {tag: [idx] for idx, tag in enumerate(tag_order)}

x = [0] * 35
for _, tag in tokens_tagged:
    x[pos_dict[tag][0]] += 1
x = np.array(x)
x = x.reshape(1, -1)  # single-sample 2-D shape expected by predict()

pred = clf.predict(x)
print("NOT SPAM" if pred == 0 else "SPAM")
|
156 |
+
|
157 |
+
|
158 |
+
# In[ ]:
|
159 |
+
def spam_detection(txt):
    """Classify a message as spam using the module-level trained SVM.

    Builds the same 35-dimensional POS-tag count vector the classifier
    was trained on and predicts with ``clf``.

    Args:
        txt: the message text to classify (the Gradio text input).

    Returns:
        "SPAM" or "NOT SPAM".
    """
    # Bug fix: the original tokenized the module-level variable ``text``
    # instead of the ``txt`` argument, so every user input produced the
    # prediction for the hard-coded sample message.
    tokens = word_tokenize(txt)
    tokens_tagged = nltk.pos_tag(tokens)

    # Tag -> feature-column index, in the training column order.
    pos_dict = {'NNP': [0], 'IN': [1], 'JJ': [2], 'NN': [3], ',': [4], 'RB': [5],
                ':': [6], 'EX': [7], 'VBD': [8], 'WRB': [9], 'CD': [10], 'DT': [11],
                'TO': [12], 'VB': [13], '.': [14], '(': [15], ')': [16], 'CC': [17],
                'POS': [18], 'VBP': [19], 'NNS': [20], 'PRP': [21], 'VBZ': [22],
                'VBG': [23], 'VBN': [24], 'MD': [25], 'PRP$': [26], 'JJR': [27],
                'JJS': [28], 'UH': [29], 'RP': [30], 'WP': [31], 'WDT': [32],
                '#': [33], "''": [34]}

    x = [0] * 35
    for _, tag in tokens_tagged:
        # Robustness fix: the original indexed pos_dict unconditionally and
        # raised KeyError for any tag outside the 35 trained ones (e.g.
        # 'RBR', '$'); such tags were ignored at training time, so ignore
        # them here too.
        if tag in pos_dict:
            x[pos_dict[tag][0]] += 1

    x = np.array(x).reshape(1, -1)  # single-sample 2-D shape for predict()
    pred = clf.predict(x)
    return "NOT SPAM" if pred == 0 else "SPAM"
|
184 |
+
|
185 |
+
iface = gr.Interface(fn=spam_detection, inputs="text", outputs="text")
|
186 |
+
iface.launch()
|
187 |
+
|
188 |
+
|
189 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
nltk
scikit-learn
pandas
numpy
gradio
|
spam_db.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
test.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Exploratory script: build a POS-tag-count feature table from the spam corpus.

Reads the labelled SMS dataset, counts how often each Penn-Treebank POS tag
appears per message and overall, prunes tags seen fewer than 100 times, and
prints the head of the resulting pandas DataFrame for inspection.
"""
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd
import csv


# tag -> list of every word observed with that tag (list length = frequency)
tok_dict = {}

# POS tags used as feature columns; order is fixed so it matches the model input.
lst = ['NNP', 'IN', 'JJ', 'NN', ',', 'RB', ':', 'EX', 'VBD', 'WRB', 'CD', 'DT', 'TO', 'VB', '.',
       '(', ')', 'CC', 'POS', 'VBP', 'NNS', 'PRP', 'VBZ', 'VBG', 'VBN', 'MD', 'PRP$', 'JJR', 'JJS', 'UH', 'RP', 'WP', 'WDT', '#', "''"]

# Column store for the DataFrame: message text, labels, and one count per tag.
pd_dict = {'msg': [], 'label': [], 'label_no': [], **{tag: [] for tag in lst}}

# BUG FIX: the repository ships spam_db.csv (see testing.py), not spam.csv —
# the old filename raised FileNotFoundError in the deployed Space.
# newline='' is the csv-module-documented way to open CSV files.
with open("spam_db.csv", 'r', newline='') as file:
    csvreader = csv.reader(file)
    next(csvreader, None)  # skip the header row
    for k, row in enumerate(csvreader):
        label, msg = row[0], row[1]
        pd_dict['msg'].append(msg)
        pd_dict['label'].append(label)
        pd_dict['label_no'].append(1 if label == 'spam' else 0)
        # Start every tag count at 0 for this row (row index k).
        for tag in lst:
            pd_dict[tag].append(0)
        for word, tag in nltk.pos_tag(word_tokenize(msg)):
            tok_dict.setdefault(tag, []).append(word)
            if tag in pd_dict:
                pd_dict[tag][k] += 1


# Overall frequency of each tag across the whole corpus.
tok_dict1 = {tag: len(words) for tag, words in tok_dict.items()}

# Collect tags seen fewer than 100 times; they carry too little signal.
del_lst = []
for i in tok_dict1:
    print(i, " ", tok_dict1[i])
    if tok_dict1[i] < 100:
        del_lst.append(i)

print(del_lst)
for i in del_lst:
    tok_dict1.pop(i)

print(tok_dict1)

# Surviving tags, in first-seen order.
lst = list(tok_dict1)

print(lst, len(lst))

df = pd.DataFrame(pd_dict)
print(df.head())
|
testing.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Exploratory pass over the spam corpus: tally POS-tag frequencies per message
# and overall, prune rare tags, and preview the resulting feature DataFrame.
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd
import csv


tok_dict = {}  # POS tag -> every word observed with that tag

lst = ['NNP', 'IN', 'JJ', 'NN', ',', 'RB', ':', 'EX', 'VBD', 'WRB', 'CD', 'DT', 'TO', 'VB', '.',
       '(', ')', 'CC', 'POS', 'VBP', 'NNS', 'PRP', 'VBZ', 'VBG', 'VBN', 'MD', 'PRP$', 'JJR', 'JJS', 'UH', 'RP', 'WP', 'WDT', '#', "''"]

# Column store: message text, labels, then one count column per POS tag.
pd_dict = {'msg': [], 'label': [], 'label_no': []}
for tag in lst:
    pd_dict[tag] = []

with open("spam_db.csv", 'r') as file:
    csvreader = csv.reader(file)
    k = 0  # index of the current data row (header excluded)
    for j, row in enumerate(csvreader):
        if j == 0:
            continue  # header row
        pd_dict['msg'].append(row[1])
        pd_dict['label'].append(row[0])
        pd_dict['label_no'].append(1 if row[0] == 'spam' else 0)
        for tag in lst:
            pd_dict[tag].append(0)
        for word, tag in nltk.pos_tag(word_tokenize(row[1])):
            tok_dict.setdefault(tag, []).append(word)
            if tag in pd_dict:
                pd_dict[tag][k] += 1
        k += 1


# Collapse word lists into per-tag corpus-wide frequency counts.
tok_dict1 = {tag: len(words) for tag, words in tok_dict.items()}

# Note every tag seen fewer than 100 times; those are dropped below.
del_lst = []
for tag, count in tok_dict1.items():
    print(tag, " ", count)
    if count < 100:
        del_lst.append(tag)

print(del_lst)
for tag in del_lst:
    tok_dict1.pop(tag)

print(tok_dict1)

# Rebuild the tag list from the surviving (frequent) tags.
lst = [tag for tag in tok_dict1]

print(lst, len(lst))

df = pd.DataFrame(pd_dict)
print(df.head())
|