zen21 committed
Commit
b986fa0
1 Parent(s): 9e75792

first commit

.ipynb_checkpoints/Spam_Filter-checkpoint.ipynb ADDED
(Jupyter autosave checkpoint; contents identical to Spam_Filter.ipynb below.)
Spam_Filter.ipynb ADDED
@@ -0,0 +1,307 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "id": "b9bb9dcd",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package averaged_perceptron_tagger to\n",
+ "[nltk_data] C:\\Users\\shiva\\AppData\\Roaming\\nltk_data...\n",
+ "[nltk_data] Package averaged_perceptron_tagger is already up-to-\n",
+ "[nltk_data] date!\n",
+ "[nltk_data] Downloading package punkt to\n",
+ "[nltk_data] C:\\Users\\shiva\\AppData\\Roaming\\nltk_data...\n",
+ "[nltk_data] Package punkt is already up-to-date!\n"
+ ]
+ }
+ ],
+ "source": [
+ "import nltk\n",
+ "nltk.download(\"averaged_perceptron_tagger\")\n",
+ "nltk.download(\"punkt\")\n",
+ "from nltk.tokenize import word_tokenize\n",
+ "import pandas as pd\n",
+ "import csv\n",
+ "import numpy as np\n",
+ "from sklearn import preprocessing, svm, model_selection, metrics\n",
+ "from sklearn.preprocessing import MinMaxScaler"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "c1e93bc2",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " msg label label_no \\\n",
+ "0 Go until jurong point, crazy.. Available only ... ham 0 \n",
+ "1 Ok lar... Joking wif u oni... ham 0 \n",
+ "2 Free entry in 2 a wkly comp to win FA Cup fina... spam 1 \n",
+ "3 U dun say so early hor... U c already then say... ham 0 \n",
+ "4 Nah I don't think he goes to usf, he lives aro... ham 0 \n",
+ "\n",
+ " NNP IN JJ NN , RB : ... \\\n",
+ "0 0.071429 0.1250 0.142857 0.152174 0.076923 0.176471 0.054054 ... \n",
+ "1 0.047619 0.0000 0.047619 0.043478 0.000000 0.000000 0.054054 ... \n",
+ "2 0.142857 0.0625 0.190476 0.152174 0.000000 0.000000 0.000000 ... \n",
+ "3 0.023810 0.0000 0.095238 0.021739 0.000000 0.176471 0.054054 ... \n",
+ "4 0.023810 0.0625 0.000000 0.000000 0.076923 0.176471 0.000000 ... \n",
+ "\n",
+ " MD PRP$ JJR JJS UH RP WP WDT # '' \n",
+ "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ "[5 rows x 38 columns]\n"
+ ]
+ }
+ ],
+ "source": [
+ "tok_dict={}\n",
+ "\n",
+ "lst=['NNP', 'IN', 'JJ', 'NN', ',', 'RB', ':', 'EX', 'VBD', 'WRB', 'CD', 'DT', 'TO', 'VB', '.',\n",
+ "     '(', ')', 'CC', 'POS', 'VBP', 'NNS', 'PRP', 'VBZ', 'VBG', 'VBN', 'MD', 'PRP$', 'JJR', 'JJS', 'UH', 'RP', 'WP', 'WDT', '#', \"''\"]\n",
+ "\n",
+ "pd_dict={'msg':[],'label':[],'label_no':[],'NNP':[], 'IN':[], 'JJ':[], 'NN':[], ',':[], 'RB':[], ':':[], 'EX':[], 'VBD':[],\n",
+ "'WRB':[], 'CD':[], 'DT':[], 'TO':[], 'VB':[], '.':[], '(':[], ')':[], 'CC':[],\n",
+ "'POS':[], 'VBP':[], 'NNS':[], 'PRP':[], 'VBZ':[], 'VBG':[], 'VBN':[], 'MD':[],\n",
+ "'PRP$':[], 'JJR':[], 'JJS':[], 'UH':[], 'RP':[], 'WP':[], 'WDT':[], '#':[], \"''\":[]}\n",
+ "\n",
+ "with open(\"spam_db.csv\", 'r') as file:\n",
+ "    csvreader = csv.reader(file)\n",
+ "    j=0\n",
+ "    k=0\n",
+ "    for row in csvreader:\n",
+ "        if j==0:\n",
+ "            j=1\n",
+ "            continue\n",
+ "        pd_dict['msg'].append(row[1])\n",
+ "        pd_dict['label'].append(row[0])\n",
+ "        if row[0]=='spam':\n",
+ "            pd_dict['label_no'].append(1)\n",
+ "        else:\n",
+ "            pd_dict['label_no'].append(0)\n",
+ "        for label in lst:\n",
+ "            pd_dict[label].append(0)\n",
+ "        text=row[1]\n",
+ "        tokens=word_tokenize(text)\n",
+ "        tokens_tagged=nltk.pos_tag(tokens)\n",
+ "        for i in tokens_tagged:\n",
+ "            if i[1] in tok_dict:\n",
+ "                tok_dict[i[1]].append(i[0])\n",
+ "            else:\n",
+ "                tok_dict[i[1]]=[i[0]]\n",
+ "            if i[1] in pd_dict:\n",
+ "                pd_dict[i[1]][k]+=1\n",
+ "        k+=1\n",
+ "\n",
+ "tok_dict1={}\n",
+ "for i in tok_dict:\n",
+ "    tok_dict1[i]=len(tok_dict[i])\n",
+ "\n",
+ "del_lst=[]\n",
+ "for i in tok_dict1:\n",
+ "    if tok_dict1[i]<100:\n",
+ "        del_lst.append(i)\n",
+ "\n",
+ "for i in del_lst:\n",
+ "    tok_dict1.pop(i)\n",
+ "\n",
+ "lst=[]\n",
+ "for i in tok_dict1:\n",
+ "    lst.append(i)\n",
+ "\n",
+ "df=pd.DataFrame(pd_dict)\n",
+ "numeric_columns = df.drop(['msg', 'label', 'label_no'], axis=1).columns\n",
+ "\n",
+ "# Create the MinMaxScaler object\n",
+ "scaler = MinMaxScaler()\n",
+ "\n",
+ "# Normalize the numeric columns using min-max normalization\n",
+ "df[numeric_columns] = scaler.fit_transform(df[numeric_columns])\n",
+ "\n",
+ "print(df.head())"
+ ]
+ },
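The cell above maintains one parallel count list per POS tag by hand. A minimal sketch of the same featurization as a reusable helper (the name pos_counts is hypothetical; it assumes the 35-tag list lst defined in the cell):

    from collections import Counter
    import nltk
    from nltk.tokenize import word_tokenize

    def pos_counts(message, tags):
        # Tag the message, then count how often each POS tag occurs.
        tagged = nltk.pos_tag(word_tokenize(message))
        counts = Counter(tag for _, tag in tagged)
        # Fixed-order count vector; tags absent from the message contribute 0.
        return [counts.get(tag, 0) for tag in tags]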
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "35824c58",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X=np.array(df.drop(['msg','label','label_no'], axis=1))\n",
+ "y=np.array(df['label_no'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "aec84e0c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.9676956209619526\n"
+ ]
+ }
+ ],
+ "source": [
+ "X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25)\n",
+ "clf=svm.SVC(kernel='poly')\n",
+ "clf.fit(X_train, y_train)\n",
+ "accuracy = clf.score(X_test, y_test)\n",
+ "print(accuracy)"
+ ]
+ },
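Note that the scaler in the feature-extraction cell is fit on the full DataFrame before the split, so the test rows influence the scaling seen at train time. A sketch of one way to avoid that, assuming X holds the raw (unscaled) counts, by chaining scaler and classifier so the scaler is fit on the training split only:

    from sklearn.pipeline import make_pipeline

    pipe = make_pipeline(MinMaxScaler(), svm.SVC(kernel='poly'))
    pipe.fit(X_train, y_train)         # scaler parameters come from the training split only
    print(pipe.score(X_test, y_test))  # the same fitted scaling is applied to the test split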
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "62e97e65",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Precision: 0.9669448190530422\n",
+ "Recall: 0.9676956209619526\n",
+ "F1 score: 0.9667034979766862\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "array([[1208,   11],\n",
+ "       [  34,  140]], dtype=int64)"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y_pred = clf.predict(X_test)\n",
+ "\n",
+ "precision = metrics.precision_score(y_test, y_pred, average='weighted')\n",
+ "recall = metrics.recall_score(y_test, y_pred, average='weighted')\n",
+ "f1 = metrics.f1_score(y_test, y_pred, average='weighted')\n",
+ "\n",
+ "print(\"Precision:\", precision)\n",
+ "print(\"Recall:\", recall)\n",
+ "print(\"F1 score:\", f1)\n",
+ "\n",
+ "confusion_mat = metrics.confusion_matrix(y_test, y_pred)\n",
+ "confusion_mat"
+ ]
+ },
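The confusion matrix above shows the class imbalance in this test split (1219 ham vs 174 spam), which weighted averages can mask; a per-class breakdown makes the weaker spam recall (140/174) visible:

    print(metrics.classification_report(y_test, y_pred, target_names=['ham', 'spam']))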
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "ccce58e6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "text='''WINNER!! As a valued network customer you have been selected to receive a\n",
+ "£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'''"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "f53b1187",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tokens=word_tokenize(text)\n",
+ "tokens_tagged=nltk.pos_tag(tokens)\n",
+ "x=[]\n",
+ "for i in range(35):\n",
+ "    x.append(0)\n",
+ "pos_dict={'NNP':[0], 'IN':[1], 'JJ':[2], 'NN':[3], ',':[4], 'RB':[5], ':':[6], 'EX':[7], 'VBD':[8],\n",
+ "'WRB':[9], 'CD':[10], 'DT':[11], 'TO':[12], 'VB':[13], '.':[14], '(':[15], ')':[16], 'CC':[17],\n",
+ "'POS':[18], 'VBP':[19], 'NNS':[20], 'PRP':[21], 'VBZ':[22], 'VBG':[23], 'VBN':[24], 'MD':[25],\n",
+ "'PRP$':[26], 'JJR':[27], 'JJS':[28], 'UH':[29], 'RP':[30], 'WP':[31], 'WDT':[32], '#':[33], \"''\":[34]}\n",
+ "for i in tokens_tagged:\n",
+ "    if i[1] in pos_dict:  # skip tags outside the 35 training features\n",
+ "        x[pos_dict[i[1]][0]]+=1\n",
+ "x=np.array(x)\n",
+ "x=x.reshape(1,-1)\n",
+ "# x"
+ ]
+ },
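One caveat about the cell above: the classifier was trained on min-max-scaled columns, while x holds raw tag counts. Applying the fitted scaler keeps the two feature spaces consistent (a sketch, assuming the scaler object from the feature-extraction cell is still in scope):

    x = scaler.transform(x)  # map raw counts into the scaled space the SVM was trained on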
255
+ {
256
+ "cell_type": "code",
257
+ "execution_count": 35,
258
+ "id": "1d0066d6",
259
+ "metadata": {},
260
+ "outputs": [
261
+ {
262
+ "name": "stdout",
263
+ "output_type": "stream",
264
+ "text": [
265
+ "SPAM\n"
266
+ ]
267
+ }
268
+ ],
269
+ "source": [
270
+ "pred=clf.predict(x)\n",
271
+ "if pred==0:\n",
272
+ " print(\"NOT SPAM\")\n",
273
+ "else:\n",
274
+ " print(\"SPAM\")"
275
+ ]
276
+ },
277
+ {
278
+ "cell_type": "code",
279
+ "execution_count": null,
280
+ "id": "7440777a",
281
+ "metadata": {},
282
+ "outputs": [],
283
+ "source": []
284
+ }
285
+ ],
286
+ "metadata": {
287
+ "kernelspec": {
288
+ "display_name": "Python 3 (ipykernel)",
289
+ "language": "python",
290
+ "name": "python3"
291
+ },
292
+ "language_info": {
293
+ "codemirror_mode": {
294
+ "name": "ipython",
295
+ "version": 3
296
+ },
297
+ "file_extension": ".py",
298
+ "mimetype": "text/x-python",
299
+ "name": "python",
300
+ "nbconvert_exporter": "python",
301
+ "pygments_lexer": "ipython3",
302
+ "version": "3.10.4"
303
+ }
304
+ },
305
+ "nbformat": 4,
306
+ "nbformat_minor": 5
307
+ }
app.py ADDED
@@ -0,0 +1,189 @@
+ #!/usr/bin/env python
+ # coding: utf-8
+
+ # In[25]:
+
+
+ import nltk
+ nltk.download("averaged_perceptron_tagger")
+ nltk.download("punkt")
+ from nltk.tokenize import word_tokenize
+ import pandas as pd
+ import csv
+ import numpy as np
+ from sklearn import preprocessing, svm, model_selection, metrics
+ from sklearn.preprocessing import MinMaxScaler
+ import gradio as gr
+
+
+ # In[26]:
+
+
+ tok_dict={}
+
+ lst=['NNP', 'IN', 'JJ', 'NN', ',', 'RB', ':', 'EX', 'VBD', 'WRB', 'CD', 'DT', 'TO', 'VB', '.',
+      '(', ')', 'CC', 'POS', 'VBP', 'NNS', 'PRP', 'VBZ', 'VBG', 'VBN', 'MD', 'PRP$', 'JJR', 'JJS', 'UH', 'RP', 'WP', 'WDT', '#', "''"]
+
+ pd_dict={'msg':[],'label':[],'label_no':[],'NNP':[], 'IN':[], 'JJ':[], 'NN':[], ',':[], 'RB':[], ':':[], 'EX':[], 'VBD':[],
+ 'WRB':[], 'CD':[], 'DT':[], 'TO':[], 'VB':[], '.':[], '(':[], ')':[], 'CC':[],
+ 'POS':[], 'VBP':[], 'NNS':[], 'PRP':[], 'VBZ':[], 'VBG':[], 'VBN':[], 'MD':[],
+ 'PRP$':[], 'JJR':[], 'JJS':[], 'UH':[], 'RP':[], 'WP':[], 'WDT':[], '#':[], "''":[]}
+
+ with open("spam_db.csv", 'r') as file:
+     csvreader = csv.reader(file)
+     j=0
+     k=0
+     for row in csvreader:
+         if j==0:
+             j=1
+             continue
+         pd_dict['msg'].append(row[1])
+         pd_dict['label'].append(row[0])
+         if row[0]=='spam':
+             pd_dict['label_no'].append(1)
+         else:
+             pd_dict['label_no'].append(0)
+         for label in lst:
+             pd_dict[label].append(0)
+         text=row[1]
+         tokens=word_tokenize(text)
+         tokens_tagged=nltk.pos_tag(tokens)
+         for i in tokens_tagged:
+             if i[1] in tok_dict:
+                 tok_dict[i[1]].append(i[0])
+             else:
+                 tok_dict[i[1]]=[i[0]]
+             if i[1] in pd_dict:
+                 pd_dict[i[1]][k]+=1
+         k+=1
+
+ tok_dict1={}
+ for i in tok_dict:
+     tok_dict1[i]=len(tok_dict[i])
+
+ del_lst=[]
+ for i in tok_dict1:
+     if tok_dict1[i]<100:
+         del_lst.append(i)
+
+ for i in del_lst:
+     tok_dict1.pop(i)
+
+ lst=[]
+ for i in tok_dict1:
+     lst.append(i)
+
+ df=pd.DataFrame(pd_dict)
+ numeric_columns = df.drop(['msg', 'label', 'label_no'], axis=1).columns
+
+ # Create the MinMaxScaler object
+ scaler = MinMaxScaler()
+
+ # Normalize the numeric columns using min-max normalization
+ df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
+
+ print(df.head())
+
+
+ # In[27]:
+
+
+ X=np.array(df.drop(['msg','label','label_no'], axis=1))
+ y=np.array(df['label_no'])
+
+
+ # In[32]:
+
+
+ X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25)
+ clf=svm.SVC(kernel='poly')
+ clf.fit(X_train, y_train)
+ accuracy = clf.score(X_test, y_test)
+ print(accuracy)
+
+
+ # In[36]:
+
+
+ y_pred = clf.predict(X_test)
+
+ precision = metrics.precision_score(y_test, y_pred, average='weighted')
+ recall = metrics.recall_score(y_test, y_pred, average='weighted')
+ f1 = metrics.f1_score(y_test, y_pred, average='weighted')
+
+ print("Precision:", precision)
+ print("Recall:", recall)
+ print("F1 score:", f1)
+
+ confusion_mat = metrics.confusion_matrix(y_test, y_pred)
+ print(confusion_mat)  # bare expression displays nothing in a script
+
+
+ # In[33]:
+
+
+ text='''WINNER!! As a valued network customer you have been selected to receive a
+ £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'''
+
+
+ # In[34]:
+
+
+ tokens=word_tokenize(text)
+ tokens_tagged=nltk.pos_tag(tokens)
+ x=[]
+ for i in range(35):
+     x.append(0)
+ pos_dict={'NNP':[0], 'IN':[1], 'JJ':[2], 'NN':[3], ',':[4], 'RB':[5], ':':[6], 'EX':[7], 'VBD':[8],
+ 'WRB':[9], 'CD':[10], 'DT':[11], 'TO':[12], 'VB':[13], '.':[14], '(':[15], ')':[16], 'CC':[17],
+ 'POS':[18], 'VBP':[19], 'NNS':[20], 'PRP':[21], 'VBZ':[22], 'VBG':[23], 'VBN':[24], 'MD':[25],
+ 'PRP$':[26], 'JJR':[27], 'JJS':[28], 'UH':[29], 'RP':[30], 'WP':[31], 'WDT':[32], '#':[33], "''":[34]}
+ for i in tokens_tagged:
+     if i[1] in pos_dict:  # skip tags outside the 35 training features
+         x[pos_dict[i[1]][0]]+=1
+ x=np.array(x)
+ x=x.reshape(1,-1)
+ # x
+
+
+ # In[35]:
+
+
+ pred=clf.predict(x)
+ if pred==0:
+     print("NOT SPAM")
+ else:
+     print("SPAM")
+
+
+ # In[ ]:
+
+
+ def spam_detection(txt):
+     tokens=word_tokenize(txt)  # was word_tokenize(text), which read the global demo message
+     tokens_tagged=nltk.pos_tag(tokens)
+     x=[]
+     for i in range(35):
+         x.append(0)
+     pos_dict={'NNP':[0], 'IN':[1], 'JJ':[2], 'NN':[3], ',':[4], 'RB':[5], ':':[6], 'EX':[7], 'VBD':[8],
+     'WRB':[9], 'CD':[10], 'DT':[11], 'TO':[12], 'VB':[13], '.':[14], '(':[15], ')':[16], 'CC':[17],
+     'POS':[18], 'VBP':[19], 'NNS':[20], 'PRP':[21], 'VBZ':[22], 'VBG':[23], 'VBN':[24], 'MD':[25],
+     'PRP$':[26], 'JJR':[27], 'JJS':[28], 'UH':[29], 'RP':[30], 'WP':[31], 'WDT':[32], '#':[33], "''":[34]}
+     for i in tokens_tagged:
+         if i[1] in pos_dict:  # skip tags outside the 35 training features
+             x[pos_dict[i[1]][0]]+=1
+     x=np.array(x)
+     x=x.reshape(1,-1)
+     pred=clf.predict(x)
+     if pred==0:
+         return "NOT SPAM"
+     else:
+         return "SPAM"
+
+
+ iface = gr.Interface(fn=spam_detection, inputs="text", outputs="text")
+ iface.launch()
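A quick way to sanity-check the handler before relying on the Gradio UI (a sketch; the message is one of the spam rows shown in df.head() earlier):

    print(spam_detection("Free entry in 2 a wkly comp to win FA Cup final tkts"))  # ideally prints SPAM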
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ nltk
+ scikit-learn
spam_db.csv ADDED
The diff for this file is too large to render. See raw diff
 
test.py ADDED
@@ -0,0 +1,86 @@
+ import nltk
+ from nltk.tokenize import word_tokenize
+ import pandas as pd
+ import csv
+
+
+ tok_dict={}
+
+ lst=['NNP', 'IN', 'JJ', 'NN', ',', 'RB', ':', 'EX', 'VBD', 'WRB', 'CD', 'DT', 'TO', 'VB', '.',
+      '(', ')', 'CC', 'POS', 'VBP', 'NNS', 'PRP', 'VBZ', 'VBG', 'VBN', 'MD', 'PRP$', 'JJR', 'JJS', 'UH', 'RP', 'WP', 'WDT', '#', "''"]
+
+ pd_dict={'msg':[],'label':[],'label_no':[],'NNP':[], 'IN':[], 'JJ':[], 'NN':[], ',':[], 'RB':[], ':':[], 'EX':[], 'VBD':[],
+ 'WRB':[], 'CD':[], 'DT':[], 'TO':[], 'VB':[], '.':[], '(':[], ')':[], 'CC':[],
+ 'POS':[], 'VBP':[], 'NNS':[], 'PRP':[], 'VBZ':[], 'VBG':[], 'VBN':[], 'MD':[],
+ 'PRP$':[], 'JJR':[], 'JJS':[], 'UH':[], 'RP':[], 'WP':[], 'WDT':[], '#':[], "''":[]}
+
+ with open("spam.csv", 'r') as file:
+     csvreader = csv.reader(file)
+     j=0
+     k=0
+     for row in csvreader:
+         if j==0:
+             j=1
+             continue
+         pd_dict['msg'].append(row[1])
+         pd_dict['label'].append(row[0])
+         if row[0]=='spam':
+             pd_dict['label_no'].append(1)
+         else:
+             pd_dict['label_no'].append(0)
+         for label in lst:
+             pd_dict[label].append(0)
+         text=row[1]
+         tokens=word_tokenize(text)
+         tokens_tagged=nltk.pos_tag(tokens)
+         # print(tokens_tagged,end='\n\n')
+         for i in tokens_tagged:
+             if i[1] in tok_dict:
+                 tok_dict[i[1]].append(i[0])
+             else:
+                 tok_dict[i[1]]=[i[0]]
+             if i[1] in pd_dict:
+                 pd_dict[i[1]][k]+=1
+         k+=1
+
+
+
+
+ #text=""
+ #tokens=word_tokenize(text)
+ #tokens_tagged=nltk.pos_tag(tokens)
+ #print(tokens_tagged,end='\n\n')
+ #for i in tokens_tagged:
+ #    if i[1] in tok_dict:
+ #        tok_dict[i[1]].append(i[0])
+ #    else:
+ #        tok_dict[i[1]]=[i[0]]
+
+ #print(tok_dict, end="\n\n")
+
+ tok_dict1={}
+ for i in tok_dict:
+     tok_dict1[i]=len(tok_dict[i])
+
+ del_lst=[]
+ for i in tok_dict1:
+     print(i," ",tok_dict1[i])
+     if tok_dict1[i]<100:
+         del_lst.append(i)
+
+ print(del_lst)
+ for i in del_lst:
+     tok_dict1.pop(i)
+
+
+ print(tok_dict1)
+
+ lst=[]
+ for i in tok_dict1:
+     lst.append(i)
+
+
+ print(lst,len(lst))
+
+ df=pd.DataFrame(pd_dict)
+ print(df.head())
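The rare-tag filter in test.py (drop tags seen fewer than 100 times) can be written more compactly; a sketch over the same tok_dict1 counts:

    tok_dict1 = {tag: n for tag, n in tok_dict1.items() if n >= 100}
    lst = list(tok_dict1)  # surviving tags, in insertion order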
testing.py ADDED
(Identical to test.py above, except it reads "spam_db.csv" instead of "spam.csv".)