spam-detection / app.py
zen21's picture
Update app.py
b9a64e2
#!/usr/bin/env python
# coding: utf-8
# In[25]:
import nltk
nltk.download("averaged_perceptron_tagger")
nltk.download("punkt")
from nltk.tokenize import word_tokenize
import pandas as pd
import csv
import numpy as np
from sklearn import preprocessing , svm , model_selection, metrics
from sklearn.preprocessing import MinMaxScaler
import gradio as gr
# In[26]:
# ---------------------------------------------------------------------------
# Corpus loading and feature extraction.
# Each message in spam_db.csv becomes one row of per-POS-tag token counts.
# ---------------------------------------------------------------------------

# Every token seen in the corpus, grouped under the POS tag NLTK gave it.
tok_dict = {}

# POS tags that each become one count column, in column order.
lst = [
    'NNP', 'IN', 'JJ', 'NN', ',', 'RB', ':', 'EX', 'VBD', 'WRB', 'CD', 'DT',
    'TO', 'VB', '.', '(', ')', 'CC', 'POS', 'VBP', 'NNS', 'PRP', 'VBZ', 'VBG',
    'VBN', 'MD', 'PRP$', 'JJR', 'JJS', 'UH', 'RP', 'WP', 'WDT', '#', "''",
]

# Column store for the DataFrame: the raw message, its textual label, a
# numeric label (1 for 'spam', 0 otherwise), then one count list per tag.
pd_dict = {'msg': [], 'label': [], 'label_no': []}
pd_dict.update((tag, []) for tag in lst)

with open("spam_db.csv", 'r', encoding='utf-8', errors="ignore") as file:
    csvreader = csv.reader(file)
    next(csvreader, None)  # the first row is the header, not a message
    for row_idx, row in enumerate(csvreader):
        # Column 0 holds the label, column 1 the message text.
        row_label, text = row[0], row[1]
        pd_dict['msg'].append(text)
        pd_dict['label'].append(row_label)
        pd_dict['label_no'].append(1 if row_label == 'spam' else 0)

        # Start this message's counters at zero for every feature tag.
        for tag in lst:
            pd_dict[tag].append(0)

        for word, tag in nltk.pos_tag(word_tokenize(text)):
            tok_dict.setdefault(tag, []).append(word)
            # Tags without a column are recorded in tok_dict only.
            if tag in pd_dict:
                pd_dict[tag][row_idx] += 1

# Corpus-wide occurrence count per tag, keeping tags seen at least 100 times.
tok_dict1 = {tag: len(words) for tag, words in tok_dict.items()}
del_lst = [tag for tag, seen in tok_dict1.items() if seen < 100]
for tag in del_lst:
    tok_dict1.pop(tag)

# `lst` is rebound to the frequent tags; nothing in the code shown reads it
# again after this point.
lst = list(tok_dict1)

df = pd.DataFrame(pd_dict)
numeric_columns = df.drop(columns=['msg', 'label', 'label_no']).columns

# Min-max normalise every tag-count column. The fitted scaler stays at module
# level so the same transformation is available after training.
scaler = MinMaxScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
print(df.head())
# In[27]:
# ---------------------------------------------------------------------------
# Train an SVM on the tag-count features and report hold-out metrics.
# ---------------------------------------------------------------------------

# Feature matrix (every column except the message and its labels) and the
# 0/1 target vector.
X = np.array(df.drop(['msg', 'label', 'label_no'], axis=1))
y = np.array(df['label_no'])

# Hold out a quarter of the messages for evaluation. No random_state is
# given, so the split (and the scores below) vary from run to run.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.25
)

clf = svm.SVC(kernel='poly')
clf.fit(X_train, y_train)

accuracy = clf.score(X_test, y_test)
print(accuracy)

# Class-weighted precision / recall / F1 on the held-out messages.
y_pred = clf.predict(X_test)
precision = metrics.precision_score(y_test, y_pred, average='weighted')
recall = metrics.recall_score(y_test, y_pred, average='weighted')
f1 = metrics.f1_score(y_test, y_pred, average='weighted')
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)

# A bare `confusion_mat` expression only renders inside a notebook; when run
# as a script the matrix has to be printed to be visible.
confusion_mat = metrics.confusion_matrix(y_test, y_pred)
print("Confusion matrix:")
print(confusion_mat)
# In[33]:
# ---------------------------------------------------------------------------
# Smoke test: classify one sample message with the freshly trained model.
# ---------------------------------------------------------------------------
text='''WINNER!! As a valued network customer you have been selected to receivea
å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'''

tokens = word_tokenize(text)
tokens_tagged = nltk.pos_tag(tokens)

# Map each feature tag to its column position, taken from the training
# columns so the vector layout cannot drift from what the model was fit on.
pos_dict = {tag: col for col, tag in enumerate(numeric_columns)}

# Count tokens per feature tag. Tags without a feature column are skipped,
# as in the training loop, instead of raising KeyError.
x = [0] * len(pos_dict)
for _, tag in tokens_tagged:
    if tag in pos_dict:
        x[pos_dict[tag]] += 1

# The classifier was trained on min-max scaled counts, so the sample must go
# through the same fitted scaler before prediction.
x = scaler.transform(pd.DataFrame([x], columns=numeric_columns))

pred = clf.predict(x)
if pred[0] == 0:
    print("NOT SPAM")
else:
    print("SPAM")
# In[ ]:
def spam_detection(txt):
    """Classify a message as spam using its part-of-speech tag counts.

    The text is tokenised and POS-tagged with NLTK, turned into a vector of
    per-tag counts laid out like the training columns, scaled with the
    module-level ``scaler`` fitted on the training data, and passed to the
    module-level classifier ``clf``.

    Args:
        txt: The message text to classify.

    Returns:
        ``"NOT SPAM"`` when the classifier predicts label 0, otherwise
        ``"SPAM"``.
    """
    # Column position of every feature tag, derived from the training columns
    # so the vector layout always matches what the model was fit on.
    tag_to_col = {tag: col for col, tag in enumerate(numeric_columns)}

    counts = [0] * len(tag_to_col)
    for _, tag in nltk.pos_tag(word_tokenize(txt)):
        col = tag_to_col.get(tag)
        # Arbitrary user text can produce tags that have no feature column;
        # skip them (as the training loop does) rather than raise KeyError.
        if col is not None:
            counts[col] += 1

    # The model was trained on min-max scaled counts; apply the same fitted
    # scaling here so inference inputs match the training inputs.
    features = scaler.transform(
        pd.DataFrame([counts], columns=numeric_columns)
    )

    pred = clf.predict(features)
    return "NOT SPAM" if pred[0] == 0 else "SPAM"
# Wire the classifier into a Gradio interface: one free-text input and one
# text output carrying the verdict string returned by spam_detection.
iface = gr.Interface(
    fn=spam_detection,
    inputs="text",
    outputs="text",
)

# Start serving the interface.
iface.launch()