# NOTE(review): removed web-page scrape artifacts ("Spaces:" / "Sleeping") that were not part of the program.
#!/usr/bin/env python
# coding: utf-8
"""SMS spam classifier: counts Penn Treebank POS tags per message,
trains an SVM on the scaled counts, and serves predictions via Gradio."""

# In[25]:
import csv

import gradio as gr
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import preprocessing, svm, model_selection, metrics
from sklearn.preprocessing import MinMaxScaler

# Fetch the tokenizer and tagger models so word_tokenize/pos_tag work
# on a fresh machine (no-ops if already downloaded).
nltk.download("averaged_perceptron_tagger")
nltk.download("punkt")
# In[26]:
# tok_dict maps each POS tag seen in the corpus to the list of words that
# carried that tag (used later to drop tags that occur fewer than 100 times).
tok_dict = {}
# Penn Treebank POS tags used as the feature columns of the dataset.
lst = ['NNP', 'IN', 'JJ', 'NN', ',', 'RB', ':', 'EX', 'VBD', 'WRB', 'CD', 'DT', 'TO', 'VB', '.',
       '(', ')', 'CC', 'POS', 'VBP', 'NNS', 'PRP', 'VBZ', 'VBG', 'VBN', 'MD', 'PRP$', 'JJR', 'JJS',
       'UH', 'RP', 'WP', 'WDT', '#', "''"]
# Column-oriented accumulator: one row per message, one count column per tag.
pd_dict = {'msg': [], 'label': [], 'label_no': []}
for tag in lst:
    pd_dict[tag] = []
# spam_db.csv layout (from the indexing below): column 0 = label
# ('spam' or 'ham'), column 1 = message text; the first row is a header.
with open("spam_db.csv", 'r', encoding='utf-8', errors="ignore") as file:
    csvreader = csv.reader(file)
    next(csvreader, None)  # skip the header row (replaces the j==0 flag)
    for k, row in enumerate(csvreader):
        pd_dict['msg'].append(row[1])
        pd_dict['label'].append(row[0])
        # Numeric target: 1 = spam, 0 = ham.
        pd_dict['label_no'].append(1 if row[0] == 'spam' else 0)
        # Start every tag count at 0 for this row.
        for label in lst:
            pd_dict[label].append(0)
        tokens_tagged = nltk.pos_tag(word_tokenize(row[1]))
        for word, tag in tokens_tagged:
            # Record the word under its tag and bump this row's count.
            tok_dict.setdefault(tag, []).append(word)
            if tag in pd_dict:
                pd_dict[tag][k] += 1
# Count occurrences per tag and drop tags seen fewer than 100 times;
# `lst` is rebuilt to hold only the surviving tags.
tok_dict1 = {tag: len(words) for tag, words in tok_dict.items()}
del_lst = [tag for tag, count in tok_dict1.items() if count < 100]
for tag in del_lst:
    tok_dict1.pop(tag)
lst = list(tok_dict1)
df = pd.DataFrame(pd_dict)
# Scale only the tag-count columns; leave text/label columns untouched.
numeric_columns = df.drop(['msg', 'label', 'label_no'], axis=1).columns
# Create the MinMaxScaler object
scaler = MinMaxScaler()
# Normalize the numeric columns using min-max normalization into [0, 1].
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
print(df.head())
# In[27]:
# Feature matrix: scaled POS-tag counts; target: 0 = ham, 1 = spam.
X = np.array(df.drop(['msg', 'label', 'label_no'], axis=1))
y = np.array(df['label_no'])

# In[32]:
# Hold out 25% of the rows for evaluation, then fit a polynomial-kernel SVM.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25)
clf = svm.SVC(kernel='poly')
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print(accuracy)
# In[36]:
# Weighted averaging accounts for the ham/spam class imbalance.
y_pred = clf.predict(X_test)
precision = metrics.precision_score(y_test, y_pred, average='weighted')
recall = metrics.recall_score(y_test, y_pred, average='weighted')
f1 = metrics.f1_score(y_test, y_pred, average='weighted')
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)
confusion_mat = metrics.confusion_matrix(y_test, y_pred)
# The bare `confusion_mat` expression only displayed in the notebook;
# print it so the script shows it as well.
print(confusion_mat)
# In[33]:
# Sanity-check message: a known spam SMS.
text = '''WINNER!! As a valued network customer you have been selected to receivea
å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'''

# In[34]:
tokens = word_tokenize(text)
tokens_tagged = nltk.pos_tag(tokens)
# Map each POS tag to its position in the 35-wide feature vector; must
# match the column order the classifier was trained on.
pos_dict = {'NNP': 0, 'IN': 1, 'JJ': 2, 'NN': 3, ',': 4, 'RB': 5, ':': 6, 'EX': 7, 'VBD': 8,
            'WRB': 9, 'CD': 10, 'DT': 11, 'TO': 12, 'VB': 13, '.': 14, '(': 15, ')': 16, 'CC': 17,
            'POS': 18, 'VBP': 19, 'NNS': 20, 'PRP': 21, 'VBZ': 22, 'VBG': 23, 'VBN': 24, 'MD': 25,
            'PRP$': 26, 'JJR': 27, 'JJS': 28, 'UH': 29, 'RP': 30, 'WP': 31, 'WDT': 32, '#': 33, "''": 34}
x = [0] * 35
for word, tag in tokens_tagged:
    # Ignore tags that are not feature columns instead of raising KeyError
    # (the tagger can emit tags outside the 35 trained on, e.g. '$').
    if tag in pos_dict:
        x[pos_dict[tag]] += 1
x = np.array(x).reshape(1, -1)  # single sample -> shape (1, 35)

# In[35]:
pred = clf.predict(x)
print("NOT SPAM" if pred[0] == 0 else "SPAM")
# In[ ]:
def spam_detection(txt):
    """Classify a message with the trained POS-count SVM.

    Parameters
    ----------
    txt : str
        Raw message text.

    Returns
    -------
    str
        "SPAM" or "NOT SPAM".
    """
    # Tag -> feature-vector index; must match the training column order.
    pos_dict = {'NNP': 0, 'IN': 1, 'JJ': 2, 'NN': 3, ',': 4, 'RB': 5, ':': 6, 'EX': 7, 'VBD': 8,
                'WRB': 9, 'CD': 10, 'DT': 11, 'TO': 12, 'VB': 13, '.': 14, '(': 15, ')': 16, 'CC': 17,
                'POS': 18, 'VBP': 19, 'NNS': 20, 'PRP': 21, 'VBZ': 22, 'VBG': 23, 'VBN': 24, 'MD': 25,
                'PRP$': 26, 'JJR': 27, 'JJS': 28, 'UH': 29, 'RP': 30, 'WP': 31, 'WDT': 32, '#': 33, "''": 34}
    x = [0] * 35
    for word, tag in nltk.pos_tag(word_tokenize(txt)):
        # Skip tags that are not feature columns instead of raising KeyError.
        if tag in pos_dict:
            x[pos_dict[tag]] += 1
    features = np.array(x).reshape(1, -1)  # single sample -> shape (1, 35)
    pred = clf.predict(features)
    return "NOT SPAM" if pred[0] == 0 else "SPAM"


# Simple text-in/text-out web UI around the classifier.
iface = gr.Interface(fn=spam_detection, inputs="text", outputs="text")
iface.launch()