#!/usr/bin/env python
# coding: utf-8

# In[25]:

import csv

import gradio as gr
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import svm, model_selection, metrics
from sklearn.preprocessing import MinMaxScaler

nltk.download("averaged_perceptron_tagger")
nltk.download("punkt")


# In[26]:

# The 35 POS tags used as features: one count column per tag.
pos_tags = ['NNP', 'IN', 'JJ', 'NN', ',', 'RB', ':', 'EX', 'VBD', 'WRB', 'CD',
            'DT', 'TO', 'VB', '.', '(', ')', 'CC', 'POS', 'VBP', 'NNS', 'PRP',
            'VBZ', 'VBG', 'VBN', 'MD', 'PRP$', 'JJR', 'JJS', 'UH', 'RP', 'WP',
            'WDT', '#', "''"]

pd_dict = {'msg': [], 'label': [], 'label_no': []}
for tag in pos_tags:
    pd_dict[tag] = []

tok_dict = {}
with open("spam_db.csv", 'r', encoding='utf-8', errors="ignore") as file:
    csvreader = csv.reader(file)
    next(csvreader)  # skip the header row
    k = 0
    for row in csvreader:
        pd_dict['msg'].append(row[1])
        pd_dict['label'].append(row[0])
        pd_dict['label_no'].append(1 if row[0] == 'spam' else 0)
        for tag in pos_tags:
            pd_dict[tag].append(0)
        # Count the POS tags occurring in this message.
        for word, tag in nltk.pos_tag(word_tokenize(row[1])):
            tok_dict.setdefault(tag, []).append(word)
            if tag in pd_dict:
                pd_dict[tag][k] += 1
        k += 1

# Corpus-wide tag frequencies; tags seen fewer than 100 times are dropped.
# (This list is informational only -- the feature columns above are unchanged.)
tok_dict1 = {tag: len(words) for tag, words in tok_dict.items()}
tok_dict1 = {tag: count for tag, count in tok_dict1.items() if count >= 100}
lst = list(tok_dict1)

df = pd.DataFrame(pd_dict)
numeric_columns = df.drop(['msg', 'label', 'label_no'], axis=1).columns

# Normalize the POS-count columns to [0, 1] with min-max scaling.
scaler = MinMaxScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
print(df.head())


# In[27]:

X = np.array(df.drop(['msg', 'label', 'label_no'], axis=1))
y = np.array(df['label_no'])


# In[32]:

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25)

clf = svm.SVC(kernel='poly')
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print(accuracy)


# In[36]:

y_pred = clf.predict(X_test)
precision = metrics.precision_score(y_test, y_pred, average='weighted')
recall = metrics.recall_score(y_test, y_pred, average='weighted')
f1 = metrics.f1_score(y_test, y_pred, average='weighted')
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)

confusion_mat = metrics.confusion_matrix(y_test, y_pred)
print(confusion_mat)
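# In[ ]:

# A single 75/25 split can give a noisy accuracy estimate; cross-validation
# averages over several splits. A minimal sketch reusing X and y from above --
# the 5-fold setting is an assumption, not part of the original notebook.
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(svm.SVC(kernel='poly'), X, y, cv=5)
print("5-fold CV accuracy: %.3f +/- %.3f" % (cv_scores.mean(), cv_scores.std()))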
# In[33]:

text = '''WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341.
Valid 12 hours only.'''


# In[34]:

# Map each POS tag to its column index in the feature matrix (same order as pos_tags).
pos_index = {tag: i for i, tag in enumerate(pos_tags)}

def pos_features(txt):
    """Turn a message into a 1x35 vector of POS-tag counts, scaled like the training data."""
    x = np.zeros(len(pos_tags))
    for word, tag in nltk.pos_tag(word_tokenize(txt)):
        if tag in pos_index:  # skip tags the model was not trained on
            x[pos_index[tag]] += 1
    # Apply the scaler fitted on the training set, since clf was trained on scaled counts.
    return scaler.transform(x.reshape(1, -1))

x = pos_features(text)


# In[35]:

pred = clf.predict(x)
if pred[0] == 0:
    print("NOT SPAM")
else:
    print("SPAM")


# In[ ]:

def spam_detection(txt):
    if clf.predict(pos_features(txt))[0] == 0:
        return "NOT SPAM"
    return "SPAM"

iface = gr.Interface(fn=spam_detection, inputs="text", outputs="text")
iface.launch()
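# In[ ]:

# Optional: persist the fitted classifier and scaler so the app can be restarted
# without retraining. A minimal sketch using joblib; the file names are assumptions,
# and when run as a script this executes only after the Gradio server is closed.
import joblib

joblib.dump(clf, "spam_svm.joblib")
joblib.dump(scaler, "spam_scaler.joblib")
# Reload later with clf = joblib.load("spam_svm.joblib") and
# scaler = joblib.load("spam_scaler.joblib").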