# NOTE(review): removed web-page scrape artifacts ("Spaces:" / "Sleeping") that were not part of the program.
#!/usr/bin/env python
# coding: utf-8
"""SMS spam classifier: counts Penn Treebank POS tags per message,
trains an SVM on the scaled counts, and serves predictions via Gradio."""

# In[25]:
import csv

import gradio as gr
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import preprocessing, svm, model_selection, metrics
from sklearn.preprocessing import MinMaxScaler

# Fetch the tokenizer and tagger models so word_tokenize/pos_tag work
# on a fresh machine (no-ops if already downloaded).
nltk.download("averaged_perceptron_tagger")
nltk.download("punkt")
# In[26]:
# tok_dict maps each POS tag seen in the corpus to the list of words that
# carried that tag (used later to drop tags that occur fewer than 100 times).
tok_dict = {}
# Penn Treebank POS tags used as the feature columns of the dataset.
lst = ['NNP', 'IN', 'JJ', 'NN', ',', 'RB', ':', 'EX', 'VBD', 'WRB', 'CD', 'DT', 'TO', 'VB', '.',
       '(', ')', 'CC', 'POS', 'VBP', 'NNS', 'PRP', 'VBZ', 'VBG', 'VBN', 'MD', 'PRP$', 'JJR', 'JJS',
       'UH', 'RP', 'WP', 'WDT', '#', "''"]
# Column-oriented accumulator: one row per message, one count column per tag.
pd_dict = {'msg': [], 'label': [], 'label_no': []}
for tag in lst:
    pd_dict[tag] = []
# spam_db.csv layout (from the indexing below): column 0 = label
# ('spam' or 'ham'), column 1 = message text; the first row is a header.
with open("spam_db.csv", 'r', encoding='utf-8', errors="ignore") as file:
    csvreader = csv.reader(file)
    next(csvreader, None)  # skip the header row (replaces the j==0 flag)
    for k, row in enumerate(csvreader):
        pd_dict['msg'].append(row[1])
        pd_dict['label'].append(row[0])
        # Numeric target: 1 = spam, 0 = ham.
        pd_dict['label_no'].append(1 if row[0] == 'spam' else 0)
        # Start every tag count at 0 for this row.
        for label in lst:
            pd_dict[label].append(0)
        tokens_tagged = nltk.pos_tag(word_tokenize(row[1]))
        for word, tag in tokens_tagged:
            # Record the word under its tag and bump this row's count.
            tok_dict.setdefault(tag, []).append(word)
            if tag in pd_dict:
                pd_dict[tag][k] += 1
# Count occurrences per tag and drop tags seen fewer than 100 times;
# `lst` is rebuilt to hold only the surviving tags.
tok_dict1 = {tag: len(words) for tag, words in tok_dict.items()}
del_lst = [tag for tag, count in tok_dict1.items() if count < 100]
for tag in del_lst:
    tok_dict1.pop(tag)
lst = list(tok_dict1)
df = pd.DataFrame(pd_dict)
# Scale only the tag-count columns; leave text/label columns untouched.
numeric_columns = df.drop(['msg', 'label', 'label_no'], axis=1).columns
# Create the MinMaxScaler object
scaler = MinMaxScaler()
# Normalize the numeric columns using min-max normalization into [0, 1].
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
print(df.head())
# In[27]:
# Feature matrix: scaled POS-tag counts; target: 0 = ham, 1 = spam.
X = np.array(df.drop(['msg', 'label', 'label_no'], axis=1))
y = np.array(df['label_no'])

# In[32]:
# Hold out 25% of the rows for evaluation, then fit a polynomial-kernel SVM.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25)
clf = svm.SVC(kernel='poly')
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print(accuracy)
# In[36]:
# Weighted averaging accounts for the ham/spam class imbalance.
y_pred = clf.predict(X_test)
precision = metrics.precision_score(y_test, y_pred, average='weighted')
recall = metrics.recall_score(y_test, y_pred, average='weighted')
f1 = metrics.f1_score(y_test, y_pred, average='weighted')
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)
confusion_mat = metrics.confusion_matrix(y_test, y_pred)
# The bare `confusion_mat` expression only displayed in the notebook;
# print it so the script shows it as well.
print(confusion_mat)
# In[33]:
# Sanity-check message: a known spam SMS.
text = '''WINNER!! As a valued network customer you have been selected to receivea
å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.'''

# In[34]:
tokens = word_tokenize(text)
tokens_tagged = nltk.pos_tag(tokens)
# Map each POS tag to its position in the 35-wide feature vector; must
# match the column order the classifier was trained on.
pos_dict = {'NNP': 0, 'IN': 1, 'JJ': 2, 'NN': 3, ',': 4, 'RB': 5, ':': 6, 'EX': 7, 'VBD': 8,
            'WRB': 9, 'CD': 10, 'DT': 11, 'TO': 12, 'VB': 13, '.': 14, '(': 15, ')': 16, 'CC': 17,
            'POS': 18, 'VBP': 19, 'NNS': 20, 'PRP': 21, 'VBZ': 22, 'VBG': 23, 'VBN': 24, 'MD': 25,
            'PRP$': 26, 'JJR': 27, 'JJS': 28, 'UH': 29, 'RP': 30, 'WP': 31, 'WDT': 32, '#': 33, "''": 34}
x = [0] * 35
for word, tag in tokens_tagged:
    # Ignore tags that are not feature columns instead of raising KeyError
    # (the tagger can emit tags outside the 35 trained on, e.g. '$').
    if tag in pos_dict:
        x[pos_dict[tag]] += 1
x = np.array(x).reshape(1, -1)  # single sample -> shape (1, 35)

# In[35]:
pred = clf.predict(x)
print("NOT SPAM" if pred[0] == 0 else "SPAM")
# In[ ]:
def spam_detection(txt):
    """Classify a message with the trained POS-count SVM.

    Parameters
    ----------
    txt : str
        Raw message text.

    Returns
    -------
    str
        "SPAM" or "NOT SPAM".
    """
    # Tag -> feature-vector index; must match the training column order.
    pos_dict = {'NNP': 0, 'IN': 1, 'JJ': 2, 'NN': 3, ',': 4, 'RB': 5, ':': 6, 'EX': 7, 'VBD': 8,
                'WRB': 9, 'CD': 10, 'DT': 11, 'TO': 12, 'VB': 13, '.': 14, '(': 15, ')': 16, 'CC': 17,
                'POS': 18, 'VBP': 19, 'NNS': 20, 'PRP': 21, 'VBZ': 22, 'VBG': 23, 'VBN': 24, 'MD': 25,
                'PRP$': 26, 'JJR': 27, 'JJS': 28, 'UH': 29, 'RP': 30, 'WP': 31, 'WDT': 32, '#': 33, "''": 34}
    x = [0] * 35
    for word, tag in nltk.pos_tag(word_tokenize(txt)):
        # Skip tags that are not feature columns instead of raising KeyError.
        if tag in pos_dict:
            x[pos_dict[tag]] += 1
    features = np.array(x).reshape(1, -1)  # single sample -> shape (1, 35)
    pred = clf.predict(features)
    return "NOT SPAM" if pred[0] == 0 else "SPAM"


# Simple text-in/text-out web UI around the classifier.
iface = gr.Interface(fn=spam_detection, inputs="text", outputs="text")
iface.launch()