antitheft159
/

spamemailfinder.159

Model card Files Files and versions Community

spamemailfinder.159 / spamemailfinder_159.py

antitheft159's picture

Upload spamemailfinder_159.py

871abaf verified 3 months ago

history blame contribute delete

No virus

1.88 kB

	# -*- coding: utf-8 -*-
	"""spamemailfinder.159

	Automatically generated by Colab.

	Original file is located at
	https://colab.research.google.com/drive/1VK3x8uRt-HA3ZSip5FllNRtdqeRZ-X8y
	"""

	import string
	import numpy as np
	import pandas as pd
	import nltk
	from nltk.corpus import stopwords
	from nltk.stem.porter import PorterStemmer
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.model_selection import train_test_split
	from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB

	nltk.download('stopwords')

	# Load the labelled e-mail dataset from a path relative to the working
	# directory.
	df = pd.read_csv("spam_ham_dataset.csv")

	# Notebook-style inspection: first rows, column summary, and per-column
	# missing-value counts.
	df.head()
	df.info()
	df.isna().sum()

	# Replace every Windows line break inside a message body with one space.
	df['text'] = df['text'].map(lambda body: body.replace('\r\n', ' '))

	df.head()

	# --- Text normalisation -------------------------------------------------
	# One helper is shared by training and single-message inference so both
	# paths always produce features in the same way.
	stemmer = PorterStemmer()
	stopwords_set = set(stopwords.words('english'))
	# Built once: translation table that deletes every ASCII punctuation char.
	punctuation_table = str.maketrans('', '', string.punctuation)


	def normalize_email(raw_text):
	    """Return *raw_text* as space-separated, stemmed, stop-word-free tokens.

	    The text is lower-cased, ASCII punctuation is removed, the result is
	    split on whitespace, tokens found in ``stopwords_set`` are dropped and
	    every remaining token is passed through ``stemmer.stem``.

	    Tokens are joined with a single space (not an empty string) so that
	    ``CountVectorizer`` can split the result back into individual words;
	    joining without a separator would collapse each message into one
	    giant token.
	    """
	    tokens = raw_text.lower().translate(punctuation_table).split()
	    return ' '.join(
	        stemmer.stem(token) for token in tokens if token not in stopwords_set
	    )


	# One normalised string per row of df['text'], in row order.
	corpus = [normalize_email(message) for message in df['text']]

	# --- Feature extraction and train/test split ------------------------------
	vectorizer = CountVectorizer()

	# Bag-of-words counts, densified up front because the GaussianNB model
	# below is fed dense arrays.
	X = vectorizer.fit_transform(corpus).toarray()
	y = df.label_num

	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

	# --- Train the three Naive Bayes variants on the same split ---------------
	mnb = MultinomialNB()
	bnb = BernoulliNB()
	gnb = GaussianNB()

	mnb.fit(X_train, y_train)

	bnb.fit(X_train, y_train)

	gnb.fit(X_train, y_train)

	# Scores on the held-out 20% (shown when run cell by cell in a notebook).
	mnb.score(X_test, y_test)

	bnb.score(X_test, y_test)

	gnb.score(X_test, y_test)

	# --- Classify a single message ---------------------------------------------
	email_to_classify = df.text.values[19]

	email_to_classify

	# Normalise the chosen message itself with the same helper used to build
	# the training corpus.
	email_text = normalize_email(email_to_classify)

	email_corpus = [email_text]

	# transform (not fit_transform): reuse the vocabulary learned on the corpus.
	X_email = vectorizer.transform(email_corpus)

	mnb.predict(X_email)

	bnb.predict(X_email)

	gnb.predict(X_email.toarray())