spamemailfinder.159 / spamemailfinder_159.py
antitheft159's picture
Upload spamemailfinder_159.py
871abaf verified
raw
history blame contribute delete
No virus
1.88 kB
# -*- coding: utf-8 -*-
"""spamemailfinder.159
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1VK3x8uRt-HA3ZSip5FllNRtdqeRZ-X8y
"""
import string
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB
# Make sure the NLTK English stop-word list is available locally.
nltk.download('stopwords')

# Load the e-mail dataset from the working directory and take a quick look
# at its first rows, column summary and per-column missing-value counts.
# (The bare expressions only display output when run cell-by-cell in a
# notebook.)
df = pd.read_csv("spam_ham_dataset.csv")
df.head()
df.info()
df.isna().sum()

# Replace Windows-style line breaks inside each message with a single space
# so every e-mail body becomes one continuous line of text.
df['text'] = df['text'].map(lambda body: body.replace('\r\n', ' '))
df.head()
stemmer = PorterStemmer()
stopwords_set = set(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character; built
# once instead of once per e-mail.
_punctuation_table = str.maketrans('', '', string.punctuation)


def _preprocess_email(raw_text):
    """Return *raw_text* normalised for the bag-of-words vectorizer.

    The text is lower-cased, stripped of punctuation, split on whitespace,
    filtered against ``stopwords_set`` and stemmed with ``stemmer``.  The
    surviving stems are joined with single spaces so that CountVectorizer
    can see each word as a separate token.

    The same function is used for the training corpus and for any e-mail
    classified later, so both always go through identical preprocessing.
    """
    words = raw_text.lower().translate(_punctuation_table).split()
    stems = [stemmer.stem(word) for word in words if word not in stopwords_set]
    # Join with a space: joining with '' would glue the whole message into
    # one giant token and destroy the per-word features.
    return ' '.join(stems)


# One cleaned string per row of the dataset, in row order.
corpus = [_preprocess_email(body) for body in df['text']]

# Bag-of-words counts.  The matrix is densified because GaussianNB (below)
# does not accept sparse input; note this can be memory-hungry for a large
# vocabulary.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus).toarray()
y = df.label_num  # numeric class label column (presumably 1 = spam; verify against the CSV)

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the three Naive Bayes variants on the same training split.
mnb = MultinomialNB()
bnb = BernoulliNB()
gnb = GaussianNB()
mnb.fit(X_train, y_train)
bnb.fit(X_train, y_train)
gnb.fit(X_train, y_train)

# Mean accuracy of each model on the held-out split (displayed only when
# run cell-by-cell in a notebook).
mnb.score(X_test, y_test)
bnb.score(X_test, y_test)
gnb.score(X_test, y_test)

# Classify a single e-mail (row position 19) with each trained model.
email_to_classify = df.text.values[19]
email_to_classify
# Preprocess the selected e-mail itself -- not a leftover loop variable --
# using exactly the same pipeline as the training corpus.
email_text = _preprocess_email(email_to_classify)
email_corpus = [email_text]
# transform (not fit_transform): reuse the vocabulary learnt from the corpus.
X_email = vectorizer.transform(email_corpus)
mnb.predict(X_email)
bnb.predict(X_email)
gnb.predict(X_email.toarray())  # GaussianNB needs a dense array