"""spamemailfinder.159

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1VK3x8uRt-HA3ZSip5FllNRtdqeRZ-X8y
"""
|
|
|
import string |
|
import numpy as np |
|
import pandas as pd |
|
import nltk |
|
from nltk.corpus import stopwords |
|
from nltk.stem.porter import PorterStemmer |
|
from sklearn.feature_extraction.text import CountVectorizer |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB |
|
|
|
# Make sure the NLTK stop-word lists are available locally before they are
# loaded further down.
nltk.download('stopwords')

# Load the labelled e-mail dataset from the working directory.
df = pd.read_csv("spam_ham_dataset.csv")

# Notebook-style inspection: the bare expressions only display something when
# each is the last statement of a notebook cell; df.info() prints by itself.
df.head()
df.info()
df.isna().sum()

# Turn every Windows line break inside a message body into a single space so
# that each e-mail becomes one continuous line of text.
df['text'] = df['text'].map(lambda body: ' '.join(body.split('\r\n')))

df.head()
|
|
|
# Shared text-normalisation resources, built once and reused for both the
# training corpus and any e-mail classified afterwards.
stemmer = PorterStemmer()
stopwords_set = set(stopwords.words('english'))

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(raw_text):
    """Normalise one e-mail body into a space-separated string of stems.

    The text is lower-cased, stripped of ASCII punctuation, split on
    whitespace, filtered against the English stop-word set and stemmed with
    the Porter stemmer.

    Args:
        raw_text: The e-mail body as a ``str``.

    Returns:
        The surviving stems joined by single spaces (``''`` when no word
        survives the filtering).
    """
    words = raw_text.lower().translate(_PUNCTUATION_TABLE).split()
    # Stop words are matched on the raw lower-cased word, before stemming.
    stems = [stemmer.stem(word) for word in words if word not in stopwords_set]
    # The stems must be joined with a space: concatenating them without a
    # separator fuses the whole message into one giant token, so the
    # vectorizer would learn roughly one feature per document instead of a
    # bag of words.
    return ' '.join(stems)


# One normalised string per e-mail, in the same order as the rows of df.
corpus = [preprocess_text(body) for body in df['text']]

vectorizer = CountVectorizer()

# Dense term-count matrix; GaussianNB below is fed dense arrays throughout.
X = vectorizer.fit_transform(corpus).toarray()
y = df.label_num

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Three Naive Bayes variants trained on the same split for comparison.
mnb = MultinomialNB()
bnb = BernoulliNB()
gnb = GaussianNB()

mnb.fit(X_train, y_train)
bnb.fit(X_train, y_train)
gnb.fit(X_train, y_train)

# Accuracy on the held-out split. These are bare expressions: the value is
# displayed by a notebook cell and discarded when run as a plain script.
mnb.score(X_test, y_test)
bnb.score(X_test, y_test)
gnb.score(X_test, y_test)

# Classify a single e-mail taken from the dataset.
email_to_classify = df.text.values[19]

email_to_classify

# The e-mail must go through exactly the same normalisation as the training
# corpus, otherwise its tokens cannot match the fitted vocabulary.
email_text = preprocess_text(email_to_classify)

email_corpus = [email_text]

# transform() (not fit_transform()) so the vocabulary learned above is reused.
X_email = vectorizer.transform(email_corpus)

mnb.predict(X_email)
bnb.predict(X_email)
# GaussianNB is given a dense array, as it was for training.
gnb.predict(X_email.toarray())