# -*- coding: utf-8 -*-
"""spamemailfinder.159

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1VK3x8uRt-HA3ZSip5FllNRtdqeRZ-X8y
"""

import string

import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB

# The English stop-word list is fetched on demand by NLTK.
nltk.download('stopwords')

# --- Load and inspect the dataset -------------------------------------------
# NOTE: the bare expressions below (df.head(), df.isna().sum(), the .score()
# and .predict() calls further down) only display a value when run as
# notebook cells; as a plain script their results are discarded.
df = pd.read_csv("spam_ham_dataset.csv")
df.head()
df.info()
df.isna().sum()

# Flatten Windows line breaks so each email body is a single line of text.
df['text'] = df['text'].apply(lambda x: x.replace('\r\n', ' '))
df.head()

# --- Text preprocessing -----------------------------------------------------
stemmer = PorterStemmer()
stopwords_set = set(stopwords.words('english'))

# Built once: maps every ASCII punctuation character to "deleted".
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(raw_text: str) -> str:
    """Normalize one email body into a space-separated string of stems.

    The text is lower-cased, stripped of ``string.punctuation`` characters,
    split on whitespace, filtered against ``stopwords_set`` and stemmed with
    the module-level ``stemmer``.

    Args:
        raw_text: The email body to normalize.

    Returns:
        The surviving stemmed tokens joined by single spaces, so that
        ``CountVectorizer`` can tokenize them back into individual words.
    """
    tokens = raw_text.lower().translate(_PUNCTUATION_TABLE).split()
    stems = [stemmer.stem(word) for word in tokens if word not in stopwords_set]
    # Join with a space: joining with '' would fuse the whole document into
    # one giant token and destroy the bag-of-words representation.
    return ' '.join(stems)


corpus = [preprocess_text(body) for body in df['text']]

# --- Vectorize and split ----------------------------------------------------
vectorizer = CountVectorizer()
# Dense array because GaussianNB is fed a dense matrix below.
X = vectorizer.fit_transform(corpus).toarray()
y = df.label_num

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Train and evaluate the three Naive Bayes variants ----------------------
mnb = MultinomialNB()
bnb = BernoulliNB()
gnb = GaussianNB()

mnb.fit(X_train, y_train)
bnb.fit(X_train, y_train)
gnb.fit(X_train, y_train)

mnb.score(X_test, y_test)
bnb.score(X_test, y_test)
gnb.score(X_test, y_test)

# --- Classify a single email ------------------------------------------------
email_to_classify = df.text.values[19]
email_to_classify

# Use the same preprocessing as the training corpus so the email's tokens
# line up with the vocabulary learned by the fitted vectorizer.
email_text = preprocess_text(email_to_classify)
email_corpus = [email_text]

X_email = vectorizer.transform(email_corpus)

mnb.predict(X_email)
bnb.predict(X_email)
# The sparse transform output is densified before calling GaussianNB.
gnb.predict(X_email.toarray())