antitheft159
/

spamemailfinder.159

Model card Files Files and versions Community

antitheft159 committed on Jun 30

Commit

871abaf

•

1 Parent(s): 306642b

Upload spamemailfinder_159.py

Browse files

Files changed (1) hide show

spamemailfinder_159.py +86 -0

spamemailfinder_159.py ADDED Viewed

	@@ -0,0 +1,86 @@

+# -*- coding: utf-8 -*-
+"""spamemailfinder.159
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/drive/1VK3x8uRt-HA3ZSip5FllNRtdqeRZ-X8y
+"""
+import string
+import numpy as np
+import pandas as pd
+import nltk
+from nltk.corpus import stopwords
+from nltk.stem.porter import PorterStemmer
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.model_selection import train_test_split
+from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB
+nltk.download('stopwords')
+df = pd.read_csv("spam_ham_dataset.csv")
+df.head()
+df.info()
+df.isna().sum()
+df['text'] = df['text'].apply(lambda x: x.replace('\r\n', ' '))
+df.head()
+stemmer = PorterStemmer()
+corpus = []
+stopwords_set = set(stopwords.words('english'))
+for i in range(len(df)):
+  text = df['text'].iloc[i].lower()
+  text = text.translate(str.maketrans('', '',string.punctuation)).split()
+  text = [stemmer.stem(word) for word in text if word not in stopwords_set]
+  text = ''.join(text)
+  corpus.append(text)
+vectorizer = CountVectorizer()
+X = vectorizer.fit_transform(corpus).toarray()
+y = df.label_num
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+mnb = MultinomialNB()
+bnb = BernoulliNB()
+gnb = GaussianNB()
+mnb.fit(X_train, y_train)
+bnb.fit(X_train, y_train)
+gnb.fit(X_train, y_train)
+mnb.score(X_test, y_test)
+bnb.score(X_test, y_test)
+gnb.score(X_test, y_test)
+email_to_classify = df.text.values[19]
+email_to_classify
+email_text = email_to_classify.lower().translate(str.maketrans('', '',string.punctuation)).split()
+email_text = [stemmer.stem(word) for word in text if word not in stopwords_set]
+email_text = ''.join(email_text)
+email_corpus = [email_text]
+X_email = vectorizer.transform(email_corpus)
+mnb.predict(X_email)
+bnb.predict(X_email)
+gnb.predict(X_email.toarray())