antitheft159 committed on
Commit
871abaf
1 Parent(s): 306642b

Upload spamemailfinder_159.py

Browse files
Files changed (1) hide show
  1. spamemailfinder_159.py +86 -0
spamemailfinder_159.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """spamemailfinder.159
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1VK3x8uRt-HA3ZSip5FllNRtdqeRZ-X8y
8
+ """
9
+
10
+ import string
11
+ import numpy as np
12
+ import pandas as pd
13
+ import nltk
14
+ from nltk.corpus import stopwords
15
+ from nltk.stem.porter import PorterStemmer
16
+ from sklearn.feature_extraction.text import CountVectorizer
17
+ from sklearn.model_selection import train_test_split
18
+ from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB
19
+
20
# Fetch the NLTK stop-word corpus that the preprocessing step relies on.
nltk.download('stopwords')

# Load the labelled spam/ham emails from the CSV in the working directory.
df = pd.read_csv("spam_ham_dataset.csv")

# Notebook-style inspection: preview rows, column summary, and per-column
# missing-value counts. These bare expressions are only displayed when the
# code is run cell-by-cell in a notebook.
df.head()
df.info()
df.isna().sum()


def _unwrap_lines(body):
    """Return *body* with every CRLF pair replaced by a single space."""
    return body.replace('\r\n', ' ')


# Flatten Windows line breaks so each email body becomes one line of text.
df['text'] = df['text'].apply(_unwrap_lines)

df.head()
33
+
34
stemmer = PorterStemmer()
stopwords_set = set(stopwords.words('english'))

# Built once and reused for every email: deletes all ASCII punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_email(raw_text):
    """Normalize one email body into a space-separated string of stemmed tokens.

    Steps: lowercase, strip punctuation, split on whitespace, drop English
    stop words, and Porter-stem the remaining words.

    The same function is used for both the training corpus and any email
    classified later, so the vectorizer always sees text prepared in the
    same way it was fitted on.

    Args:
        raw_text: The email body as a string.

    Returns:
        The processed tokens joined by single spaces.
    """
    words = raw_text.lower().translate(_PUNCTUATION_TABLE).split()
    stemmed = [stemmer.stem(word) for word in words if word not in stopwords_set]
    # Join with a space, not an empty string: concatenating the tokens would
    # fuse the whole email into one giant "word" and leave CountVectorizer
    # with nothing meaningful to count.
    return ' '.join(stemmed)


# One preprocessed document per row of the dataset, in row order.
corpus = [preprocess_email(body) for body in df['text']]

vectorizer = CountVectorizer()

# Bag-of-words counts, densified because the same matrix is also fed to
# GaussianNB below.
X = vectorizer.fit_transform(corpus).toarray()
y = df.label_num

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

mnb = MultinomialNB()
bnb = BernoulliNB()
gnb = GaussianNB()

mnb.fit(X_train, y_train)

bnb.fit(X_train, y_train)

gnb.fit(X_train, y_train)

# Held-out accuracy of each model (displayed when run in a notebook).
mnb.score(X_test, y_test)

bnb.score(X_test, y_test)

gnb.score(X_test, y_test)

# Pick one email from the dataset as a sample to classify. It comes from the
# same DataFrame, so it may have been part of the training split.
email_to_classify = df.text.values[19]

email_to_classify

# Apply exactly the same preprocessing that was used for the training corpus.
email_text = preprocess_email(email_to_classify)

email_corpus = [email_text]

X_email = vectorizer.transform(email_corpus)

mnb.predict(X_email)

bnb.predict(X_email)

# GaussianNB is given a dense array, matching how it was trained above.
gnb.predict(X_email.toarray())