antitheft159 committed on
Commit
a248df7
1 Parent(s): 923b39a

Upload kaltstart_195.py

Browse files
Files changed (1) hide show
  1. kaltstart_195.py +70 -0
kaltstart_195.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Kaltstart.195
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1W1WPsxSyG7efWOHMRIcMxuKWq0GDFdtG
8
+ """
9
+
10
+ !pip install nltk
11
+
12
+ import string
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+
17
+ import nltk
18
+ from nltk.corpus import stopwords
19
+ from nltk.stem.porter import PorterStemmer
20
+ from sklearn.model_selection import train_test_split
21
+
22
+ from sklearn.feature_extraction.text import CountVectorizer
23
+ from sklearn.ensemble import RandomForestClassifier
24
+
# Fetch the NLTK stop-word corpus so stopwords.words('english') can be used
# further down.
nltk.download('stopwords')

# Load the dataset from the current working directory.
df = pd.read_csv("spam_ham_dataset.csv")


def _flatten_line_breaks(body):
    """Return *body* with every CRLF pair replaced by a single space."""
    return body.replace('\r\n', ' ')


# Rewrite the 'text' column in place so each message has its CRLF line
# breaks turned into spaces.
df['text'] = df['text'].map(_flatten_line_breaks)

# Notebook leftovers: a bare expression only renders in an interactive cell
# and is a no-op when the file runs as a script.
df

# Print the frame's column/dtype/non-null summary.
df.info()
34
+
# Build the training corpus: one cleaned string per row of df['text'].
stemmer = PorterStemmer()
corpus = []

stopwords_set = set(stopwords.words('english'))

# Translation table that deletes every character in string.punctuation.
# It is loop-invariant, so build it once rather than once per row.
punctuation_table = str.maketrans('', '', string.punctuation)

# Fix: the original read df['text'].iloc[1] on every iteration, so the corpus
# ended up as len(df) copies of the second email. Iterate each row instead.
for raw_email in df['text']:
    # Lower-case, strip punctuation, and split on whitespace.
    text = raw_email.lower().translate(punctuation_table).split()
    # Drop stop words, stem what is left, and re-join into one string.
    text = ' '.join(
        stemmer.stem(word) for word in text if word not in stopwords_set
    )
    corpus.append(text)
46
+
# Turn the cleaned corpus into a dense token-count matrix.
vectorizer = CountVectorizer()
term_counts = vectorizer.fit_transform(corpus)
X = term_counts.toarray()

# Target column. NOTE(review): presumably a numeric spam/ham flag; confirm
# against the dataset.
y = df.label_num

# Hold out 20% of the rows for evaluation. No random_state is given, so the
# split differs from run to run.
split = train_test_split(X, y, test_size=0.2)
X_train, X_test, y_train, y_test = split

# n_jobs=-1 lets the forest use every available processor.
clf = RandomForestClassifier(n_jobs=-1)
clf.fit(X_train, y_train)

# Mean accuracy on the held-out rows. The value is only displayed when this
# runs as the last expression of a notebook cell.
clf.score(X_test, y_test)
57
+
# Spot-check: classify the dataset row at position 16 and compare the
# prediction with its stored label.
email_to_classify = df.text.values[16]

# Apply the same normalisation used when building the corpus: lower-case,
# delete punctuation, split on whitespace.
email_text = email_to_classify.lower().translate(str.maketrans('', '', string.punctuation)).split()
# Fix: iterate this email's own tokens. The original iterated `text`, the
# leftover string from the corpus loop, so the email being classified was
# ignored and a stale string was walked character by character.
email_text = [stemmer.stem(word) for word in email_text if word not in stopwords_set]
email_text = ' '.join(email_text)

# transform() expects an iterable of documents, hence the one-element list.
email_corpus = [email_text]

# Reuse the already-fitted vocabulary; do not refit on the single email.
X_email = vectorizer.transform(email_corpus)

# Predicted label (displayed only in a notebook cell).
clf.predict(X_email)

# Stored label for the same row, for comparison with the prediction.
df.label_num.iloc[16]