Reaper200 committed on
Commit
885c051
1 Parent(s): 63e6dbd

Create model training

Browse files
Files changed (1) hide show
  1. model training +81 -0
model training ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # importing data manipulation libraries
3
+ import pandas as pd
4
+ import numpy as np
5
+ import re
6
+ import string
7
+ from string import punctuation
8
+
9
+ # importing text preprocessing libraries
10
+ import nltk
11
+ nltk.download('wordnet')
12
+ from nltk.corpus import stopwords
13
+ from nltk.tokenize import word_tokenize
14
+ from nltk.stem import WordNetLemmatizer
15
+
16
+ from sklearn.feature_extraction.text import TfidfVectorizer
17
+ from sklearn.model_selection import train_test_split
18
+
19
+ # Machine learning libraries
20
+ from sklearn.metrics import accuracy_score
21
+ from sklearn.naive_bayes import MultinomialNB
22
+ from sklearn.linear_model import LogisticRegression
23
+ from sklearn.svm import SVC
24
+ from sklearn.ensemble import RandomForestClassifier
25
+
26
+ import warnings
27
+ warnings.filterwarnings("ignore")
28
+ [nltk_data] Downloading package wordnet to /root/nltk_data...
29
+ [nltk_data] Package wordnet is already up-to-date!
30
+ In [25]:
31
+ # creating English Stopwords set
32
+ import nltk
33
+ nltk.download('stopwords')
34
+ stop_words = set(stopwords.words('english'))
35
+ [nltk_data] Downloading package stopwords to /root/nltk_data...
36
+ [nltk_data] Package stopwords is already up-to-date!
37
+ Since the dataset does not have column names, we define them explicitly
38
+
39
+ In [26]:
40
+ col = ['target','ids','date','flag','user','text']
41
+ In [45]:
42
+ df = pd.read_csv('/content/drive/MyDrive/training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1', names = col)
43
+ In [28]:
44
+ df.head(10)
45
+ Out[28]:
46
+ target ids date flag user text
47
+ 0 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY _TheSpecialOne_ @switchfoot http://twitpic.com/2y1zl - Awww, t...
48
+ 1 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton is upset that he can't update his Facebook by ...
49
+ 2 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus @Kenichan I dived many times for the ball. Man...
50
+ 3 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY ElleCTF my whole body feels itchy and like its on fire
51
+ 4 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY Karoli @nationwideclass no, it's not behaving at all....
52
+ 5 0 1467811372 Mon Apr 06 22:20:00 PDT 2009 NO_QUERY joy_wolf @Kwesidei not the whole crew
53
+ 6 0 1467811592 Mon Apr 06 22:20:03 PDT 2009 NO_QUERY mybirch Need a hug
54
+ 7 0 1467811594 Mon Apr 06 22:20:03 PDT 2009 NO_QUERY coZZ @LOLTrish hey long time no see! Yes.. Rains a...
55
+ 8 0 1467811795 Mon Apr 06 22:20:05 PDT 2009 NO_QUERY 2Hood4Hollywood @Tatiana_K nope they didn't have it
56
+ 9 0 1467812025 Mon Apr 06 22:20:09 PDT 2009 NO_QUERY mimismo @twittera que me muera ?
57
+ Columns such as ids, date, flag, and user will not help the model with classification, so we drop them
58
+
59
+ In [29]:
60
+ df = df.drop(['ids','date','flag','user'],axis = 1)
61
+ In [16]:
62
+ df.head(5)
63
+ Out[16]:
64
+ target text
65
+ 0 0 @switchfoot http://twitpic.com/2y1zl - Awww, t...
66
+ 1 0 is upset that he can't update his Facebook by ...
67
+ 2 0 @Kenichan I dived many times for the ball. Man...
68
+ 3 0 my whole body feels itchy and like its on fire
69
+ 4 0 @nationwideclass no, it's not behaving at all....
70
+ In [17]:
71
+ df.describe()
72
+ Out[17]:
73
+ target
74
+ count 1.600000e+06
75
+ mean 2.000000e+00
76
+ std 2.000001e+00
77
+ min 0.000000e+00
78
+ 25% 0.000000e+00
79
+ 50% 2.000000e+00
80
+ 75% 4.000000e+00
81
+ max 4.000000e+00