#http://help.sentiment140.com/for-students
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")
# read the dataset
import pandas as pd
# the Sentiment140 CSVs ship without a header row, so read without one
data1 = pd.read_csv("training.manual.2009.06.14.csv", encoding='latin-1', header=None)


# Keep only the target variable and the tweet text
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "comment_text"]
data1.columns = DATASET_COLUMNS
data1.drop(['ids', 'date', 'flag', 'user'], axis=1, inplace=True)

# split rows by sentiment label (Sentiment140 encoding: 4 = positive, 2 = neutral, 0 = negative)
positive_data = data1[data1.target == 4]
neutral_data = data1[data1.target == 2]
negative_data = data1[data1.target == 0]
train_df = pd.DataFrame(columns=['comment_text', "Type"])
train_df['comment_text'] = pd.concat([positive_data["comment_text"], neutral_data["comment_text"], negative_data["comment_text"]], axis=0)


# labelling: map each sentiment group to a user type, in the same order as the concat above
label = (['Normal_User'] * len(positive_data)
         + ['Suspect_User'] * len(neutral_data)
         + ['Criminal_User'] * len(negative_data))
train_df["Type"] = label

# Text cleaning and tokenization helpers
import nltk
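# Added for convenience (not in the original script): the stopword corpus must be
# available locally; this fetches it on first run (assumes network access) and is
# a no-op afterwards.
nltk.download('stopwords', quiet=True)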
def remove_stopwords(text):
    # drop common English stop words from an already-lowercased string
    stopwords = set(nltk.corpus.stopwords.words('english'))
    return ' '.join(word for word in text.split() if word not in stopwords)

from nltk.stem.porter import PorterStemmer

def cleanup_tweets(train_df):
    # remove @handles
    train_df['clean_tweet'] = train_df["comment_text"].str.replace(r"@\w+", "", regex=True)
    # remove links
    train_df['clean_tweet'] = train_df['clean_tweet'].str.replace(r"http\S+", "", regex=True)
    # remove punctuation, digits and special characters
    train_df['clean_tweet'] = train_df['clean_tweet'].str.replace(r"[^a-zA-Z]", " ", regex=True)
    # lowercase and remove stop words
    train_df['clean_tweet'] = train_df['clean_tweet'].apply(lambda text: remove_stopwords(text.lower()))
    # tokenize on whitespace
    train_df['clean_tweet'] = train_df['clean_tweet'].apply(lambda x: x.split())
    # apply the Porter stemmer to each token
    stemmer = PorterStemmer()
    train_df['clean_tweet'] = train_df['clean_tweet'].apply(lambda x: [stemmer.stem(i) for i in x])
    # stitch the tokens back into a string
    train_df['clean_tweet'] = train_df['clean_tweet'].apply(lambda x: ' '.join(x))
    # drop very short words (3 characters or fewer)
    train_df['clean_tweet'] = train_df['clean_tweet'].apply(lambda x: ' '.join(w for w in x.split() if len(w) > 3))

cleanup_tweets(train_df)
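# For illustration (not from the original script): a raw tweet such as
# "@user check out http://t.co/abc Loving it!!" should come out of this pipeline
# as roughly "check love" (handle, link, stop words and short tokens removed,
# remaining tokens stemmed).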

# assemble the final frame of cleaned text and labels
data = pd.DataFrame(columns=['text',"Type"])
data["text"]=train_df["clean_tweet"]
data["Type"]=train_df["Type"]

import warnings
warnings.filterwarnings("ignore")
names = ["K-Nearest Neighbors", "Linear SVM",
         "Decision Tree", "Random Forest",
         "ExtraTreesClassifier"]
# shuffle the data and take a random training split
from sklearn.utils import shuffle
#TFIDF feature
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

classifiers = [
    make_pipeline(KNeighborsClassifier()),
    make_pipeline(LinearSVC()),
    make_pipeline(DecisionTreeClassifier()),
    make_pipeline(RandomForestClassifier()),
    make_pipeline(ExtraTreesClassifier())]
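# Each classifier below is paired with its own freshly fitted TF-IDF vectorizer;
# both are collected in these lists so they can be pickled together at the end of the script.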
clfF=[]
vectorizers=[]
for name, clf in zip(names, classifiers):
    class_to_predict = 'Type'  # label column to predict
    data = shuffle(data, random_state=77)
    num_records = len(data)
    # the first 85% of the shuffled rows are used for fitting (the remaining 15% is left unused)
    data_train = data[:int(0.85 * num_records)]
    train_data = data_train['text'].tolist()
    train_labels = data_train[class_to_predict].tolist()
    # Create TF-IDF feature vectors; min_df=0.001 ignores terms that appear in
    # fewer than 0.1% of the training tweets
    extra_params = {'min_df': 0.001}
    vectorizer = TfidfVectorizer(**extra_params)
    # Fit the vectorizer on the training text and transform it into TF-IDF vectors
    train_vectors = vectorizer.fit_transform(train_data)
    # Fit the classifier and evaluate on the same training vectors
    # (no held-out test evaluation is done in this script)
    model = clf
    model.fit(train_vectors, train_labels)
    train_prediction = model.predict(train_vectors)
    clfF.append(model)
    vectorizers.append(vectorizer)
    print(name)
    # pass labels explicitly so the report rows and the confusion matrix follow this order
    label_order = ["Normal_User", "Suspect_User", "Criminal_User"]
    print(classification_report(train_labels, train_prediction, labels=label_order, target_names=label_order))
    print(confusion_matrix(train_labels, train_prediction, labels=label_order))
    print('--------------------------------------------------------------')
# Save the trained models and vectorizers as bz2-compressed pickles
import pickle
import bz2
# use context managers so the compressed streams are properly flushed and closed
with bz2.BZ2File("All Model", 'w') as sfile1:
    pickle.dump(clfF, sfile1)
with bz2.BZ2File("All Vector", 'w') as sfile2:
    pickle.dump(vectorizers, sfile2)
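
# Example (added, not part of the original script): a minimal sketch of how the
# saved artifacts could be reloaded and applied to new text. The index 3 picks
# the Random Forest entry from the lists above; the sample tweet is made up, and
# in practice new text should go through the same cleaning as cleanup_tweets
# before being vectorized.
with bz2.BZ2File("All Model", 'r') as f:
    loaded_models = pickle.load(f)
with bz2.BZ2File("All Vector", 'r') as f:
    loaded_vectorizers = pickle.load(f)

sample = ["what a great sunny day, feeling happy"]
sample_vectors = loaded_vectorizers[3].transform(sample)
print(loaded_models[3].predict(sample_vectors))  # prints an array with one predicted label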