# http://help.sentiment140.com/for-students
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

# Read the data
import pandas as pd

data1 = pd.read_csv("training.manual.2009.06.14.csv", encoding='latin-1')

# Keep only the target variable and the tweet text
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "comment_text"]
data1.columns = DATASET_COLUMNS
data1.drop(['ids', 'date', 'flag', 'user'], axis=1, inplace=True)

# Extract the positive (target==4), neutral (target==2) and negative (target==0) tweets
positive_data = data1[data1.target == 4]
neutral_data = data1[data1.target == 2]
negative_data = data1[data1.target == 0]

train_df = pd.DataFrame(columns=['comment_text', "Type"])
train_df['comment_text'] = pd.concat(
    [positive_data["comment_text"], neutral_data["comment_text"], negative_data["comment_text"]],
    axis=0)

# Labelling: positive -> Normal_User, neutral -> Suspect_User, negative -> Criminal_User
label = []
for i in range(len(positive_data)):
    label.append('Normal_User')
for i in range(len(neutral_data)):
    label.append('Suspect_User')
for i in range(len(negative_data)):
    label.append('Criminal_User')
train_df["Type"] = label

# Tokenization and cleaning (requires the NLTK stopwords corpus: nltk.download('stopwords'))
import nltk
from nltk.stem.porter import PorterStemmer


def remove_stopwords(text):
    stopwords = nltk.corpus.stopwords.words('english')
    clean_text = ' '.join([word for word in text.split() if word not in stopwords])
    return clean_text


def cleanup_tweets(train_df):
    # remove handles (e.g. @username)
    train_df['clean_tweet'] = train_df["comment_text"].str.replace(r"@\w+", "", regex=True)
    # remove links
    train_df['clean_tweet'] = train_df['clean_tweet'].str.replace(r"http\S+", "", regex=True)
    # remove punctuation and special characters
    train_df['clean_tweet'] = train_df['clean_tweet'].str.replace(r"[^a-zA-Z]", " ", regex=True)
    # remove stop words
    train_df['clean_tweet'] = train_df['clean_tweet'].apply(lambda text: remove_stopwords(text.lower()))
    # split text and tokenize
    train_df['clean_tweet'] = train_df['clean_tweet'].apply(lambda x: x.split())
    # apply the Porter stemmer
    stemmer = PorterStemmer()
    train_df['clean_tweet'] = train_df['clean_tweet'].apply(lambda x: [stemmer.stem(i) for i in x])
    # stitch the words back together
    train_df['clean_tweet'] = train_df['clean_tweet'].apply(lambda x: ' '.join(x))
    # remove short words (3 characters or fewer)
    train_df['clean_tweet'] = train_df['clean_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))


cleanup_tweets(train_df)

# Create the final text/label frame
data = pd.DataFrame(columns=['text', "Type"])
data["text"] = train_df["clean_tweet"]
data["Type"] = train_df["Type"]
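# Optional sanity check (not part of the original pipeline): preview a few raw
# tweets next to their cleaned form, and the label distribution, to confirm the
# cleanup steps behave as expected before training.
print(train_df[['comment_text', 'clean_tweet']].head(5))
print(data['Type'].value_counts())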
warnings.filterwarnings("ignore")

names = ["K-Nearest Neighbors", "Linear SVM", "Decision Tree", "Random Forest", "ExtraTreesClassifier"]

# Shuffle/split the data and build TF-IDF features
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report

classifiers = [
    make_pipeline(KNeighborsClassifier()),
    make_pipeline(LinearSVC()),
    make_pipeline(DecisionTreeClassifier()),
    make_pipeline(RandomForestClassifier()),
    make_pipeline(ExtraTreesClassifier())]

clfF = []
vectorizers = []
class_labels = ["Normal_User", "Suspect_User", "Criminal_User"]

for name, clf in zip(names, classifiers):
    class_to_predict = 'Type'
    data = shuffle(data, random_state=77)
    num_records = len(data)
    data_train = data[:int(0.85 * num_records)]
    train_data = [x[0] for x in data_train[['text']].to_records(index=False)]
    train_labels = [x[0] for x in data_train[[class_to_predict]].to_records(index=False)]

    # Create TF-IDF feature vectors
    extra_params = {'min_df': 0.001}
    vectorizer = TfidfVectorizer(**extra_params)
    train_vectors = vectorizer.fit_transform(train_data)

    # Fit the classifier and report metrics on the training split
    model = clf
    model.fit(train_vectors, train_labels)
    train_prediction = model.predict(train_vectors)

    clfF.append(model)
    vectorizers.append(vectorizer)

    print(name)
    print(classification_report(train_labels, train_prediction,
                                labels=class_labels, target_names=class_labels))
    print(confusion_matrix(train_labels, train_prediction, labels=class_labels))
    print('--------------------------------------------------------------')

# Save the fitted models and vectorizers
import pickle
import bz2

with bz2.BZ2File("All Model", 'w') as sfile1:
    pickle.dump(clfF, sfile1)
with bz2.BZ2File("All Vector", 'w') as sfile2:
    pickle.dump(vectorizers, sfile2)
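# Example usage (a minimal sketch, not part of the original pipeline): reload the
# saved classifiers and vectorizers and score a new tweet. Assumes the "All Model"
# and "All Vector" files written above; the sample text below is hypothetical, and
# in practice new text should go through the same cleanup as the training data.
with bz2.BZ2File("All Model", 'r') as f:
    loaded_models = pickle.load(f)
with bz2.BZ2File("All Vector", 'r') as f:
    loaded_vectorizers = pickle.load(f)

sample_tweet = "love great day everyone"  # hypothetical, already-cleaned input
for name, model, vectorizer in zip(names, loaded_models, loaded_vectorizers):
    vec = vectorizer.transform([sample_tweet])
    print(name, "->", model.predict(vec)[0])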