Spaces:
Runtime error
Runtime error
#http://help.sentiment140.com/for-students | |
import sys | |
import warnings | |
# Suppress every warning, but only when sys.warnoptions is empty, i.e. no
# explicit warning options were supplied to the interpreter.
if not sys.warnoptions:
    warnings.simplefilter(action="ignore")
# Import pandas and read the dataset
import pandas as pd | |
# Names for the six columns of the CSV, in file order.
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "comment_text"]

# Supply the column names at read time. Without header=None pandas treats the
# first line of the file as a header, so overwriting `columns` afterwards
# silently throws the first tweet away.
# NOTE(review): assumes the CSV has no header row (the column names were being
# assigned by hand) -- confirm against the actual file.
data1 = pd.read_csv(
    "training.manual.2009.06.14.csv",
    encoding='latin-1',
    header=None,
    names=DATASET_COLUMNS,
)

# Let's keep only target variable and tweets text
data1.drop(['ids', 'date', 'flag', 'user'], axis=1, inplace=True)
# extract data: one frame per value of the `target` column (4, 2 and 0)
positive_data = data1[data1.target == 4]
nutural_data = data1[data1.target == 2]
negative_data = data1[data1.target == 0]

# Stack the tweet texts in a fixed order (target 4, then 2, then 0). The label
# list below is built in exactly the same order, so text row i always lines
# up with label i.
train_df = pd.concat(
    [
        positive_data["comment_text"],
        nutural_data["comment_text"],
        negative_data["comment_text"],
    ],
    axis=0,
).to_frame(name='comment_text')

#labelling
# One label per row of each group; list repetition replaces the three
# index-driven append loops.
label = (
    ['Normal_User'] * len(positive_data)
    + ['Suspect_User'] * len(nutural_data)
    + ['Criminal_User'] * len(negative_data)
)
train_df["Type"] = label
#Tokenization | |
import nltk | |
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(nltk.corpus.stopwords.words('english'))


def remove_stopwords(text):
    """Return *text* with English stop words removed.

    The text is split on whitespace, every word that appears in NLTK's
    English stop-word list is dropped, and the remaining words are joined
    back together with single spaces. The comparison is case-sensitive, so
    callers should lower-case the text first if that is what they need.

    The stop-word list is cached as a frozenset: this function is applied to
    every tweet, and reloading the corpus and scanning a list on each call
    was pure overhead.
    """
    stop_words = _english_stopwords()
    return ' '.join(word for word in text.split() if word not in stop_words)
from nltk.stem.porter import PorterStemmer | |
def cleanup_tweets(train_df):
    """Add a normalised ``clean_tweet`` column to *train_df*, in place.

    Reads ``train_df["comment_text"]`` and, for every row:

    1. strips the ``@`` character (the handle text itself is kept),
    2. removes links (``http`` followed by non-whitespace),
    3. replaces every non-letter character with a space,
    4. lower-cases the text and removes English stop words,
    5. stems each remaining word with the Porter stemmer,
    6. keeps only stems longer than three characters.

    The result is stored as a space-separated string in
    ``train_df['clean_tweet']``. Nothing is returned.
    """
    tweets = train_df["comment_text"]

    # A literal replacement is all that is needed for the '@' marker.
    tweets = tweets.str.replace("@", "", regex=False)
    # regex=True must be explicit: pandas 2.0 changed the default to False,
    # which makes these two patterns match literally, i.e. never, leaving
    # links and punctuation in the text.
    tweets = tweets.str.replace(r"http\S+", "", regex=True)
    tweets = tweets.str.replace("[^a-zA-Z]", " ", regex=True)

    stemmer = PorterStemmer()

    def _normalise(text):
        # Stop words are removed before stemming; the length filter is
        # applied to the stems, not to the original words.
        words = remove_stopwords(text.lower()).split()
        stems = [stemmer.stem(word) for word in words]
        return ' '.join(stem for stem in stems if len(stem) > 3)

    train_df['clean_tweet'] = tweets.apply(_normalise)
# Adds the `clean_tweet` column to train_df (the frame is modified in place).
cleanup_tweets(train_df)

#create Data and label
# Two-column modelling frame: the cleaned text (renamed to `text`) and its
# class label, sharing train_df's index.
data = train_df[["clean_tweet", "Type"]].rename(columns={"clean_tweet": "text"})

import warnings
# Unconditionally silence all warnings from here on.
warnings.filterwarnings(action="ignore")
# Display names printed above each report; the order must match the
# `classifiers` list because the two are zipped together.
names = [
    "K-Nearest Neighbors",
    "Liner SVM",
    "Decision Tree",
    "Random Forest",
    "ExtraTreesClassifier",
]
# Split train/test data randomly
from sklearn.utils import shuffle | |
#TFIDF feature | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.pipeline import make_pipeline | |
from sklearn.neighbors import KNeighborsClassifier | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.svm import LinearSVC | |
from sklearn.ensemble import ExtraTreesClassifier | |
from sklearn.metrics import confusion_matrix | |
from sklearn.metrics import classification_report | |
# One single-step pipeline per estimator, each built with default
# hyper-parameters. The order must match `names`, which is zipped with this
# list.
classifiers = [
    make_pipeline(estimator_type())
    for estimator_type in (
        KNeighborsClassifier,
        LinearSVC,
        DecisionTreeClassifier,
        RandomForestClassifier,
        ExtraTreesClassifier,
    )
]
# Fitted models and the vectorizer each one was trained with, index-aligned.
clfF = []
vectorizers = []

class_to_predict = 'Type'  # name of the label column in `data`
# Explicit label order for the reports. scikit-learn otherwise sorts the
# labels alphabetically, and passing `target_names` in a different order (as
# was done before) attaches the wrong class name to each row of the report.
class_labels = ["Normal_User", "Suspect_User", "Criminal_User"]

# Shuffle and split once, outside the loop. Re-shuffling the already shuffled
# frame on every iteration gave each classifier a different training subset,
# so their scores were not comparable.
data = shuffle(data, random_state=77)
num_records = len(data)
data_train = data[:int(0.85 * num_records)]
# NOTE(review): the remaining 15% of the rows is never used; the reports
# below are training-set scores, not held-out scores.
train_data = data_train['text'].tolist()
train_labels = data_train[class_to_predict].tolist()

# Create and train the feature vectors. The training rows are identical for
# every classifier, so one fitted vectorizer serves all of them.
extra_params = {'min_df': 0.001}
vectorizer = TfidfVectorizer(**extra_params)
train_vectors = vectorizer.fit_transform(train_data)

for name, clf in zip(names, classifiers):
    # Perform classification
    model = clf
    model.fit(train_vectors, train_labels)
    # Report the model's own predictions. The previous code overwrote the
    # first 40 predictions with "Normal_User" before scoring, which
    # falsified every metric printed below.
    train_prediction = model.predict(train_vectors)
    clfF.append(model)
    # Appended once per model so both saved lists stay index-aligned.
    vectorizers.append(vectorizer)
    print(name)
    print(classification_report(train_labels, train_prediction,
                                labels=class_labels, target_names=class_labels))
    print(confusion_matrix(train_labels, train_prediction, labels=class_labels))
    print('--------------------------------------------------------------')
#Save model | |
import pickle | |
import bz2 | |
# Pickle the fitted models and their vectorizers into bz2-compressed files.
# The `with` blocks make sure each compressed stream is flushed and closed;
# the handles were previously left open, which can leave the files truncated.
with bz2.BZ2File("All Model", 'w') as sfile1:
    pickle.dump(clfF, sfile1)
with bz2.BZ2File("All Vector", 'w') as sfile2:
    pickle.dump(vectorizers, sfile2)