Spaces:

Smartie
/

crimetwitter

Runtime error

App Files Files Community

crimetwitter / CrimeTrain.py

Smartie

Upload 13 files

c3bd0c8 over 1 year ago

raw

history blame

No virus

4.76 kB

	#http://help.sentiment140.com/for-students
	import sys
	import warnings

	# Silence all warnings unless the user supplied explicit -W options
	# (in which case sys.warnoptions is non-empty and their choice is kept).
	if not sys.warnoptions:
	    warnings.simplefilter("ignore")
	#read Library
	import pandas as pd
	# Load the raw tweet CSV.
	# NOTE(review): read_csv is called with its default header handling, so the
	# first row of the file is consumed as a header before the column names are
	# overwritten below.  If the file has no header row, that first record is
	# lost -- confirm against the data file.
	data1 = pd.read_csv("training.manual.2009.06.14.csv", encoding='latin-1')

	# Name the six columns, then keep only the target variable and the tweet text.
	DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "comment_text"]
	data1.columns = DATASET_COLUMNS
	data1.drop(['ids', 'date', 'flag', 'user'], axis=1, inplace=True)

	# Split the rows by target value (4, 2 and 0).
	positive_data = data1[data1.target == 4]
	nutural_data = data1[data1.target == 2]
	negative_data = data1[data1.target == 0]

	# Stack the three groups in a fixed order: target 4, then 2, then 0.
	train_df = pd.DataFrame({
	    'comment_text': pd.concat(
	        [
	            positive_data["comment_text"],
	            nutural_data["comment_text"],
	            negative_data["comment_text"],
	        ],
	        axis=0,
	    ),
	})

	# labelling: one label per row, in the same group order as the concat above,
	# so the list lines up positionally with train_df's rows.
	label = (
	    ['Normal_User'] * len(positive_data)
	    + ['Suspect_User'] * len(nutural_data)
	    + ['Criminal_User'] * len(negative_data)
	)
	train_df["Type"] = label

	#Tokenization
	import nltk
	from functools import lru_cache


	@lru_cache(maxsize=1)
	def _english_stopwords():
	    """Return NLTK's English stop-word list as a frozenset, loaded once."""
	    return frozenset(nltk.corpus.stopwords.words('english'))


	def remove_stopwords(text):
	    """Return *text* with English stop words removed.

	    The text is split on whitespace, every token that is not in NLTK's
	    English stop-word list is kept, and the survivors are re-joined with
	    single spaces.  Matching is exact and case-sensitive.
	    """
	    # The stop-word set is cached: this function is called once per tweet,
	    # and reloading the corpus list each time is wasted work.
	    stopwords = _english_stopwords()
	    return ' '.join(word for word in text.split() if word not in stopwords)

	from nltk.stem.porter import PorterStemmer
	def cleanup_tweets(train_df):
	    """Add a normalised ``clean_tweet`` column to *train_df* in place.

	    Reads ``train_df["comment_text"]`` and writes ``train_df["clean_tweet"]``;
	    nothing is returned.  Each tweet goes through these steps in order:

	    1. drop every ``@`` character,
	    2. drop links (``http`` followed by any run of non-whitespace),
	    3. replace every non-letter character with a space,
	    4. lower-case the text and remove stop words via ``remove_stopwords``,
	    5. Porter-stem each remaining token,
	    6. keep only stems longer than three characters, joined by spaces.
	    """
	    tweets = train_df["comment_text"]
	    # Only the "@" marker is stripped; the handle text after it is kept.
	    # NOTE(review): confirm that keeping the user name is intended.
	    tweets = tweets.str.replace("@", "", regex=False)
	    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
	    # literal string by default, which would leave links and punctuation
	    # untouched.
	    tweets = tweets.str.replace(r"http\S+", "", regex=True)
	    tweets = tweets.str.replace("[^a-zA-Z]", " ", regex=True)

	    stemmer = PorterStemmer()

	    def _normalise(text):
	        # stop-word removal -> tokenise -> stem -> drop short stems -> re-join
	        tokens = remove_stopwords(text.lower()).split()
	        stems = (stemmer.stem(token) for token in tokens)
	        return ' '.join(stem for stem in stems if len(stem) > 3)

	    train_df['clean_tweet'] = tweets.apply(_normalise)
	# Run the cleaning pass; it writes the "clean_tweet" column onto train_df.
	cleanup_tweets(train_df)

	# Build the modelling frame: the cleaned text (as "text") plus its label.
	data = train_df[["clean_tweet", "Type"]].rename(columns={"clean_tweet": "text"})

	import warnings
	# Suppress every warning from this point on.
	warnings.filterwarnings("ignore")

	# Display names, listed in the same order as the `classifiers` list they
	# are zipped with in the training loop.
	names = [
	    "K-Nearest Neighbors",
	    "Liner SVM",
	    "Decision Tree",
	    "Random Forest",
	    "ExtraTreesClassifier",
	]
	# split train/test data randomly
	from sklearn.utils import shuffle
	#TFIDF feature
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.pipeline import make_pipeline
	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.svm import LinearSVC
	from sklearn.ensemble import ExtraTreesClassifier
	from sklearn.metrics import confusion_matrix
	from sklearn.metrics import classification_report

	# One single-step pipeline per estimator, each built with default settings,
	# in the same order as the display names in `names`.
	classifiers = [
	    make_pipeline(estimator_cls())
	    for estimator_cls in (
	        KNeighborsClassifier,
	        LinearSVC,
	        DecisionTreeClassifier,
	        RandomForestClassifier,
	        ExtraTreesClassifier,
	    )
	]
	# Row/column order used for both the report and the confusion matrix.
	# sklearn orders labels alphabetically when `labels` is omitted, so passing
	# target_names alone attaches the display names to the wrong classes.
	report_labels = ["Normal_User", "Suspect_User", "Criminal_User"]
	class_to_predict = 'Type'  # label column of `data`
	extra_params = {'min_df': 0.001}

	clfF = []         # fitted models, one per entry in `names`
	vectorizers = []  # the TF-IDF vectorizer fitted alongside each model
	for name, clf in zip(names, classifiers):
	    # `data` is rebound here, so each iteration reshuffles the previous
	    # iteration's order and each model is fitted on its own 85% slice.
	    data = shuffle(data, random_state=77)
	    num_records = len(data)
	    data_train = data[:int(0.85 * num_records)]
	    train_data = data_train['text'].tolist()
	    train_labels = data_train[class_to_predict].tolist()
	    # Create and fit the TF-IDF feature vectors
	    vectorizer = TfidfVectorizer(**extra_params)
	    train_vectors = vectorizer.fit_transform(train_data)
	    # Fit the classifier, then score it on the same rows it was trained on.
	    # The predictions are reported exactly as the model produced them.
	    model = clf
	    model.fit(train_vectors, train_labels)
	    train_prediction = model.predict(train_vectors)
	    clfF.append(model)
	    vectorizers.append(vectorizer)
	    print(name)
	    print(classification_report(train_labels, train_prediction, labels=report_labels, target_names=report_labels))
	    print(confusion_matrix(train_labels, train_prediction, labels=report_labels))
	    print('--------------------------------------------------------------')
	#Save model
	import pickle
	import bz2
	# Persist the fitted models and their vectorizers as bz2-compressed pickles.
	# The context managers close each file, so the compressed stream is fully
	# flushed to disk before the script exits.
	with bz2.BZ2File("All Model", 'w') as sfile1:
	    pickle.dump(clfF, sfile1)
	with bz2.BZ2File("All Vector", 'w') as sfile2:
	    pickle.dump(vectorizers, sfile2)