import csv
import json
import os
import pickle
from datetime import datetime
import pandas
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from Pinpoint import Logger
class random_forest():
    """
    A class used for creating a random forest binary classifier.

    The classifier is trained on a JSON features file (see ``train_model``) and
    the trained model, together with its evaluation scores, is stored on the
    instance and pickled to disk.
    """

    model = None
    accuracy = None
    precision = None
    recall = None
    f_measure = None

    # Model variables populated on creation or reading of file
    original_name = None
    creation_date = None

    _FRAMEWORK_VERSION = 0.2  # Used when creating a new model file
    # v0.1 - versioning added.
    # v0.2 - Added more LIWC scores and minkowski distance

    model_version = _FRAMEWORK_VERSION  # can be updated if reading and using a model file of a different version

    _outputs_folder = None
    _model_folder = None

    # Categories of features used in the model
    RADICAL_LANGUAGE_ENABLED = True  # RF-IDF Scores, Word Embeddings
    PSYCHOLOGICAL_SIGNALS_ENABLED = True  # LIWC Dictionaries, Minkowski distance
    BEHAVIOURAL_FEATURES_ENABLED = True  # frequency of tweets, followers / following ratio, centrality

    # Feature keys belonging to each category, in data frame column order.
    _PSYCHOLOGICAL_COLUMNS = (
        "clout", "analytic", "tone", "authentic",  # Summary variables
        "anger", "sadness", "anxiety",  # Emotional analysis
        "power", "reward", "risk", "achievement", "affiliation",  # Personal drives
        "i_pronoun", "p_pronoun",  # Personal pronouns
        "minkowski",  # Minkowski distance
    )
    _BEHAVIOURAL_COLUMNS = ("centrality",)
    _RADICAL_LANGUAGE_COLUMNS = ("cap_freq", "violent_freq")
    _MESSAGE_VECTOR_LENGTH = 200  # "the two hundred vectors columns"
    _LABEL_COLUMN = "is_extremist"

    def __init__(self, outputs_folder="outputs", model_folder=None):
        """
        Initialise the classifier wrapper.

        :param outputs_folder: the folder where output files are stored.
        :param model_folder: the folder where the model file will be created;
            defaults to ``outputs_folder`` when not given.
        """
        if model_folder is None:
            model_folder = outputs_folder

        self._outputs_folder = outputs_folder
        self._model_folder = model_folder

    def _get_column_names(self):
        """Return the data frame column names for the enabled feature categories."""
        column_names = []
        if self.PSYCHOLOGICAL_SIGNALS_ENABLED:
            column_names.extend(self._PSYCHOLOGICAL_COLUMNS)
        if self.BEHAVIOURAL_FEATURES_ENABLED:
            column_names.extend(self._BEHAVIOURAL_COLUMNS)
        if self.RADICAL_LANGUAGE_ENABLED:
            column_names.extend(self._RADICAL_LANGUAGE_COLUMNS)
            # One column per message vector component.
            # NOTE(review): the column naming scheme was reconstructed - confirm against existing CSV files.
            column_names.extend("message_vector_{}".format(position)
                                for position in range(1, self._MESSAGE_VECTOR_LENGTH + 1))
        column_names.append(self._LABEL_COLUMN)
        return column_names

    def _build_feature_row(self, feature_data):
        """Return one data frame row (a list) built from a single user's feature dict."""
        row = []
        if self.PSYCHOLOGICAL_SIGNALS_ENABLED:
            row.extend(feature_data[key] for key in self._PSYCHOLOGICAL_COLUMNS)
        if self.BEHAVIOURAL_FEATURES_ENABLED:
            row.extend(feature_data[key] for key in self._BEHAVIOURAL_COLUMNS)
        if self.RADICAL_LANGUAGE_ENABLED:
            row.extend(feature_data[key] for key in self._RADICAL_LANGUAGE_COLUMNS)
            row.extend(feature_data["message_vector"])
        row.append(feature_data[self._LABEL_COLUMN])
        return row

    def get_features_as_df(self, features_file, force_new_dataset=True):
        """
        Reads a JSON features file and converts it to a Pandas dataframe that can be used to train and test
        the classifier. The dataframe is also written next to the features file as ``<features_file>.csv``.

        :param features_file: the location of the JSON features file to convert to a dataframe
        :param force_new_dataset: if true a new CSV file will be created even if one already exists.
        :return: a Pandas dataframe with the features.
        """
        csv_file = "{}.csv".format(features_file)

        # Re-use the cached CSV when allowed; the JSON file is only opened when it is actually needed.
        if not force_new_dataset and os.path.isfile(csv_file):
            return pd.read_csv(csv_file)

        with open(features_file) as json_features_file:
            features = json.load(json_features_file)

        column_names = self._get_column_names()
        # The JSON is a list of messages, each a dict of user id -> feature dict.
        number_of_features = sum(len(message) for message in features)

        rows = []
        row_ids = []
        row_id = 0
        error_count = 0
        for message_number, message in enumerate(features, start=1):
            Logger.logger.print_message(
                "Extracting information from message {} of {} in file {}".format(
                    message_number, len(features), features_file),
                logging_level=1)

            # should only be one user per entry
            # The user ID is not included in the row as it's hexadecimal and not float
            for feature_data in message.values():
                row = self._build_feature_row(feature_data)
                if len(row) == len(column_names):
                    rows.append(row)
                    row_ids.append(row_id)
                else:
                    # Column mismatch, which is probably down to taking a message with no data.
                    error_count += 1
                # The ID advances even for skipped rows so IDs keep matching the input order.
                row_id += 1

            if message:
                user_name = next(iter(message))
                Logger.logger.print_message(
                    "Added a message from user {} to data frame - {} messages of {} completed".format(
                        user_name, message_number, number_of_features),
                    logging_level=1)

        Logger.logger.print_message("Total errors when creating data frame: {}".format(error_count),
                                    logging_level=1)

        # Build the frame in one go (appending row by row is quadratic) with "ID" as the index name.
        df = pd.DataFrame(rows, columns=column_names, index=pd.Index(row_ids, name="ID"))

        # Replace boolean with float
        df = df.replace({False: 0, True: 1})

        df.to_csv(csv_file)
        return df

    def create_model_info_output_file(self, location_of_output_file=None, training_data_csv_location=None):
        """
        If the model has been loaded or trained this function will create a summary text file with information
        relating to the model.

        :param location_of_output_file: The location to save the output file to. When not given a time stamped
            file is created in the outputs folder.
        :param training_data_csv_location: The location of the training data csv. This is used to retrieve the
            name of the feature columns.
        """
        # Check if model has been created
        if not self.creation_date:
            Logger.logger.print_message("Model has not been trained, created, or loaded. Cannot output model data in this state.", logging_level=1)
            return

        Logger.logger.print_message("Creating model info text file")

        # Add summary information
        output_parts = [
            "Model {}, version {}, created at {} \n".format(self.original_name, self.model_version,
                                                            self.creation_date),
            "\nAccuracy: {}\nRecall: {} \nPrecision: {}\nF-Measure: {}\n".format(self.accuracy, self.recall,
                                                                                 self.precision, self.f_measure),
        ]

        # Retrieve the header names if available
        headers = []
        if training_data_csv_location:
            with open(training_data_csv_location, "r") as csv_file:
                headers = next(csv.reader(csv_file), [])

        # Loop through all feature importance scores
        for position, importance in enumerate(self.model.feature_importances_):
            # Plus one to ignore ID field
            if position + 1 < len(headers):
                output_parts.append("\n{}: {}".format(headers[position + 1], importance))
            else:
                output_parts.append("\nFeature {}: {}".format(position, importance))

        # If no name has been set write to outputs folder
        if location_of_output_file:
            file_name = location_of_output_file
        else:
            file_name = os.path.join(
                self._outputs_folder,
                "model-output-{}.txt".format(datetime.today().strftime('%Y-%m-%d-%H%M%S')))

        # Write to file
        with open(file_name, "w") as output_file:
            output_file.write("".join(output_parts))

    def train_model(self, features_file, force_new_dataset=True, model_location=None):
        """
        Trains the model on the provided data, unless the model file already exists and the force new dataset
        flag is False, in which case the existing model file is loaded instead.

        :param features_file: the location of the feature file to be used to train the model
        :param force_new_dataset: If True a new dataset will be created and new model created even if a model
            already exists.
        :param model_location: the location to save the model file to
        """
        # Sets model location based on default folder location and placeholder name if none was given
        if model_location is None:
            model_location = os.path.join(self._model_folder, "predictor.model")

        # if told to force the creation of a new dataset to train off or the model location does not exist
        # then make a new model
        if force_new_dataset or not os.path.isfile(model_location):
            feature_data = self.get_features_as_df(features_file, force_new_dataset)

            # Removes the ID column (present when the data frame was read back from CSV)
            if "ID" in feature_data.columns:
                feature_data = feature_data.drop(columns=["ID"])
            feature_data = feature_data.reset_index(drop=True)

            y = feature_data[['is_extremist']]  # Labels
            X = feature_data.drop(axis=1, labels=['is_extremist'])  # Features

            # Split dataset into training set and test set
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)  # 80% training and 20% test

            # Create the random forest classifier
            # class_weight={0:1,1:5} would give a higher weight to the minority class (is_extremist)
            classifier = RandomForestClassifier(n_estimators=100, max_depth=50, oob_score=True)

            # Train the model using the training sets
            classifier.fit(X_train, y_train.values.ravel())

            y_pred = classifier.predict(X_test)

            # Model Accuracy, how often is the classifier correct?
            self.accuracy = metrics.accuracy_score(y_test, y_pred)
            self.recall = metrics.recall_score(y_test, y_pred)
            self.precision = metrics.precision_score(y_test, y_pred)
            self.f_measure = metrics.f1_score(y_test, y_pred)

            Logger.logger.print_message("Accuracy: {}".format(self.accuracy), logging_level=1)
            Logger.logger.print_message("Recall: {}".format(self.recall), logging_level=1)
            Logger.logger.print_message("Precision: {}".format(self.precision), logging_level=1)
            Logger.logger.print_message("F-Measure: {}".format(self.f_measure), logging_level=1)

            self.model = classifier
            self.original_name = model_location
            self.creation_date = datetime.today().strftime('%Y-%m-%d')

            # write model and accuracy to file
            model_data = {"model": self.model,
                          "original_name": self.original_name,
                          "creation_date": self.creation_date,
                          "accuracy": self.accuracy,
                          "recall": self.recall,
                          "precision": self.precision,
                          "f1": self.f_measure,
                          "version": self._FRAMEWORK_VERSION
                          }

            with open(model_location, "wb") as model_file:
                pickle.dump(model_data, model_file)
        else:
            # Read model and accuracy from file
            # SECURITY: unpickling executes arbitrary code - only load model files from a trusted source.
            with open(model_location, "rb") as model_file:
                saved_file = pickle.load(model_file)

            self.accuracy = saved_file["accuracy"]
            self.recall = saved_file["recall"]
            self.precision = saved_file["precision"]
            self.f_measure = saved_file["f1"]
            self.model = saved_file["model"]
            self.model_version = saved_file["version"]
            self.original_name = saved_file["original_name"]
            self.creation_date = saved_file["creation_date"]

            # A check to identify if the loaded model is of the same version as the tooling.
            # Compared by value: an unpickled float is never the same object as the class constant.
            if self.model_version != self._FRAMEWORK_VERSION:
                Logger.logger.print_message("Model provided is of version {}, tooling is of "
                                            "version {}. Using the model may not work as expected."
                                            .format(self.model_version, self._FRAMEWORK_VERSION))