diff --git a/Pinpoint/Aggregator_NGram.py b/Pinpoint/Aggregator_NGram.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd90c38ac1d2ad9f797883b8f37f734000d51f80
--- /dev/null
+++ b/Pinpoint/Aggregator_NGram.py
@@ -0,0 +1,103 @@
+from sklearn.feature_extraction.text import CountVectorizer
+
+from Pinpoint.Logger import *
+
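+# Module-level vectorizer shared across calls. It generates 1- to 5-grams, although only the
+# uni-, bi-, and tri-grams are bucketed and returned by _get_all_ngrams() below.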
+c_vec = CountVectorizer(ngram_range=(1, 5))
+
+
+class n_gram_aggregator():
+ """
+ This class is used to retrieve the most common NGrams for a given dataset corpus.
+ """
+
+ def _get_average_ngram_count(self, n_grams_dict):
+ """
+        Takes a dict of n-grams and returns the average weighting across them.
+        :param n_grams_dict: a dict mapping each n-gram to its count
+        :return: the average count across all n-grams in the dict
+ """
+ all_count = []
+ for n_gram in n_grams_dict:
+ ng_count = n_grams_dict[n_gram]
+ all_count.append(ng_count)
+
+ average_count = sum(all_count) / len(all_count)
+ # print(all_count)
+ return average_count
+
+ def _get_all_ngrams(self, data):
+ """
+        Returns all n-grams (uni, bi, and tri) for a given piece of text
+ :param data:
+ :return:
+ """
+
+ if type(data) is not list:
+ data = [data]
+
+ # input to fit_transform() should be an iterable with strings
+ ngrams = c_vec.fit_transform(data)
+
+ # needs to happen after fit_transform()
+ vocab = c_vec.vocabulary_
+
+ count_values = ngrams.toarray().sum(axis=0)
+
+ # output n-grams
+ uni_grams = {}
+ bi_grams = {}
+ tri_grams = {}
+
+ for ng_count, ng_text in sorted([(count_values[i], k) for k, i in vocab.items()], reverse=True):
+ sentence_length = len(ng_text.split(" "))
+
+ if sentence_length == 3:
+ tri_grams[ng_text] = ng_count
+ elif sentence_length == 2:
+ bi_grams[ng_text] = ng_count
+ elif sentence_length == 1:
+ uni_grams[ng_text] = ng_count
+
+ return uni_grams, bi_grams, tri_grams
+
+ def _get_popular_ngrams(self, ngrams_dict):
+ """
+        Returns the n-grams for a given piece of text that are the most popular (i.e. their weighting is
+        above the average n-gram weighting)
+ :param ngrams_dict:
+ :return:
+ """
+ average_count = self._get_average_ngram_count(ngrams_dict)
+
+ popular_ngrams = {}
+ for n_gram in ngrams_dict:
+ ng_count = ngrams_dict[n_gram]
+
+ if ng_count >= average_count:
+ popular_ngrams[n_gram] = ng_count
+ return popular_ngrams
+
+ def get_ngrams(self, data=None, file_name_to_read=None):
+ """
+ Wrapper function for returning uni, bi, and tri grams that are the most popular (above the average weighting in
+ a given piece of text).
+ :param data:
+ :param file_name_to_read:
+ :return:
+ """
+ logger().print_message("Getting Ngrams")
+
+ if data is None and file_name_to_read is None:
+ raise Exception("No data supplied to retrieve n_grams")
+
+ if data is None and file_name_to_read is not None:
+ with open(file_name_to_read, 'r') as file_to_read:
+ data = file_to_read.read()
+
+ uni_grams, bi_grams, tri_grams = self._get_all_ngrams(data)
+
+ popular_uni_grams = list(self._get_popular_ngrams(uni_grams).keys())
+ popular_bi_grams = list(self._get_popular_ngrams(bi_grams).keys())
+ popular_tri_grams = list(self._get_popular_ngrams(tri_grams).keys())
+
+ return popular_uni_grams, popular_bi_grams, popular_tri_grams
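+
+# Illustrative usage (assuming a plain-text corpus string):
+#   uni, bi, tri = n_gram_aggregator().get_ngrams(data="some corpus text ...")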
diff --git a/Pinpoint/Aggregator_TfIdf.py b/Pinpoint/Aggregator_TfIdf.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f10ef3c42a68eb1a8f40190c4b7b7c876071b82
--- /dev/null
+++ b/Pinpoint/Aggregator_TfIdf.py
@@ -0,0 +1,41 @@
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+from Pinpoint.Logger import *
+
+
+class tf_idf_aggregator():
+ """
+ A wrapper class around SKlearn for retrieving TF-IDF scores.
+ """
+
+ def get_tf_idf_scores(self, ngrams_vocabulary, corpus_data=None, file_name_to_read=None):
+ """
+        Used to generate TF-IDF scores based on a vocabulary of n-grams and a data corpus.
+ :param ngrams_vocabulary:
+ :param corpus_data:
+ :param file_name_to_read:
+        :return: a dictionary mapping each n-gram in the vocabulary to its TF-IDF score
+ """
+ logger.print_message("Getting TF IDF scores")
+
+ if corpus_data is None and file_name_to_read is None:
+ raise Exception("No data supplied to retrieve n_grams")
+
+ if corpus_data is None and file_name_to_read is not None:
+ with open(file_name_to_read, 'r') as file_to_read:
+ corpus_data = file_to_read.read()
+
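+        # Note: the vectorizer only builds n-grams within its ngram_range, so vocabulary entries longer
+        # than two words (e.g. tri-grams) may never be matched with the range configured below.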
+ tfidf = TfidfVectorizer(vocabulary=ngrams_vocabulary, stop_words='english', ngram_range=(1, 2))
+ tfs = tfidf.fit_transform([corpus_data])
+
+        # get_feature_names() was removed in newer scikit-learn releases; prefer get_feature_names_out() when available
+        feature_names = tfidf.get_feature_names_out() if hasattr(tfidf, "get_feature_names_out") else tfidf.get_feature_names()
+ corpus_index = [n for n in corpus_data]
+ rows, cols = tfs.nonzero()
+
+ dict_of_scores = {}
+
+ for row, col in zip(rows, cols):
+ dict_of_scores[feature_names[col]] = tfs[row, col]
+            logger.print_message("{}: {}".format((feature_names[col], corpus_index[row]), tfs[row, col]))
+
+ return dict_of_scores
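+
+# Illustrative usage (assuming `ngrams` is a list of n-gram strings and `text` is the corpus):
+#   scores = tf_idf_aggregator().get_tf_idf_scores(ngrams, corpus_data=text)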
diff --git a/Pinpoint/Aggregator_Word2Vec.py b/Pinpoint/Aggregator_Word2Vec.py
new file mode 100644
index 0000000000000000000000000000000000000000..9be57952774f125cb8ac53dc60b8a0afb32b6256
--- /dev/null
+++ b/Pinpoint/Aggregator_Word2Vec.py
@@ -0,0 +1,32 @@
+from gensim.models import Word2Vec
+
+
+class word_2_vec_aggregator():
+ """
+    A wrapper class around gensim used for creating a Word2Vec model
+ """
+
+ def get_model(self, list_of_sentences):
+ """
+ Used to retrieve the model
+ :param list_of_sentences:
+ :return: the model
+ """
+
+ list_of_sentences_in_nested_list = []
+
+ for sentence in list_of_sentences:
+
+ # Skip unigrams
+ if " " not in sentence:
+ continue
+
+ list_of_sentences_in_nested_list.append(sentence.split(" "))
+
+        model = Word2Vec(min_count=1, window=5)  # uses gensim's default vector size of 100 with a window size of 5
+ model.build_vocab(list_of_sentences_in_nested_list) # prepare the model vocabulary
+ model.model_trimmed_post_training = False
+ model.train(list_of_sentences_in_nested_list, total_examples=model.corpus_count,
+ epochs=model.epochs) # train word vectors
+
+ return model
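+
+# Illustrative usage (unigrams are skipped, so pass multi-word n-grams):
+#   model = word_2_vec_aggregator().get_model(["some bi gram", "another n gram"])
+#   vector = model.wv["some"]  # a 100-dimensional vector with the default settings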
diff --git a/Pinpoint/Aggregator_WordingChoice.py b/Pinpoint/Aggregator_WordingChoice.py
new file mode 100644
index 0000000000000000000000000000000000000000..10b6e1b49295a5650883c11bcb55989f95273242
--- /dev/null
+++ b/Pinpoint/Aggregator_WordingChoice.py
@@ -0,0 +1,51 @@
+import os
+
+
+class wording_choice_aggregator():
+ """
+ A class used for retrieving frequencies based on wording in a message
+ """
+
+ def get_frequency_of_capatalised_words(self, text):
+ """
+        A function used to retrieve the frequency of capitalised words in a piece of text
+        :param text:
+        :return: the frequency of capitalised words in the text
+ """
+ number_of_capatalised_words = 0
+ for word in text.split(" "):
+ if word.isupper():
+ number_of_capatalised_words = number_of_capatalised_words + 1
+
+ total_number_of_words = len(text.split(" "))
+ frequency = number_of_capatalised_words / total_number_of_words
+
+ return frequency
+
+ def get_frequency_of_violent_or_curse_words(self, text, violent_words_datasets_location):
+ """
+        A function used for retrieving the frequency of violent or curse words in a piece of text
+        :param text:
+        :param violent_words_datasets_location: a directory of newline-separated violent/curse word files
+        :return: the frequency of violent or curse words in the text
+ """
+
+ dataset_folder = os.path.join(os.getcwd(), violent_words_datasets_location)
+
+ list_of_violent_or_curse_words = []
+
+ # Retrieves all words in all of the files in the violent or curse word datasets
+ for filename in os.listdir(dataset_folder):
+ with open(os.path.join(dataset_folder, filename), 'r') as file:
+
+ for line in file.readlines():
+ line = line.strip().replace("\n", " ").replace(",", "")
+ list_of_violent_or_curse_words.append(line)
+
+ number_of_swear_words = 0
+ for word in text.split(" "):
+ if word in list_of_violent_or_curse_words:
+ number_of_swear_words = number_of_swear_words + 1
+
+ total_number_of_words = len(text.split(" "))
+ frequency = number_of_swear_words / total_number_of_words
+ return frequency
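+
+# Illustrative usage (assuming a directory of newline-separated violent/curse word files):
+#   freq = wording_choice_aggregator().get_frequency_of_violent_or_curse_words(text, "violent-word-datasets")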
diff --git a/Pinpoint/ConfigManager.py b/Pinpoint/ConfigManager.py
new file mode 100644
index 0000000000000000000000000000000000000000..2be7f87b64acdd9189114b774bc9a7d0a6f80e26
--- /dev/null
+++ b/Pinpoint/ConfigManager.py
@@ -0,0 +1,21 @@
+import json
+from pathlib import Path
+
+
+class ConfigManager:
+ """
+    A wrapper class used to abstract Twitter config options. """
+
+ @staticmethod
+ def _get_config(config_path):
+        if not Path(config_path).is_file():
+ raise Exception("The {} config file was not found.".format(config_path))
+
+ with open(config_path) as json_file:
+ twitter_config_dict = json.load(json_file)
+
+ return twitter_config_dict
+
+ @staticmethod
+ def getTwitterConfig():
+ return ConfigManager._get_config("twitterConfig.json")
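+
+# Note: "twitterConfig.json" is resolved relative to the current working directory.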
diff --git a/Pinpoint/FeatureExtraction.py b/Pinpoint/FeatureExtraction.py
new file mode 100644
index 0000000000000000000000000000000000000000..b178e059aa5e3610cb34d5740ceca49030fb9f3d
--- /dev/null
+++ b/Pinpoint/FeatureExtraction.py
@@ -0,0 +1,795 @@
+import ast
+import base64
+import codecs
+import csv
+import gc
+import json
+import os
+import pickle
+import re
+import shutil
+import time
+
+import numpy
+import pandas as pd
+import uuid
+from scipy.spatial import distance
+
+from Pinpoint.Aggregator_NGram import n_gram_aggregator
+from Pinpoint.Aggregator_TfIdf import tf_idf_aggregator
+from Pinpoint.Aggregator_Word2Vec import word_2_vec_aggregator
+from Pinpoint.Aggregator_WordingChoice import wording_choice_aggregator
+from Pinpoint.Grapher import grapher
+from Pinpoint.Logger import logger
+from Pinpoint.Sanitizer import sanitization, sys
+
+
+class feature_extraction():
+ """
+ This class is used to wrap the functionality of aggregating tweets from CSV files and extracting features pertinent
+ to building a random forest extremist classifier.
+ """
+
+ # A graph used to store connections between aggregated users
+ graph = grapher()
+ archived_graphs = [] # an archive of the previous graphs
+ # A list storing dictionaries of user ids and their features.
+ tweet_user_features = []
+ completed_tweet_user_features = [] # has centrality added
+    # The cached model (a Word2Vec model built from TF-IDF-selected n-grams)
+ saved_tf_idf_model = None
+ # A dictionary used for the translation of actual Twitter username to UUID
+ dict_of_users = {}
+
+ # The max size for all data entries (i.e. baseline tweets)
+ MAX_RECORD_SIZE = sys.maxsize # 3050
+
+ # Datasets for training
+ violent_words_dataset_location = None
+ tf_idf_training_dataset_location = None
+ outputs_location = None
+
+ # Used for knowing which columns to access data from. For Twitter data.
+ # Summary variables
+ DEFAULT_USERNAME_COLUMN_ID = 0
+ DEFAULT_DATE_COLUMN_ID = 1
+ DEFAULT_MESSAGE_COLUMN_ID = 2
+ DEFAULT_ANALYTIC_COLUMN_ID = 4
+ DEFAULT_CLOUT_COLUMN_ID = 5
+ DEFAULT_AUTHENTIC_COLUMN_ID = 6
+ DEFAULT_TONE_COLUMN_ID = 7
+ # Emotional Analysis
+ DEFAULT_ANGER_COLUMN_ID = 36
+ DEFAULT_SADNESS_COLUMN_ID = 37
+ DEFAULT_ANXIETY_COLUMN_ID = 35
+ # Personal Drives:
+ DEFAULT_POWER_COLUMN_ID = 62
+ DEFAULT_REWARD_COLUMN_ID = 63
+ DEFAULT_RISK_COLUMN_ID = 64
+ DEFAULT_ACHIEVEMENT_COLUMN_ID = 61
+ DEFAULT_AFFILIATION_COLUMN_ID = 60
+ # Personal pronouns
+ DEFAULT_P_PRONOUN_COLUMN_ID = 13
+ DEFAULT_I_PRONOUN_COLUMN_ID = 19
+
+ # Constants for the fields in the baseline data set (i.e. ISIS magazine/ Stormfront, etc)
+ DEFAULT_BASELINE_MESSAGE_COLUMN_ID = 5
+ # Summary variables
+ DEFAULT_BASELINE_CLOUT_COLUMN_ID = 10
+ DEFAULT_BASELINE_ANALYTIC_COLUMN_ID = 9
+ DEFAULT_BASELINE_TONE_COLUMN_ID = 12
+ DEFAULT_BASELINE_AUTHENTIC_COLUMN_ID = 11
+ # Emotional Analysis
+ DEFAULT_BASELINE_ANGER_COLUMN_ID = 41
+ DEFAULT_BASELINE_SADNESS_COLUMN_ID = 42
+ DEFAULT_BASELINE_ANXIETY_COLUMN_ID = 40
+ # Personal Drives
+ DEFAULT_BASELINE_POWER_COLUMN_ID = 67
+ DEFAULT_BASELINE_REWARD_COLUMN_ID = 68
+ DEFAULT_BASELINE_RISK_COLUMN_ID = 69
+ DEFAULT_BASELINE_ACHIEVEMENT_COLUMN_ID = 66
+ DEFAULT_BASELINE_AFFILIATION_COLUMN_ID = 65
+ # Personal pronouns
+ DEFAULT_BASELINE_P_PRONOUN_COLUMN_ID = 18
+ DEFAULT_BASELINE_I_PRONOUN_COLUMN_ID = 24
+
+ # Used for Minkowski distance
+ _average_clout = 0
+ _average_analytic = 0
+ _average_tone = 0
+ _average_authentic = 0
+ _average_anger = 0
+ _average_sadness = 0
+ average_anxiety = 0
+ average_power = 0
+ average_reward = 0
+ average_risk = 0
+ average_achievement = 0
+ average_affiliation = 0
+ average_p_pronoun = 0
+ average_i_pronoun = 0
+
+    # Used to cache messages to disk to free memory
+ MESSAGE_TMP_CACHE_LOCATION = "message_cache"
+
+ def __init__(self, violent_words_dataset_location=None
+ , baseline_training_dataset_location=None,
+ outputs_location=r"outputs"):
+ """
+ Constructor
+
+        The feature_extraction() class can be initialised with the violent_words_dataset_location,
+        baseline_training_dataset_location, and outputs_location locations. All files in the violent_words_dataset_location
+        directory will be read (one line at a time) and added to the corpus of violent and swear words. The CSV file at
+        baseline_training_dataset_location is used to train the TF-IDF model and a Minkowski distance score is calculated based on the LIWC scores present.
+
+        If the constant variables need to be changed, do so by setting the member variables.
+ """
+
+ # Error if datasets not provided
+ if violent_words_dataset_location is None:
+            raise Exception("No Violent Words dir provided. Provide a directory that contains newline-separated "
+                            "files where each line is a violent, extremist, etc. word")
+
+ if baseline_training_dataset_location is None:
+            raise Exception("No baseline (TF-IDF/ Minkowski) dataset provided. This should be a CSV file containing "
+                            "extremist content and LIWC scores.")
+
+ # Set datasets to member variables
+ self.violent_words_dataset_location = violent_words_dataset_location
+ self.tf_idf_training_dataset_location = baseline_training_dataset_location
+ self.outputs_location = outputs_location
+
+ # Attempt to make the outputs folder if it doesn't exist
+        os.makedirs(outputs_location, exist_ok=True)
+
+ def _reset_stored_feature_data(self):
+ """
+        Resets member variables from a previous run. Importantly, this does not reset the TF-IDF model.
+ :return:
+ """
+
+ # A graph used to store connections between aggregated users
+ self.graph = grapher()
+        self.archived_graphs = []  # an archive of the previous graphs
+ # A list storing dictionaries of user ids and their features.
+ self.tweet_user_features = []
+ self.completed_tweet_user_features = [] # has centrality added
+        # A dictionary used for the translation of actual Twitter username to UUID
+        self.dict_of_users = {}
+
+ # Used for Minkowski distance
+ self._average_clout = 0
+ self._average_analytic = 0
+ self._average_tone = 0
+ self._average_authentic = 0
+ self._average_anger = 0
+ self._average_sadness = 0
+ self.average_anxiety = 0
+ self.average_power = 0
+ self.average_reward = 0
+ self.average_risk = 0
+ self.average_achievement = 0
+ self.average_affiliation = 0
+ self.average_p_pronoun = 0
+ self.average_i_pronoun = 0
+
+ def _get_unique_id_from_username(self, username):
+ """
+ A function used to retrieve a UUID based on a twitter username. If a username has been used before the same UUID
+ will be returned as it is stored in a dictionary.
+ :param username:
+ :return: a string representation of a UUID relating to a Twitter username
+ """
+
+ if username in self.dict_of_users:
+ # username already in dictionary
+ unique_id = self.dict_of_users[username]
+ else:
+ # make new UUID
+ unique_id = uuid.uuid4().hex
+ # stops uuid collisions
+ while unique_id in self.dict_of_users.values():
+ unique_id = uuid.uuid4().hex
+
+ # Add new user id to dictionary
+ self.dict_of_users[username] = unique_id
+
+ # todo it's less efficient writing the whole file every run
+ path = os.path.join(self.outputs_location, "users.json")
+
+ with open(path, 'w') as outfile:
+ json.dump(self.dict_of_users, outfile)
+
+ return unique_id
+
+ def _add_to_graph(self, originating_user_name, message):
+ """
+ A wrapper function used for adding a node/ connection to the graph.
+ :param originating_user_name: the Twitter username
+ :param message: The Tweet
+ """
+
+ # Adds node to graph so that if they don't interact with anyone they still have a centrality
+ self.graph.add_node(originating_user_name)
+
+ # Process mentions
+        mentions = re.findall(r"@([a-zA-Z\-_]+)", message)
+
+ # For all mentions in the tweet add them to the graph as a node
+ for mention in mentions:
+ self.graph.add_edge_wrapper(originating_user_name, mention, 1, "mention")
+
+ # process hashtags
+        hashtags = re.findall(r"#([a-zA-Z\-_]+)", message)
+
+ # For all hashtags in the tweet add them to the graph as a node
+ for hashtag in hashtags:
+ self.graph.add_edge_wrapper(originating_user_name, hashtag, 1, "hashtag")
+
+ def _get_capitalised_word_frequency(self, message):
+ """
+ A wrapper function for returning the frequency of capitalised words in a message.
+ :param message:
+ :return: the frequency of capitalised words in a message.
+ """
+ return wording_choice_aggregator().get_frequency_of_capatalised_words(
+ message) # NEEDS TO BE DONE before lower case
+
+ def _get_violent_word_frequency(self, message):
+ """
+ A wrapper function used to retrieve the frequency of violent words in a message.
+ :param message: a string representation of a social media message
+ :return: The frequency of violent words in the message
+ """
+ return wording_choice_aggregator().get_frequency_of_violent_or_curse_words(message,
+ self.violent_words_dataset_location)
+
+ def _get_tweet_vector(self, message):
+ """
+        A wrapper function used to retrieve the 200-element vector representation (max and average vectors
+        concatenated) of a message.
+        :param message: a string representation of a message
+        :return: a 200-element vector of the tweet
+ """
+ vectors = []
+ tf_idf_model = self._get_tf_idf_model()
+
+ for word in message.split(" "):
+ # todo add back word = sanitization().sanitize(word, self.outputs_location, force_new_data_and_dont_persisit=True)
+ try:
+ vectors.append(tf_idf_model.wv[word])
+ logger().print_message("Word '{}' in vocabulary...".format(word))
+            except KeyError as e:
+                logger().print_message(e)
+                logger().print_message("Word '{}' not in vocabulary...".format(word))
+
+ # Lists of the values used to store the max and average vector values
+ max_value_list = []
+ average_value_list = []
+
+ # Check for if at least one word in the message is in the vocabulary of the model
+        final_array_of_vectors = numpy.zeros(100)
+ if len(vectors) > 0:
+
+ # Loop through the elements in the vectors
+ for iterator in range(vectors[0].size):
+
+ list_of_all_values = []
+
+ # Loop through each vector
+ for vector in vectors:
+ value = vector[iterator]
+ list_of_all_values.append(value)
+
+ average_value = sum(list_of_all_values) / len(list_of_all_values)
+ max_value = max(list_of_all_values)
+ max_value_list.append(max_value)
+ average_value_list.append(average_value)
+
+            final_array_of_vectors = numpy.append(numpy.array([max_value_list]), numpy.array([average_value_list]))
+
+ # Convert array to list
+ list_of_vectors = []
+ for vector in final_array_of_vectors:
+ list_of_vectors.append(vector)
+
+ return list_of_vectors
+
+ def _process_tweet(self, user_name, message, row):
+ """
+ Wrapper function for taking a username and tweet and extracting the features.
+ :param user_name:
+ :param message:
+ :return: a dictionary of all features from the message
+ """
+ self._add_to_graph(user_name, message)
+
+ features_dict = {"cap_freq": self._get_capitalised_word_frequency(message),
+ "violent_freq": self._get_violent_word_frequency(message),
+ "message_vector": self._get_tweet_vector(message)}
+
+
+ return features_dict
+
+ def _get_average_liwc_scores_for_baseline_data(self):
+ """
+        Calculates the average LIWC scores for the baseline dataset; these averages are later used for the Minkowski distance.
+ """
+
+ # Checks if the values have already been set this run, if so don't calculate again
+ # TODO what of the edge case where average clout is 0?
+ if self._average_clout == 0:
+ logger.print_message("Opening dataset {} for LIWC feature extraction and Minkowski distance".format(
+ self.tf_idf_training_dataset_location))
+ baseline_data_set_name = self.tf_idf_training_dataset_location
+
+ clout_list = []
+ analytic_list = []
+ tone_list = []
+ authentic_list = []
+ anger_list = []
+ sadness_list = []
+ anxiety_list = []
+ power_list = []
+ reward_list = []
+ risk_list = []
+ achievement_list = []
+ affiliation_list = []
+ p_pronoun_list = []
+ i_pronoun_list = []
+
+ with open(baseline_data_set_name, 'r', encoding='cp1252') as file:
+ reader = csv.reader(file)
+
+ is_header = True
+ for row in reader:
+
+ if is_header:
+ is_header = False
+ continue
+
+ # Try and access columns, if can't then LIWC fields haven't been set and should be set to 0
+ try:
+ clout = row[self.DEFAULT_BASELINE_CLOUT_COLUMN_ID]
+ analytic = row[self.DEFAULT_BASELINE_ANALYTIC_COLUMN_ID]
+ tone = row[self.DEFAULT_BASELINE_TONE_COLUMN_ID]
+ authentic = row[self.DEFAULT_BASELINE_AUTHENTIC_COLUMN_ID]
+ anger = row[self.DEFAULT_BASELINE_ANGER_COLUMN_ID]
+ sadness = row[self.DEFAULT_BASELINE_SADNESS_COLUMN_ID]
+ anxiety = row[self.DEFAULT_BASELINE_ANXIETY_COLUMN_ID]
+ power = row[self.DEFAULT_BASELINE_POWER_COLUMN_ID]
+ reward = row[self.DEFAULT_BASELINE_REWARD_COLUMN_ID]
+ risk = row[self.DEFAULT_BASELINE_RISK_COLUMN_ID]
+ achievement = row[self.DEFAULT_BASELINE_ACHIEVEMENT_COLUMN_ID]
+ affiliation = row[self.DEFAULT_BASELINE_AFFILIATION_COLUMN_ID]
+ p_pronoun = row[self.DEFAULT_BASELINE_P_PRONOUN_COLUMN_ID]
+ i_pronoun = row[self.DEFAULT_BASELINE_I_PRONOUN_COLUMN_ID]
+ except:
+ clout = 0
+ analytic = 0
+ tone = 0
+ authentic = 0
+ anger = 0
+ sadness = 0
+ anxiety = 0
+ power = 0
+ reward = 0
+ risk = 0
+ achievement = 0
+ affiliation = 0
+ p_pronoun = 0
+ i_pronoun = 0
+
+ clout_list.append(float(clout))
+ analytic_list.append(float(analytic))
+ tone_list.append(float(tone))
+ authentic_list.append(float(authentic))
+ anger_list.append(float(anger))
+ sadness_list.append(float(sadness))
+ anxiety_list.append(float(anxiety))
+ power_list.append(float(power))
+ reward_list.append(float(reward))
+ risk_list.append(float(risk))
+ achievement_list.append(float(achievement))
+ affiliation_list.append(float(affiliation))
+ p_pronoun_list.append(float(p_pronoun))
+ i_pronoun_list.append(float(i_pronoun))
+
+ # Get average for variables, used for distance score. These are member variables so that they don't
+ # have to be re-calculated on later runs
+ self._average_clout = sum(clout_list) / len(clout_list)
+ self._average_analytic = sum(analytic_list) / len(analytic_list)
+ self._average_tone = sum(tone_list) / len(tone_list)
+ self._average_authentic = sum(authentic_list) / len(authentic_list)
+ self._average_anger = sum(anger_list) / len(anger_list)
+ self._average_sadness = sum(sadness_list) / len(sadness_list)
+ self.average_anxiety = sum(anxiety_list) / len(anxiety_list)
+ self.average_power = sum(power_list) / len(power_list)
+ self.average_reward = sum(reward_list) / len(reward_list)
+ self.average_risk = sum(risk_list) / len(risk_list)
+ self.average_achievement = sum(achievement_list) / len(achievement_list)
+ self.average_affiliation = sum(affiliation_list) / len(affiliation_list)
+ self.average_p_pronoun = sum(p_pronoun_list) / len(p_pronoun_list)
+ self.average_i_pronoun = sum(i_pronoun_list) / len(i_pronoun_list)
+
+ return [self._average_clout, self._average_analytic, self._average_tone, self._average_authentic,
+ self._average_anger, self._average_sadness, self.average_anxiety,
+ self.average_power, self.average_reward, self.average_risk, self.average_achievement,
+ self.average_affiliation,
+ self.average_p_pronoun, self.average_i_pronoun]
+
+ def _get_tf_idf_model(self):
+ """
+        A function used to retrieve the Word2Vec model trained on the most important (TF-IDF-scored) n-grams from
+        the baseline dataset. If the model has already been created then the previously created model will be used.
+        :return: a gensim Word2Vec model
+ """
+
+ # if already made model, reuse
+ if self.saved_tf_idf_model is None:
+ logger.print_message("Opening dataset {} for TF-IDF".format(self.tf_idf_training_dataset_location))
+ baseline_data_set_name = self.tf_idf_training_dataset_location
+
+ data_set = ""
+
+ with open(baseline_data_set_name, 'r', encoding='cp1252') as file:
+ reader = csv.reader(file)
+
+ is_header = True
+ for row in reader:
+
+ if is_header:
+ is_header = False
+ continue
+
+ # take quote from dataset and add it to dataset
+ message = row[self.DEFAULT_BASELINE_MESSAGE_COLUMN_ID] # data column
+                    data_set = data_set + message + "\n"
+
+ # clean data set
+ # todo should we be doing sanitization clean_data = sanitization().sanitize(data_set, self.outputs_location) # if so remove line below
+ clean_data = data_set
+
+ # get ngrams
+ uni_grams, bi_grams, tri_grams = n_gram_aggregator().get_ngrams(clean_data)
+ ngrams = uni_grams + bi_grams + tri_grams
+
+            # todo The TF-IDF most important ngrams aren't being used. Should these be used instead of the other ngrams?
+ tf_idf_scores = tf_idf_aggregator().get_tf_idf_scores(ngrams, data_set)
+ number_of_most_important_ngrams = int(len(ngrams) / 2) # number is half all ngrams
+ list_of_most_important_ngrams = sorted(tf_idf_scores, key=tf_idf_scores.get, reverse=True)[
+ :number_of_most_important_ngrams]
+
+ # create a word 2 vec model
+ model = word_2_vec_aggregator().get_model(list_of_sentences=list_of_most_important_ngrams)
+ self.saved_tf_idf_model = model
+ else:
+ model = self.saved_tf_idf_model
+
+ return model
+
+ def open_wrapper(self, location, access_type, list_of_encodings=["utf-8", 'latin-1', 'cp1252']):
+ """
+        A wrapper around the open built-in function that falls back to different encodings.
+        :return: an open file handle using the first encoding that can successfully read the file
+ """
+
+ for encoding in list_of_encodings:
+ try:
+ file = open(location, access_type, encoding=encoding)
+ # Attempt to read file, if fails try other encoding
+ file.readlines()
+ file.seek(0)
+ file.close()
+ file = open(location, access_type, encoding=encoding)
+ return file
+ except LookupError as e:
+ continue
+ except UnicodeDecodeError as e:
+ continue
+
+ raise Exception(
+ "No valid encoding provided for file: '{}'. Encodings provided: '{}'".format(location, list_of_encodings))
+
+ def _add_user_post_db_cache(self, user_id, dict_to_add):
+ """
+ Used to add data to the post message db cache used to free up memory.
+ """
+
+ if not os.path.isdir(self.MESSAGE_TMP_CACHE_LOCATION):
+ os.mkdir(self.MESSAGE_TMP_CACHE_LOCATION)
+
+ # Save file as pickle
+ file_name = "{}-{}.pickle".format(user_id,int(time.time()))
+ file_name = os.path.join(self.MESSAGE_TMP_CACHE_LOCATION, file_name)
+ with open(file_name, 'wb') as pickle_handle:
+            pickle.dump({"description":"a temporary file used for saving memory",
+ "data":dict_to_add}, pickle_handle, protocol=pickle.HIGHEST_PROTOCOL)
+
+ def _get_user_post_db_cache(self, file_name):
+ """
+ Retrieves data from the cache database used to free up memory.
+ """
+ if not os.path.isdir(self.MESSAGE_TMP_CACHE_LOCATION):
+            raise Exception("Attempted to access temporary cache files before files are created")
+
+ if not os.path.isfile(file_name):
+ raise Exception("Attempted to access cache file {}, however, it does not exist".format(file_name))
+
+ with (open(file_name, "rb")) as openfile:
+ cache_data = pickle.load(openfile)
+
+ return cache_data["data"]
+
+ def _delete_user_post_db_cache(self):
+ try:
+ if os.path.isdir(self.MESSAGE_TMP_CACHE_LOCATION):
+ shutil.rmtree(self.MESSAGE_TMP_CACHE_LOCATION)
+ except:
+ pass
+
+ def _get_type_of_message_data(self, data_set_location, has_header=True, is_extremist=None):
+ # Ensure all temp files are deleted
+ self._delete_user_post_db_cache()
+
+ # Counts the total rows in the CSV. Used for progress reporting.
+ print("Starting entity count. Will count '{}'".format(self.MAX_RECORD_SIZE))
+
+ # Read one entry at a time
+ max_chunksize = 1
+ row_count = 0
+
+ for row in pd.read_csv(data_set_location, iterator=True,encoding='latin-1'):
+
+ row_count = row_count + 1
+
+ if row_count >= self.MAX_RECORD_SIZE:
+ break
+
+
+ print("Finished entity count. Count is: '{}'".format(row_count))
+ print("")
+ # Loops through all rows in the dataset CSV file.
+ current_processed_rows = 0
+ is_header = False
+
+ for row in pd.read_csv(data_set_location, iterator=True,encoding='latin-1'):
+ row = row.columns
+ # Makes sure same number for each dataset
+ if current_processed_rows > row_count:
+ break
+
+ # Skips the first entry, as it's the CSV header
+ if has_header and is_header:
+ is_header = False
+ continue
+
+ # Retrieve username
+ try:
+ username = row[self.DEFAULT_USERNAME_COLUMN_ID]
+ date = row[self.DEFAULT_DATE_COLUMN_ID]
+ user_unique_id = self._get_unique_id_from_username(username)
+ except:
+ # if empty entry
+ continue
+ # Attempt to get LIWC scores from csv, if not present return 0's
+ try:
+ # Summary variables
+ clout = float(row[self.DEFAULT_CLOUT_COLUMN_ID])
+ analytic = float(row[self.DEFAULT_ANALYTIC_COLUMN_ID])
+ tone = float(row[self.DEFAULT_TONE_COLUMN_ID])
+ authentic = float(row[self.DEFAULT_AUTHENTIC_COLUMN_ID])
+ # Emotional Analysis
+ anger = float(row[self.DEFAULT_ANGER_COLUMN_ID])
+ sadness = float(row[self.DEFAULT_SADNESS_COLUMN_ID])
+ anxiety = float(row[self.DEFAULT_ANXIETY_COLUMN_ID])
+ # Personal Drives:
+ power = float(row[self.DEFAULT_POWER_COLUMN_ID])
+ reward = float(row[self.DEFAULT_REWARD_COLUMN_ID])
+ risk = float(row[self.DEFAULT_RISK_COLUMN_ID])
+ achievement = float(row[self.DEFAULT_ACHIEVEMENT_COLUMN_ID])
+ affiliation = float(row[self.DEFAULT_AFFILIATION_COLUMN_ID])
+ # Personal pronouns
+ i_pronoun = float(row[self.DEFAULT_I_PRONOUN_COLUMN_ID])
+ p_pronoun = float(row[self.DEFAULT_P_PRONOUN_COLUMN_ID])
+
+ except:
+ # Summary variables
+ clout = 0
+ analytic = 0
+ tone = 0
+ authentic = 0
+ # Emotional Analysis
+ anger = 0
+ sadness = 0
+ anxiety = 0
+ # Personal Drives:
+ power = 0
+ reward = 0
+ risk = 0
+ achievement = 0
+ affiliation = 0
+ # Personal pronouns
+ i_pronoun = 0
+ p_pronoun = 0
+
+ liwc_dict = {
+ "clout": clout,
+ "analytic": analytic,
+ "tone": tone,
+ "authentic": authentic,
+ "anger": anger,
+ "sadness": sadness,
+ "anxiety": anxiety,
+ "power": power,
+ "reward": reward,
+ "risk": risk,
+ "achievement": achievement,
+ "affiliation": affiliation,
+ "i_pronoun": i_pronoun,
+ "p_pronoun": p_pronoun,
+ }
+
+ # Calculate minkowski distance
+ average_row = self._get_average_liwc_scores_for_baseline_data()
+
+ actual_row = [clout, analytic, tone, authentic,
+ anger, sadness, anxiety,
+ power, reward, risk, achievement, affiliation,
+ p_pronoun, i_pronoun
+ ]
+
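+            # p=1 gives the Manhattan (city-block) distance between this message's LIWC scores
+            # and the baseline averages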
+ try:
+ liwc_dict["minkowski"] = distance.minkowski(actual_row, average_row, 1)
+ except ValueError:
+ continue
+
+ # Retrieve Tweet for message
+ tweet = str(row[self.DEFAULT_MESSAGE_COLUMN_ID])
+
+ # clean/ remove markup in dataset
+ sanitised_message = sanitization().sanitize(tweet, self.outputs_location,
+ force_new_data_and_dont_persisit=True)
+
+ # If no message skip entry
+ if not len(tweet) > 0 or not len(sanitised_message) > 0 or sanitised_message == '' or not len(
+ sanitised_message.split(" ")) > 0:
+ continue
+
+ # Process Tweet and save as dict
+ tweet_dict = self._process_tweet(user_unique_id, tweet, row)
+
+ # If the message vector is not 200 skip (meaning that a blank message was processed)
+ if not len(tweet_dict["message_vector"]) == 200:
+ continue
+
+ if is_extremist is not None:
+ tweet_dict["is_extremist"] = is_extremist
+
+ tweet_dict["date"] = date
+
+ # Merge liwc dict with tweet dict
+ tweet_dict = {**tweet_dict, **liwc_dict}
+
+ #tweet_dict["user_unique_id"]= user_unique_id
+
+ self._add_user_post_db_cache(user_unique_id, {user_unique_id: tweet_dict})
+ #self.tweet_user_features.append()
+ # TODO here save to cache json instead of list and graph
+
+ logger().print_message("Added message from user: '{}', from dataset: '{}'. {} rows of {} completed."
+ .format(user_unique_id, data_set_location, current_processed_rows, row_count), 1)
+ current_processed_rows = current_processed_rows + 1
+ print("Finished reading row")
+
+ # Add the centrality (has to be done after all users are added to graph)
+ completed_tweet_user_features = []
+ # Loops through each item in the list which represents each message/ tweet
+
+ # Loop through all data in cache file
+ for cached_message_file in os.listdir(self.MESSAGE_TMP_CACHE_LOCATION):
+ cached_message_file = os.fsdecode(cached_message_file)
+ cached_message_file = os.path.join(self.MESSAGE_TMP_CACHE_LOCATION,cached_message_file)
+
+ # Only process pickle files
+ if not cached_message_file.endswith(".pickle"):
+ continue
+
+ print("Reading cache file: '{}'".format(cached_message_file))
+ cached_message_data = self._get_user_post_db_cache(cached_message_file)
+ # Loops through the data in that tweet (Should only be one entry per tweet).
+ for user_id in cached_message_data.keys():
+ updated_entry = {}
+ updated_entry[user_id] = cached_message_data[user_id]
+ # Adds centrality
+ updated_entry[user_id]["centrality"] = self.graph.get_degree_centrality_for_user(user_id)
+ logger().print_message(
+ "Added '{}' Centrality for user '{}'".format(updated_entry[user_id]["centrality"], user_id), 1)
+ completed_tweet_user_features.append(updated_entry)
+ gc.collect()
+ break # Only one entry per list
+
+
+ self._delete_user_post_db_cache()
+ self.completed_tweet_user_features = self.completed_tweet_user_features + completed_tweet_user_features
+ self.tweet_user_features = []
+ #self.archived_graphs.append(self.graph)
+ self.graph = grapher()
+ print("Finished messages")
+
+ def _get_extremist_data(self, dataset_location):
+ """
+ This function is responsible for aggregating tweets from the extremist dataset, extracting the features, and
+ saving them to a file for a model to be created.
+ """
+
+ self._get_type_of_message_data(data_set_location=dataset_location, is_extremist=True)
+
+ def _get_counterpoise_data(self, dataset_location):
+ """
+ This function is responsible for aggregating tweets from the counterpoise (related to the topic but from
+ legitimate sources, e.g. news outlets) dataset, extracting the features, and saving them to a file for a
+ model to be created.
+ """
+
+ self._get_type_of_message_data(data_set_location=dataset_location, is_extremist=False)
+
+ def _get_standard_tweets(self, dataset_location):
+ """
+ This function is responsible for aggregating tweets from the baseline (random sample of twitter posts)
+ dataset, extracting the features, and saving them to a file for a model to be created.
+ """
+
+ self._get_type_of_message_data(data_set_location=dataset_location, is_extremist=False)
+
+ def dump_features_for_list_of_datasets(self, feature_file_path_to_save_to, list_of_dataset_locations,
+ force_new_dataset=True):
+ """
+ Saves features representing a provided dataset to a json file. Designed to be used for testing after a
+ model has been created.
+        :param feature_file_path_to_save_to: the JSON file path to save the extracted features to
+        :param list_of_dataset_locations: a list of CSV dataset locations to extract features from
+        :param force_new_dataset: if True the features are re-extracted even if the feature file already exists
+ """
+
+ self._reset_stored_feature_data()
+
+ if force_new_dataset or not os.path.isfile(feature_file_path_to_save_to):
+ for dataset in list_of_dataset_locations:
+ self._get_type_of_message_data(data_set_location=dataset, is_extremist=None)
+
+ with open(feature_file_path_to_save_to, 'w') as outfile:
+ json.dump(self.completed_tweet_user_features, outfile, indent=4)
+
+ else:
+ with open(feature_file_path_to_save_to, 'r') as file:
+ data = file.read()
+
+ # parse file
+ self.completed_tweet_user_features = json.loads(data)
+
+ def dump_training_data_features(self, feature_file_path_to_save_to, extremist_data_location,
+ baseline_data_location, force_new_dataset=True):
+ """
+        The entrypoint function, used to dump all features, for all users in the extremist, counterpoise, and baseline
+        datasets to a json file.
+ :param feature_file_path_to_save_to: The filepath to save the datasets to
+ """
+
+ self._reset_stored_feature_data()
+
+ if force_new_dataset or not os.path.isfile(feature_file_path_to_save_to):
+ print("Starting baseline messages")
+ self._get_standard_tweets(baseline_data_location)
+ print("Starting extremist messages")
+ self._get_extremist_data(extremist_data_location)
+
+
+ with open(feature_file_path_to_save_to, 'w') as outfile:
+ json.dump(self.completed_tweet_user_features, outfile, indent=4)
diff --git a/Pinpoint/Grapher.py b/Pinpoint/Grapher.py
new file mode 100644
index 0000000000000000000000000000000000000000..638c1e11f8b082a41b7709b0db8d63dd0400099f
--- /dev/null
+++ b/Pinpoint/Grapher.py
@@ -0,0 +1,60 @@
+import networkx as nx
+
+
+class grapher():
+ """
+ A wrapper class used for generating a graph for interactions between users
+ """
+ graph = None
+
+ def __init__(self):
+ """
+ Constructor.
+ """
+ self.graph = nx.DiGraph()
+
+ def add_edge_wrapper(self, node_1_name, node_2_name, weight, relationship):
+ """
+ A wrapper function used to add an edge connection or node.
+ :param node_1_name: from
+ :param node_2_name: to
+ :param weight:
+ :param relationship:
+ :return:
+ """
+ self.graph.add_edge(node_1_name, node_2_name, weight=weight, relation=relationship)
+
+ def add_node(self, node_name):
+ """
+ A wrapper function that adds a node with no edges to the graph
+ :param node_name:
+ """
+ self.graph.add_node(node_name)
+
+ def get_info(self):
+ """
+ Retrieves information about the graph
+ :return:
+ """
+ return nx.info(self.graph)
+
+ def show_graph(self):
+ """
+        Computes a spring layout for the graph (note: this calculates node positions but does not render anything)
+ :return:
+ """
+ nx.spring_layout(self.graph)
+
+ def get_degree_centrality_for_user(self, user_name):
+ """
+ Returns the Degree of Centrality for a given user present in the graph
+ :param user_name:
+ :return: the Degree of Centrality for a given user present in the graph
+ """
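+        # networkx normalises degree centrality by the number of other nodes in the graph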
+ centrality = nx.degree_centrality(self.graph)
+ return centrality[user_name]
+
+ # todo implement
+ # def get_eigenvector_centrality_for_user(self, user_name):
+ # centrality = nx.eigenvector_centrality(self.graph)
+ # return centrality[user_name]
diff --git a/Pinpoint/Logger.py b/Pinpoint/Logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..d165f17e94835e8b122033c6c4350d7eb93f4866
--- /dev/null
+++ b/Pinpoint/Logger.py
@@ -0,0 +1,21 @@
+from datetime import datetime
+
+
+class logger():
+ """
+    A wrapper class around the Python print function used to only print when debugging is enabled or the
+    logging level is high enough
+ """
+ DEBUG = False
+
+ @staticmethod
+ def print_message(message, logging_level=0):
+ """
+        A wrapper function around the Python print function used to only print when appropriate
+        :param message: the message to print
+        :param logging_level: if 1 or above the message is printed irrespective of whether DEBUG mode is enabled
+ """
+ if logging_level >= 1 or logger.DEBUG:
+ now = datetime.now()
+ current_time = now.strftime("%H:%M:%S")
+ print("{} | {}".format(current_time, message))
diff --git a/Pinpoint/RandomForest.py b/Pinpoint/RandomForest.py
new file mode 100644
index 0000000000000000000000000000000000000000..a91c32be496fb8af669989032915c7c6184bec32
--- /dev/null
+++ b/Pinpoint/RandomForest.py
@@ -0,0 +1,374 @@
+import csv
+import json
+import os
+import pickle
+from datetime import datetime
+
+import pandas
+import pandas as pd
+from sklearn import metrics
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import train_test_split
+
+from Pinpoint import Logger
+
+
+class random_forest():
+ """
+ A class used for creating a random forest binary classifier.
+ """
+
+ model = None
+ accuracy = None
+ precision = None
+ recall = None
+ f_measure = None
+
+ # Model variables populated on creation or reading of file
+
+ original_name = None
+ creation_date = None
+
+ _FRAMEWORK_VERSION = 0.2 # Used when creating a new model file
+ # v0.1 - versioning added.
+ # v0.2 - Added more LIWC scores and minkowski distance
+
+ model_version = _FRAMEWORK_VERSION # can be updated if reading and using a model file of a different version
+
+ _outputs_folder = None
+ _model_folder = None
+
+ # Categories of features used in the model
+    RADICAL_LANGUAGE_ENABLED = True  # TF-IDF Scores, Word Embeddings
+ PSYCHOLOGICAL_SIGNALS_ENABLED = True # LIWC Dictionaries, Minkowski distance
+ BEHAVIOURAL_FEATURES_ENABLED = True # frequency of tweets, followers / following ratio, centrality
+
+ def __init__(self, outputs_folder="outputs", model_folder=None):
+ """
+ Constructor
+
+        The random_forest() class can be initialised with an outputs_folder and a model_folder. The outputs folder is
+        where output files are stored and the model folder is where the model will be created if not overwritten.
+ """
+
+ if model_folder is None:
+ model_folder = outputs_folder
+
+ self._outputs_folder = outputs_folder
+ self._model_folder = model_folder
+
+ def get_features_as_df(self, features_file, force_new_dataset=True):
+ """
+        Reads a JSON features file and converts it to a Pandas dataframe that can be used to train and test the classifier.
+ :param features_file: the location of the JSON features file to convert to a dataframe
+ :param force_new_dataset: if true a new CSV file will be created even if one already exists.
+ :return: a Pandas dataframe with the features.
+ """
+
+ with open(features_file) as json_features_file:
+ csv_file = "{}.csv".format(features_file)
+
+ if force_new_dataset or not os.path.isfile(csv_file):
+ features = json.load(json_features_file)
+
+ # todo remove the data for the features not being used.
+ filtered_list_after_filters_applied = []
+
+ # If any of the filters are not true remove the features not requested
+ column_names = []
+
+ if self.PSYCHOLOGICAL_SIGNALS_ENABLED:
+ column_names = column_names + ["clout", "analytic", "tone", "authentic",
+ "anger", "sadness", "anxiety",
+ "power", "reward", "risk", "achievement", "affiliation",
+ "i_pronoun", "p_pronoun",
+ "minkowski"]
+ if self.BEHAVIOURAL_FEATURES_ENABLED:
+ column_names = column_names + ['centrality']
+
+ if self.RADICAL_LANGUAGE_ENABLED:
+ # Add column names
+ column_names = column_names + ["cap_freq", "violent_freq"]
+ # Add the two hundred vectors columns
+ for iterator in range(1, 201):
+ column_names.append("message_vector_{}".format(iterator))
+
+ column_names = column_names + ['is_extremist']
+
+ if not self.BEHAVIOURAL_FEATURES_ENABLED or not self.PSYCHOLOGICAL_SIGNALS_ENABLED or self.RADICAL_LANGUAGE_ENABLED:
+
+ # Loops through list of dicts (messages)
+ number_of_processed_messages = 0
+ for message in features:
+ number_of_processed_messages = number_of_processed_messages + 1
+ Logger.logger.print_message(
+ "Extracting information from message {} of {} in file {}".format(
+ number_of_processed_messages,
+ len(features),
+ features_file),
+ logging_level=1)
+
+ # Loops through dict keys (usernames)
+ for user in message.keys():
+
+ message_features = message[user]
+
+ feature_dict = {}
+
+ if self.PSYCHOLOGICAL_SIGNALS_ENABLED:
+ # Summary variables
+ feature_dict["clout"] = message_features["clout"]
+ feature_dict["analytic"] = message_features["analytic"]
+ feature_dict["tone"] = message_features["tone"]
+ feature_dict["authentic"] = message_features["authentic"]
+
+ # Emotional Analysis
+ feature_dict["anger"] = message_features["anger"]
+ feature_dict["sadness"] = message_features["sadness"]
+ feature_dict["anxiety"] = message_features["anxiety"]
+
+ # Personal Drives
+ feature_dict["power"] = message_features["power"]
+ feature_dict["reward"] = message_features["reward"]
+ feature_dict["risk"] = message_features["risk"]
+ feature_dict["achievement"] = message_features["achievement"]
+ feature_dict["affiliation"] = message_features["affiliation"]
+
+ # Personal Pronouns
+ feature_dict["i_pronoun"] = message_features["i_pronoun"]
+ feature_dict["p_pronoun"] = message_features["p_pronoun"]
+
+ # Minkowski distance
+ feature_dict["minkowski"] = message_features["minkowski"]
+
+ if self.BEHAVIOURAL_FEATURES_ENABLED:
+ #feature_dict['post_freq'] = message_features['post_freq']
+ #feature_dict['follower_freq'] = message_features['follower_freq']
+ feature_dict['centrality'] = message_features['centrality']
+
+ if self.RADICAL_LANGUAGE_ENABLED:
+ feature_dict["message_vector"] = message_features["message_vector"]
+ feature_dict["violent_freq"] = message_features["violent_freq"]
+ feature_dict["cap_freq"] = message_features["cap_freq"]
+
+ feature_dict['is_extremist'] = message_features['is_extremist']
+
+ user = {user: feature_dict}
+ filtered_list_after_filters_applied.append(user)
+
+ number_of_features = len(filtered_list_after_filters_applied)
+
+ # Creates the columns for the data frame
+ df = pd.DataFrame(
+ columns=column_names)
+
+ completed_features = 0
+ iterator = 0
+ error_count = 0
+ for message in features:
+ # should only be one user per entry
+ for user_id in message:
+ feature_data = message[user_id]
+                    # ID is not included as it's hexadecimal and not float
+
+ row = []
+
+ if self.PSYCHOLOGICAL_SIGNALS_ENABLED:
+ clout = feature_data['clout']
+ analytic = feature_data['analytic']
+ tone = feature_data['tone']
+ authentic = feature_data['authentic']
+
+ anger = feature_data["anger"]
+ sadness = feature_data["sadness"]
+ anxiety = feature_data["anxiety"]
+ power = feature_data["power"]
+ reward = feature_data["reward"]
+ risk = feature_data["risk"]
+ achievement = feature_data["achievement"]
+ affiliation = feature_data["affiliation"]
+ i_pronoun = feature_data["i_pronoun"]
+ p_pronoun = feature_data["p_pronoun"]
+ minkowski = feature_data["minkowski"]
+
+ row = row + [clout, analytic, tone, authentic, anger, sadness, anxiety, power,
+ reward, risk, achievement, affiliation, i_pronoun, p_pronoun, minkowski]
+
+ if self.BEHAVIOURAL_FEATURES_ENABLED:
+ #post_freq = feature_data['post_freq']
+ #follower_freq = feature_data['follower_freq']
+ centrality = feature_data['centrality']
+
+ row = row + [#post_freq, follower_freq,
+ centrality]
+
+ if self.RADICAL_LANGUAGE_ENABLED:
+ cap_freq = feature_data['cap_freq']
+ violent_freq = feature_data['violent_freq']
+ message_vector = feature_data['message_vector']
+
+ row = row + [cap_freq, violent_freq] + message_vector
+
+ is_extremist = feature_data['is_extremist']
+
+ row = row + [is_extremist]
+ try:
+ df.loc[iterator] = row
+ except ValueError as e:
+ print(e)
+ error_count = error_count + 1
+                        pass  # if error with value probably column mismatch which is down to taking a message with no data
+
+ iterator = iterator + 1
+ completed_features = completed_features + 1
+ user_name = list(message.keys())[0]
+ Logger.logger.print_message(
+ "Added a message from user {} to data frame - {} messages of {} completed".format(user_name,
+ completed_features,
+ number_of_features),
+ logging_level=1)
+
+ Logger.logger.print_message("Total errors when creating data frame: {}".format(error_count),
+ logging_level=1)
+
+ # Replace boolean with float
+ df.replace({False: 0, True: 1}, inplace=True)
+
+ # Sets ID field
+ df.index.name = "ID"
+ df.to_csv("{}.csv".format(features_file))
+
+ else:
+ df = pandas.read_csv(csv_file)
+
+ return df
+
+ def create_model_info_output_file(self, location_of_output_file = None, training_data_csv_location = None):
+ """
+ If the model has been loaded or trained this function will create a summary text file with information relating to
+ the model.
+ :param location_of_output_file: The location to save the output file to.
+ :param training_data_csv_location: The location of the training data csv. This is used to retrieve the name of the
+ feature columns.
+ """
+
+ # Check if model has been created
+ if not self.creation_date:
+ Logger.logger.print_message("Model has not been trained, created, or loaded. Cannot output model data in this state.",logging_level=1)
+ else:
+ Logger.logger.print_message("Creating model info text file")
+ output_text = ""
+
+ # Add summary information
+ output_text += "Model {}, version {}, created at {} \n".format(self.original_name, self.model_version, self.creation_date)
+ output_text += "\nAccuracy: {}\nRecall: {} \nPrecision: {}\nF-Measure: {}\n".format(self.accuracy, self.recall,
+ self.precision, self.f_measure)
+
+ # Retrieve the header names if available
+ if training_data_csv_location:
+ with open(training_data_csv_location, "r") as csv_file:
+ reader = csv.reader(csv_file)
+ headers = next(reader)
+
+ # Loop through all feature importance scores
+ for iterator in range(len(self.model.feature_importances_)):
+ if training_data_csv_location:
+ # Plus one to ignore ID field
+ output_text += "\n{}: {}".format(headers[iterator+1], self.model.feature_importances_[iterator])
+ else:
+ output_text += "\nFeature {}: {}".format(iterator,self.model.feature_importances_[iterator])
+
+ # If no name has been set write to outputs folder
+ if location_of_output_file:
+ file_name = location_of_output_file
+ else:
+ file_name = os.path.join(self._outputs_folder,"model-output-{}.txt".format(datetime.today().strftime('%Y-%m-%d-%H%M%S')))
+
+ # Write to file
+ with open(file_name, "w") as output_file:
+ output_file.write(output_text)
+
+ def train_model(self, features_file, force_new_dataset=True, model_location=None):
+ """
+        Trains the model on the provided data unless the model file already exists; if the force new dataset flag is True a new model is always created.
+ :param features_file: the location of the feature file to be used to train the model
+ :param force_new_dataset: If True a new dataset will be created and new model created even if a model already exists.
+ :param model_location: the location to save the model file to
+ """
+
+ # Sets model location based on default folder location and placeholder name if none was given
+ if model_location is None:
+ model_location = os.path.join(self._model_folder, "predictor.model")
+
+ # if told to force the creation of a new dataset to train off or the model location does not exist then make a new model
+ if force_new_dataset or not os.path.isfile(model_location):
+
+ # Import train_test_split function
+ feature_data = self.get_features_as_df(features_file, force_new_dataset)
+
+ # Removes index column
+ if "ID" in feature_data.keys():
+ feature_data.drop(feature_data.columns[0], axis=1, inplace=True)
+ feature_data.reset_index(drop=True, inplace=True)
+
+ y = feature_data[['is_extremist']] # Labels
+ X = feature_data.drop(axis=1, labels=['is_extremist']) # Features
+
+ # Split dataset into training set and test set
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # 80% training and 20% test
+
+            # Create a random forest classifier
+            random_forest = RandomForestClassifier(n_estimators=100, max_depth=50, oob_score=True
+                                                   )  # class_weight={0:1,1:5} # A higher weight for the minority class (is_extremist)
+
+ # Train the model using the training sets y_pred=random_forest.predict(X_test)
+ random_forest.fit(X_train, y_train.values.ravel())
+
+ y_pred = random_forest.predict(X_test)
+
+ # Model Accuracy, how often is the classifier correct?
+ self.accuracy = metrics.accuracy_score(y_test, y_pred)
+ self.recall = metrics.recall_score(y_test, y_pred)
+ self.precision = metrics.precision_score(y_test, y_pred)
+ self.f_measure = metrics.f1_score(y_test, y_pred)
+
+ Logger.logger.print_message("Accuracy: {}".format(self.accuracy), logging_level=1)
+ Logger.logger.print_message("Recall: {}".format(self.recall), logging_level=1)
+ Logger.logger.print_message("Precision: {}".format(self.precision), logging_level=1)
+ Logger.logger.print_message("F-Measure: {}".format(self.f_measure), logging_level=1)
+
+ self.model = random_forest
+ self.original_name = model_location
+ self.creation_date = datetime.today().strftime('%Y-%m-%d')
+
+            # write model and accuracy to file
+ model_data = {"model": self.model,
+ "original_name": self.original_name,
+ "creation_date": self.creation_date,
+ "accuracy": self.accuracy,
+ "recall": self.recall,
+ "precision": self.precision,
+ "f1": self.f_measure,
+ "version": self._FRAMEWORK_VERSION
+ }
+
+ pickle.dump(model_data, open(model_location, "wb"))
+
+ else:
+ # Read model and accuracy from file
+ saved_file = pickle.load(open(model_location, "rb"))
+
+ self.accuracy = saved_file["accuracy"]
+ self.recall = saved_file["recall"]
+ self.precision = saved_file["precision"]
+ self.f_measure = saved_file["f1"]
+ self.model = saved_file["model"]
+ self.model_version = saved_file["version"]
+ self.original_name = saved_file["original_name"]
+ self.creation_date = saved_file["creation_date"]
+
+ # A check to identify if the loaded model is of the same version as the tooling
+            if self.model_version != self._FRAMEWORK_VERSION:
+ Logger.logger.print_message("Model provided is of version {}, tooling is of "
+ "version {}. Using the model may not work as expected."
+ .format(self.model_version, self._FRAMEWORK_VERSION))
\ No newline at end of file
diff --git a/Pinpoint/Sanitizer.py b/Pinpoint/Sanitizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f025934fb42a20c8fcfb9d640f9077264c7f8190
--- /dev/null
+++ b/Pinpoint/Sanitizer.py
@@ -0,0 +1,131 @@
+import os.path
+import re
+import uuid
+
+from nltk import *
+from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
+
+from Pinpoint.Logger import *
+
+# If NLTK data doesn't exist, downloads it
+try:
+ tagged = pos_tag(["test"])
+except LookupError:
+ download()
+
+
+# nltk.download() #todo how to get this to run once?
+
+class sanitization():
+ """
+    This class is used to sanitize a given corpus of data: removing stop words, stemming words, removing small
+    words, removing non-alphabetic characters, and setting words to lower case. To save on repeat runs a local copy of the
+    sanitized corpus is saved and reused unless this feature is overridden.
+ """
+
+ def sanitize(self, text, output_folder, force_new_data_and_dont_persisit=False):
+ """
+ Entry function for sanitizing text
+        :param text:
+        :param output_folder: the folder where the sanitized text file is written
+        :param force_new_data_and_dont_persisit:
+ :return: sanitized text
+ """
+ sanitize_file_name = os.path.join(output_folder, "{}-sanitized_text.txt".format(uuid.uuid4()))
+ final_text = ""
+
+ # If a file exists don't sanitize given text
+ if os.path.isfile(sanitize_file_name) and not force_new_data_and_dont_persisit:
+ logger.print_message("Sanitized file exists. Using data")
+
+ with open(sanitize_file_name, 'r', encoding="utf8") as file_to_write:
+ final_text = file_to_write.read()
+
+ else:
+ total_words = len(text.split(" "))
+ number = 0
+ logger.print_message("Starting sanitization... {} words to go".format(total_words))
+ for word in text.split(" "):
+ number = number + 1
+ word = self.remove_non_alpha(word)
+ word = self.lower(word)
+ word = self.stemmer(word)
+ word = self.remove_stop_words(word)
+ word = self.remove_small_words(word)
+
+ if word is None:
+ continue
+
+ final_text = final_text + word + " "
+ logger.print_message("Completed {} of {} sanitized words".format(number, total_words))
+
+ final_text = final_text.replace(" ", " ")
+
+ if not force_new_data_and_dont_persisit:
+ with open(sanitize_file_name, 'w', encoding="utf8") as file_to_write:
+ file_to_write.write(final_text)
+
+ final_text = final_text.strip()
+ return final_text
+
+ def stemmer(self, word):
+ """
+        Gets the stem of a word
+        :param word:
+        :return: the stemmed word using the Porter stemmer
+ """
+
+ porter = PorterStemmer()
+
+        # todo should another stemmer be assessed?
+ # lancaster = LancasterStemmer()
+ # stemmed_word = lancaster.stem(word)
+ stemmed_word = porter.stem(word)
+
+ return stemmed_word
+
+ def lower(self, word):
+ """
+ get the lower case representation of words
+ :param word:
+ :return: the lowercase representation of the word
+ """
+ return word.lower()
+
+ def remove_stop_words(self, text):
+ """
+ Remove stop words
+ :param text:
+        :return: the text with stop words removed
+ """
+
+ text_without_stopwords = [word for word in text.split() if word not in ENGLISH_STOP_WORDS]
+
+ final_string = ""
+
+ for word in text_without_stopwords:
+ final_string = final_string + word + " "
+
+ return final_string
+
+ def remove_non_alpha(self, word):
+ """
+ Removes non alphabet characters (Excluding spaces)
+ :param word:
+ :return: the word with non-alpha characters removed
+ """
+ word = word.replace("\n", " ").replace("\t", " ").replace(" ", " ")
+ regex = re.compile('[^a-zA-Z ]')
+
+ return regex.sub('', word)
+
+ def remove_small_words(self, word, length_to_remove_if_not_equal=4):
+ """
+        Removes words that are too small; by default words of 3 characters or fewer are removed.
+ :param word:
+ :param length_to_remove_if_not_equal:
+ :return: "" if word below 3 characters or the word if above
+ """
+
+ new_word = ""
+ if len(word) >= length_to_remove_if_not_equal:
+ new_word = word
+
+ return new_word
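+
+# Illustrative usage:
+#   clean_text = sanitization().sanitize("Some RAW text!", "outputs", force_new_data_and_dont_persisit=True)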
diff --git a/Pinpoint/Serializer.py b/Pinpoint/Serializer.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8ef4687228f175ec529ec55a4f4d7f6c3319e97
--- /dev/null
+++ b/Pinpoint/Serializer.py
@@ -0,0 +1,20 @@
+# todo This file should be used to store common serialisations across aggregating data
+
+def createPostDict(date, post_text, likes, comments, shares, source="self"):
+ '''
+ Creates a dictionary containing the pertinent information from a social media post. This should later be added to a list
+ of other posts from that account and then added to a master dictionary.
+ :param date:
+ :param post_text:
+ :param likes:
+ :param comments:
+ :param shares:
+ :param source:
+ :return: a dictionary containing pertinent post information
+ '''
+ return {"text": post_text, "likes": likes, "comments": comments, "shares": shares, "source": source, "date": date}
+
+
+def createWholeUserDict(unique_id, reddit_list, instagram_list, twitter_list, survey_data):
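+ '''
+ Combines a user's id, their posts from each platform, and their survey data into a single dictionary.
+ :param unique_id:
+ :param reddit_list:
+ :param instagram_list:
+ :param twitter_list:
+ :param survey_data:
+ :return: a dictionary containing all of the collected data for one user
+ '''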
+ return {"id": unique_id, "reddit": reddit_list, "instagram": instagram_list, "twitter": twitter_list,
+ "survey": survey_data}
diff --git a/Pinpoint/__pycache__/Aggregator_NGram.cpython-310.pyc b/Pinpoint/__pycache__/Aggregator_NGram.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c7af2888b584f3aab440e3f20b486d6c394e48d
Binary files /dev/null and b/Pinpoint/__pycache__/Aggregator_NGram.cpython-310.pyc differ
diff --git a/Pinpoint/__pycache__/Aggregator_NGram.cpython-36.pyc b/Pinpoint/__pycache__/Aggregator_NGram.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..10f508865c88f36d22f6d2b0727c119a5d145a78
Binary files /dev/null and b/Pinpoint/__pycache__/Aggregator_NGram.cpython-36.pyc differ
diff --git a/Pinpoint/__pycache__/Aggregator_NGram.cpython-38.pyc b/Pinpoint/__pycache__/Aggregator_NGram.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0061b2c790bf1fc038f1ed9def1e0f2266f04d5f
Binary files /dev/null and b/Pinpoint/__pycache__/Aggregator_NGram.cpython-38.pyc differ
diff --git a/Pinpoint/__pycache__/Aggregator_TfIdf.cpython-310.pyc b/Pinpoint/__pycache__/Aggregator_TfIdf.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7679dcec19f5951772d70044561df2b223950183
Binary files /dev/null and b/Pinpoint/__pycache__/Aggregator_TfIdf.cpython-310.pyc differ
diff --git a/Pinpoint/__pycache__/Aggregator_TfIdf.cpython-36.pyc b/Pinpoint/__pycache__/Aggregator_TfIdf.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b892381dff7bcef36088e40189a03af14c95ec01
Binary files /dev/null and b/Pinpoint/__pycache__/Aggregator_TfIdf.cpython-36.pyc differ
diff --git a/Pinpoint/__pycache__/Aggregator_TfIdf.cpython-38.pyc b/Pinpoint/__pycache__/Aggregator_TfIdf.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7479b88a28ad1ff502542cb247b283338fbebe07
Binary files /dev/null and b/Pinpoint/__pycache__/Aggregator_TfIdf.cpython-38.pyc differ
diff --git a/Pinpoint/__pycache__/Aggregator_Word2Vec.cpython-310.pyc b/Pinpoint/__pycache__/Aggregator_Word2Vec.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8e16a1bb188ec52c17aee2738748cb61c413ef6b
Binary files /dev/null and b/Pinpoint/__pycache__/Aggregator_Word2Vec.cpython-310.pyc differ
diff --git a/Pinpoint/__pycache__/Aggregator_Word2Vec.cpython-36.pyc b/Pinpoint/__pycache__/Aggregator_Word2Vec.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..30310087e9fb886bd2001340026e4e177783269b
Binary files /dev/null and b/Pinpoint/__pycache__/Aggregator_Word2Vec.cpython-36.pyc differ
diff --git a/Pinpoint/__pycache__/Aggregator_Word2Vec.cpython-38.pyc b/Pinpoint/__pycache__/Aggregator_Word2Vec.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1779db9467efb4e3eb64e9094314b62a988d8cfa
Binary files /dev/null and b/Pinpoint/__pycache__/Aggregator_Word2Vec.cpython-38.pyc differ
diff --git a/Pinpoint/__pycache__/Aggregator_WordingChoice.cpython-310.pyc b/Pinpoint/__pycache__/Aggregator_WordingChoice.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dbe6112b8e63cc38377708193ee5b67036cfeb48
Binary files /dev/null and b/Pinpoint/__pycache__/Aggregator_WordingChoice.cpython-310.pyc differ
diff --git a/Pinpoint/__pycache__/Aggregator_WordingChoice.cpython-36.pyc b/Pinpoint/__pycache__/Aggregator_WordingChoice.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..64e6290f6c6a1dcd6c95c0c9ab518c477e99d62b
Binary files /dev/null and b/Pinpoint/__pycache__/Aggregator_WordingChoice.cpython-36.pyc differ
diff --git a/Pinpoint/__pycache__/Aggregator_WordingChoice.cpython-38.pyc b/Pinpoint/__pycache__/Aggregator_WordingChoice.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a412e522c08ebfbab3322127cf1eaf57a4353391
Binary files /dev/null and b/Pinpoint/__pycache__/Aggregator_WordingChoice.cpython-38.pyc differ
diff --git a/Pinpoint/__pycache__/FeatureExtraction.cpython-310.pyc b/Pinpoint/__pycache__/FeatureExtraction.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..691436df9740bb13091dad04543a74548302b01c
Binary files /dev/null and b/Pinpoint/__pycache__/FeatureExtraction.cpython-310.pyc differ
diff --git a/Pinpoint/__pycache__/FeatureExtraction.cpython-36.pyc b/Pinpoint/__pycache__/FeatureExtraction.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d3fd27d80fe255aa2ef46b2cc02e5adc85ee78e
Binary files /dev/null and b/Pinpoint/__pycache__/FeatureExtraction.cpython-36.pyc differ
diff --git a/Pinpoint/__pycache__/FeatureExtraction.cpython-38.pyc b/Pinpoint/__pycache__/FeatureExtraction.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dcd1024554d2da3300c5c263f15d18a38fa21c9d
Binary files /dev/null and b/Pinpoint/__pycache__/FeatureExtraction.cpython-38.pyc differ
diff --git a/Pinpoint/__pycache__/Grapher.cpython-310.pyc b/Pinpoint/__pycache__/Grapher.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c631fb81d965fc74a28c6ec15536f2d5bd8d3d2
Binary files /dev/null and b/Pinpoint/__pycache__/Grapher.cpython-310.pyc differ
diff --git a/Pinpoint/__pycache__/Grapher.cpython-36.pyc b/Pinpoint/__pycache__/Grapher.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..84b6dd42dcb28c0699f8851a59c8dba22f82bc18
Binary files /dev/null and b/Pinpoint/__pycache__/Grapher.cpython-36.pyc differ
diff --git a/Pinpoint/__pycache__/Grapher.cpython-38.pyc b/Pinpoint/__pycache__/Grapher.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6846226a834846e8eaf4cf9e5085eaf0e2c60779
Binary files /dev/null and b/Pinpoint/__pycache__/Grapher.cpython-38.pyc differ
diff --git a/Pinpoint/__pycache__/Logger.cpython-310.pyc b/Pinpoint/__pycache__/Logger.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c589eda5202f5d1ec929e623223e4e88a7e8c3cc
Binary files /dev/null and b/Pinpoint/__pycache__/Logger.cpython-310.pyc differ
diff --git a/Pinpoint/__pycache__/Logger.cpython-36.pyc b/Pinpoint/__pycache__/Logger.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eb4d37f0c6a1c6a7ce82f38d5333f340032bec92
Binary files /dev/null and b/Pinpoint/__pycache__/Logger.cpython-36.pyc differ
diff --git a/Pinpoint/__pycache__/Logger.cpython-38.pyc b/Pinpoint/__pycache__/Logger.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..29c9d78a2f2710c3f4bf543d37390942aecd65fe
Binary files /dev/null and b/Pinpoint/__pycache__/Logger.cpython-38.pyc differ
diff --git a/Pinpoint/__pycache__/RandomForest.cpython-310.pyc b/Pinpoint/__pycache__/RandomForest.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f02689ad8057b27564c906440acdd54c4e5fdf78
Binary files /dev/null and b/Pinpoint/__pycache__/RandomForest.cpython-310.pyc differ
diff --git a/Pinpoint/__pycache__/RandomForest.cpython-36.pyc b/Pinpoint/__pycache__/RandomForest.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee31a7f6ff555aa0eacd9fa1adfcc1d7694fd22a
Binary files /dev/null and b/Pinpoint/__pycache__/RandomForest.cpython-36.pyc differ
diff --git a/Pinpoint/__pycache__/RandomForest.cpython-38.pyc b/Pinpoint/__pycache__/RandomForest.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..72aee730e1c2fbe27ff190bb4b5baa2edaadb9a3
Binary files /dev/null and b/Pinpoint/__pycache__/RandomForest.cpython-38.pyc differ
diff --git a/Pinpoint/__pycache__/Sanitizer.cpython-310.pyc b/Pinpoint/__pycache__/Sanitizer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..47c410cde2b88569a553199231db0b2aaa6c886d
Binary files /dev/null and b/Pinpoint/__pycache__/Sanitizer.cpython-310.pyc differ
diff --git a/Pinpoint/__pycache__/Sanitizer.cpython-36.pyc b/Pinpoint/__pycache__/Sanitizer.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3119e4308327a74a0a0cacc68608705b15233ebc
Binary files /dev/null and b/Pinpoint/__pycache__/Sanitizer.cpython-36.pyc differ
diff --git a/Pinpoint/__pycache__/Sanitizer.cpython-38.pyc b/Pinpoint/__pycache__/Sanitizer.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f62960471610e429bf19baa3cbcf6c372adc03f6
Binary files /dev/null and b/Pinpoint/__pycache__/Sanitizer.cpython-38.pyc differ
diff --git a/Pinpoint/__pycache__/predictor.cpython-38.pyc b/Pinpoint/__pycache__/predictor.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e9684ee78eacfbfcdfeaafd22e96a4b06651435
Binary files /dev/null and b/Pinpoint/__pycache__/predictor.cpython-38.pyc differ
diff --git a/Pinpoint/far-right-core.py b/Pinpoint/far-right-core.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccb1a66057116a349569f32d80fc8b71c89938e0
--- /dev/null
+++ b/Pinpoint/far-right-core.py
@@ -0,0 +1,65 @@
+"""
+Example of training a model using this package.
+"""
+
+from Pinpoint.FeatureExtraction import *
+from Pinpoint.RandomForest import *
+
+# Performs feature extraction from the provided Extremist, Counterpoise, and Baseline datasets.
+extractor = feature_extraction(violent_words_dataset_location=r"datasets/swears",
+ baseline_training_dataset_location=r"datasets/far-right/LIWC2015 Results (Storm_Front_Posts).csv")
+
+extractor.MAX_RECORD_SIZE = 50000
+
+extractor.dump_training_data_features(
+ feature_file_path_to_save_to=r"outputs/training_features.json",
+ extremist_data_location=r"datasets/far-right/LIWC2015 Results (extreamist-messages.csv).csv",
+ baseline_data_location=r"datasets/far-right/LIWC2015 Results (non-extreamist-messages.csv).csv")
+
+# Trains a model off the features file created in the previous stage
+model = random_forest()
+
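+# First run: all three feature categories enabled (radical language, behavioural, and psychological signals).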
+model.RADICAL_LANGUAGE_ENABLED = True
+model.BEHAVIOURAL_FEATURES_ENABLED = True
+model.PSYCHOLOGICAL_SIGNALS_ENABLED = True
+
+model.train_model(features_file=r"outputs/training_features.json",
+ force_new_dataset=True, model_location=r"outputs/far-right-radical-language.model")
+
+model.create_model_info_output_file(location_of_output_file="outputs/far-right-radical-language-output.txt",
+ training_data_csv_location=r"outputs/training_features.json.csv")
+
+#############################################################################################
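+# Second run: behavioural features only.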
+model.RADICAL_LANGUAGE_ENABLED = False
+model.BEHAVIOURAL_FEATURES_ENABLED = True
+model.PSYCHOLOGICAL_SIGNALS_ENABLED = False
+
+model.train_model(features_file=r"outputs/training_features.json",
+ force_new_dataset=True, model_location=r"outputs/far-right-behavioural.model")
+
+model.create_model_info_output_file(location_of_output_file="outputs/far-right-behavioural-output.txt",
+ training_data_csv_location=r"outputs/training_features.json.csv")
+
+############################################################################
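+# Third run: psychological signals only.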
+model.RADICAL_LANGUAGE_ENABLED = False
+model.BEHAVIOURAL_FEATURES_ENABLED = False
+model.PSYCHOLOGICAL_SIGNALS_ENABLED = True
+
+model.train_model(features_file=r"outputs/training_features.json",
+ force_new_dataset=True, model_location=r"outputs/far-right-psychological.model")
+
+model.create_model_info_output_file(location_of_output_file="outputs/far-right-psychological-output.txt",
+ training_data_csv_location=r"outputs/training_features.json.csv")
+
+##############################################################################################
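+# Fourth run: radical language features only.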
+model.RADICAL_LANGUAGE_ENABLED = True
+model.BEHAVIOURAL_FEATURES_ENABLED = False
+model.PSYCHOLOGICAL_SIGNALS_ENABLED = False
+
+model.train_model(features_file=r"outputs/training_features.json",
+ force_new_dataset=True, model_location=r"outputs/far-right-baseline.model")
+
+model.create_model_info_output_file(location_of_output_file="outputs/far-right-baseline-output.txt",
+ training_data_csv_location=r"outputs/training_features.json.csv")
+
+print("Finished")
\ No newline at end of file
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6c1641e726923a6bc2cce43bab4e96127d43c2c
--- /dev/null
+++ b/app.py
@@ -0,0 +1,356 @@
+#!/usr/bin/env python
+# coding: utf-8
+import json
+import os
+import re
+import time
+from random import random
+import socket
+
+from threading import Thread
+from time import sleep
+
+test_html = '''
+
+WATCH
+
+WatchTower identifies, blocks, and filters out violent and radical content before it reaches your Twitter feed.
+WatchTower works to protect you from violent content, misinformation, hate speech, and other malicious communication by using a suite of machine learning models to identify user accounts that commonly post content falling into these categories. WatchTower is broken down into two components: the first utilises the Twitter streaming API and applies a suite of machine learning models to identify users that commonly post malicious information, while the second provides a web UI where users can authenticate with Twitter and tailor the types and thresholds of the accounts they block.
+WatchTower was developed solely by James Stevenson and primarily uses Pinpoint, a machine learning model also developed by James. The future roadmap sees WatchTower incorporate other models for identifying content such as misinformation and hate speech. More on Pinpoint and the model WatchTower uses to identify violent extremism can be seen below.
+Model Accuracy:
+Machine learning models can be validated using several statistics. These statistics for Pinpoint, the main ML model used by WatchTower, can be seen below.
+Accuracy
+Recall
+Precision
+F-Measure
+WatchTower was developed for the Chirp 2022 Twitter API Developer Challenge
+WatchTower was developed solely by James Stevenson for the Chirp 2022 Twitter API Developer Challenge. More information on this can be found below.
+