Spaces:
Runtime error
Runtime error
James Stevenson
commited on
Commit
•
32a03a4
1
Parent(s):
b264f40
initial commit
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- Pinpoint/Aggregator_NGram.py +103 -0
- Pinpoint/Aggregator_TfIdf.py +41 -0
- Pinpoint/Aggregator_Word2Vec.py +32 -0
- Pinpoint/Aggregator_WordingChoice.py +51 -0
- Pinpoint/ConfigManager.py +21 -0
- Pinpoint/FeatureExtraction.py +795 -0
- Pinpoint/Grapher.py +60 -0
- Pinpoint/Logger.py +21 -0
- Pinpoint/RandomForest.py +374 -0
- Pinpoint/Sanitizer.py +131 -0
- Pinpoint/Serializer.py +20 -0
- Pinpoint/__pycache__/Aggregator_NGram.cpython-310.pyc +0 -0
- Pinpoint/__pycache__/Aggregator_NGram.cpython-36.pyc +0 -0
- Pinpoint/__pycache__/Aggregator_NGram.cpython-38.pyc +0 -0
- Pinpoint/__pycache__/Aggregator_TfIdf.cpython-310.pyc +0 -0
- Pinpoint/__pycache__/Aggregator_TfIdf.cpython-36.pyc +0 -0
- Pinpoint/__pycache__/Aggregator_TfIdf.cpython-38.pyc +0 -0
- Pinpoint/__pycache__/Aggregator_Word2Vec.cpython-310.pyc +0 -0
- Pinpoint/__pycache__/Aggregator_Word2Vec.cpython-36.pyc +0 -0
- Pinpoint/__pycache__/Aggregator_Word2Vec.cpython-38.pyc +0 -0
- Pinpoint/__pycache__/Aggregator_WordingChoice.cpython-310.pyc +0 -0
- Pinpoint/__pycache__/Aggregator_WordingChoice.cpython-36.pyc +0 -0
- Pinpoint/__pycache__/Aggregator_WordingChoice.cpython-38.pyc +0 -0
- Pinpoint/__pycache__/FeatureExtraction.cpython-310.pyc +0 -0
- Pinpoint/__pycache__/FeatureExtraction.cpython-36.pyc +0 -0
- Pinpoint/__pycache__/FeatureExtraction.cpython-38.pyc +0 -0
- Pinpoint/__pycache__/Grapher.cpython-310.pyc +0 -0
- Pinpoint/__pycache__/Grapher.cpython-36.pyc +0 -0
- Pinpoint/__pycache__/Grapher.cpython-38.pyc +0 -0
- Pinpoint/__pycache__/Logger.cpython-310.pyc +0 -0
- Pinpoint/__pycache__/Logger.cpython-36.pyc +0 -0
- Pinpoint/__pycache__/Logger.cpython-38.pyc +0 -0
- Pinpoint/__pycache__/RandomForest.cpython-310.pyc +0 -0
- Pinpoint/__pycache__/RandomForest.cpython-36.pyc +0 -0
- Pinpoint/__pycache__/RandomForest.cpython-38.pyc +0 -0
- Pinpoint/__pycache__/Sanitizer.cpython-310.pyc +0 -0
- Pinpoint/__pycache__/Sanitizer.cpython-36.pyc +0 -0
- Pinpoint/__pycache__/Sanitizer.cpython-38.pyc +0 -0
- Pinpoint/__pycache__/predictor.cpython-38.pyc +0 -0
- Pinpoint/far-right-core.py +65 -0
- app.py +356 -0
- outputs/sanitized_text.txt +0 -0
- outputs/users.json +1 -0
- predictor.py +78 -0
- python-streamer.py +173 -0
- sign-in.png +0 -0
- swears/VIOLENT_TERRORIST_WORDS.txt +1 -0
- swears/bad_Words_list.txt +547 -0
- swears/badwords.txt +451 -0
- swears/cmu-bad-words.txt +1383 -0
Pinpoint/Aggregator_NGram.py
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
2 |
+
|
3 |
+
from Pinpoint.Logger import *
|
4 |
+
|
5 |
+
c_vec = CountVectorizer(ngram_range=(1, 5))
|
6 |
+
|
7 |
+
|
8 |
+
class n_gram_aggregator():
    """
    Retrieves the most common n-grams (uni, bi, and tri grams) for a given
    dataset corpus using the module-level CountVectorizer (c_vec).
    """

    def _get_average_ngram_count(self, n_grams_dict):
        """
        Takes a dict of n-grams mapped to their counts and returns the mean count.

        :param n_grams_dict: dict mapping n-gram text -> occurrence count
        :return: the average count, or 0 if the dict is empty
        """
        # Guard against an empty dict (e.g. a short corpus that produced no
        # tri-grams), which would otherwise raise ZeroDivisionError below.
        if not n_grams_dict:
            return 0

        counts = list(n_grams_dict.values())
        return sum(counts) / len(counts)

    def _get_all_ngrams(self, data):
        """
        Returns all n-grams (tri, bi, and uni) for a given piece of text.

        :param data: a string, or a list of strings, to extract n-grams from
        :return: (uni_grams, bi_grams, tri_grams) dicts of n-gram text -> count
        """

        if type(data) is not list:
            data = [data]

        # input to fit_transform() should be an iterable with strings
        ngrams = c_vec.fit_transform(data)

        # needs to happen after fit_transform()
        vocab = c_vec.vocabulary_

        count_values = ngrams.toarray().sum(axis=0)

        # output n-grams
        uni_grams = {}
        bi_grams = {}
        tri_grams = {}

        # NOTE(review): c_vec is configured with ngram_range=(1, 5) but only
        # 1-3 word grams are kept here; 4/5-grams are computed then discarded.
        for ng_count, ng_text in sorted([(count_values[i], k) for k, i in vocab.items()], reverse=True):
            sentence_length = len(ng_text.split(" "))

            if sentence_length == 3:
                tri_grams[ng_text] = ng_count
            elif sentence_length == 2:
                bi_grams[ng_text] = ng_count
            elif sentence_length == 1:
                uni_grams[ng_text] = ng_count

        return uni_grams, bi_grams, tri_grams

    def _get_popular_ngrams(self, ngrams_dict):
        """
        Returns the n-grams whose count is at or above the average count.

        :param ngrams_dict: dict mapping n-gram text -> occurrence count
        :return: dict of the popular n-grams and their counts
        """
        average_count = self._get_average_ngram_count(ngrams_dict)

        popular_ngrams = {}
        for n_gram, ng_count in ngrams_dict.items():
            if ng_count >= average_count:
                popular_ngrams[n_gram] = ng_count
        return popular_ngrams

    def get_ngrams(self, data=None, file_name_to_read=None):
        """
        Wrapper for returning the uni, bi, and tri grams that are the most
        popular (i.e. their count is above the average) in a given piece of text.

        :param data: the corpus text; if None, file_name_to_read is used instead
        :param file_name_to_read: path of a file to read the corpus from
        :return: (popular_uni_grams, popular_bi_grams, popular_tri_grams) lists
        :raises Exception: if neither data nor file_name_to_read is supplied
        """
        logger().print_message("Getting Ngrams")

        if data is None and file_name_to_read is None:
            raise Exception("No data supplied to retrieve n_grams")

        if data is None and file_name_to_read is not None:
            with open(file_name_to_read, 'r') as file_to_read:
                data = file_to_read.read()

        uni_grams, bi_grams, tri_grams = self._get_all_ngrams(data)

        popular_uni_grams = list(self._get_popular_ngrams(uni_grams).keys())
        popular_bi_grams = list(self._get_popular_ngrams(bi_grams).keys())
        popular_tri_grams = list(self._get_popular_ngrams(tri_grams).keys())

        return popular_uni_grams, popular_bi_grams, popular_tri_grams
|
Pinpoint/Aggregator_TfIdf.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
2 |
+
|
3 |
+
from Pinpoint.Logger import *
|
4 |
+
|
5 |
+
|
6 |
+
class tf_idf_aggregator():
    """
    A wrapper class around scikit-learn for retrieving TF-IDF scores.
    """

    def get_tf_idf_scores(self, ngrams_vocabulary, corpus_data=None, file_name_to_read=None):
        """
        Generates TF-IDF scores based on a vocabulary of n-grams and a data corpus.

        :param ngrams_vocabulary: the n-gram vocabulary handed to TfidfVectorizer
        :param corpus_data: the corpus text; if None, file_name_to_read is used
        :param file_name_to_read: path of a file to read the corpus from
        :return: a dictionary mapping feature name -> TF-IDF score
        :raises Exception: if neither corpus_data nor file_name_to_read is supplied
        """
        logger.print_message("Getting TF IDF scores")

        if corpus_data is None and file_name_to_read is None:
            raise Exception("No data supplied to retrieve n_grams")

        if corpus_data is None and file_name_to_read is not None:
            with open(file_name_to_read, 'r') as file_to_read:
                corpus_data = file_to_read.read()

        tfidf = TfidfVectorizer(vocabulary=ngrams_vocabulary, stop_words='english', ngram_range=(1, 2))
        tfs = tfidf.fit_transform([corpus_data])

        # get_feature_names() was removed in scikit-learn 1.2; prefer its
        # replacement get_feature_names_out() when available, falling back so
        # older scikit-learn versions keep working.
        if hasattr(tfidf, "get_feature_names_out"):
            feature_names = tfidf.get_feature_names_out()
        else:
            feature_names = tfidf.get_feature_names()

        # NOTE(review): this is a list of individual *characters* of the
        # corpus, indexed by the sparse-matrix row below — presumably only
        # meaningful for logging; confirm intent.
        corpus_index = [n for n in corpus_data]
        rows, cols = tfs.nonzero()

        dict_of_scores = {}

        for row, col in zip(rows, cols):
            dict_of_scores[feature_names[col]] = tfs[row, col]
            # NOTE(review): print_message is given two arguments here, unlike
            # the single-argument calls elsewhere — confirm against Logger.
            logger.print_message((feature_names[col], corpus_index[row]), tfs[row, col])

        return dict_of_scores
|
Pinpoint/Aggregator_Word2Vec.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from gensim.models import Word2Vec
|
2 |
+
|
3 |
+
|
4 |
+
class word_2_vec_aggregator():
    """
    A wrapper around gensim used for creating a word2vec model.
    """

    def get_model(self, list_of_sentences):
        """
        Builds and trains a Word2Vec model from the supplied sentences.

        :param list_of_sentences: iterable of space-separated sentence strings
        :return: the trained gensim Word2Vec model
        """

        # Tokenise every multi-word sentence; single words ("unigrams") are
        # skipped since they carry no surrounding context for the window.
        tokenised_sentences = [
            sentence.split(" ")
            for sentence in list_of_sentences
            if " " in sentence
        ]

        model = Word2Vec(min_count=1, window=5)  # vector size of 100 and window size of 5?
        model.build_vocab(tokenised_sentences)  # prepare the model vocabulary
        model.model_trimmed_post_training = False
        model.train(tokenised_sentences, total_examples=model.corpus_count,
                    epochs=model.epochs)  # train word vectors

        return model
|
Pinpoint/Aggregator_WordingChoice.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
|
4 |
+
class wording_choice_aggregator():
    """
    A class used for retrieving frequencies based on wording in a message.
    """

    def get_frequency_of_capatalised_words(self, text):
        """
        Retrieves the frequency of fully-capitalised words in a message.

        :param text: the message to analyse
        :return: capitalised words as a fraction of all space-separated words
        """
        words = text.split(" ")

        # str.isupper() is False for tokens with no cased characters, so
        # numbers / punctuation-only tokens are not counted as capitalised.
        number_of_capatalised_words = sum(1 for word in words if word.isupper())

        # len(words) is always >= 1 ("".split(" ") == ['']), so no div-by-zero.
        return number_of_capatalised_words / len(words)

    def get_frequency_of_violent_or_curse_words(self, text, violent_words_datasets_location):
        """
        Retrieves the frequency of violent/curse words in a message.

        :param text: the message to analyse
        :param violent_words_datasets_location: directory containing files of
            newline-separated violent/curse words (absolute, or relative to cwd)
        :return: violent/curse words as a fraction of all space-separated words
        """

        # os.path.join returns the second argument unchanged if it is absolute.
        dataset_folder = os.path.join(os.getcwd(), violent_words_datasets_location)

        # A set gives O(1) membership tests in the counting loop below; the
        # original list made each word lookup O(n) over the whole corpus.
        list_of_violent_or_curse_words = set()

        # Retrieves all words in all of the files in the violent or curse word datasets
        for filename in os.listdir(dataset_folder):
            with open(os.path.join(dataset_folder, filename), 'r') as file:
                for line in file.readlines():
                    # strip() already removes the trailing newline; commas are
                    # dropped so "word," entries match plain tokens.
                    list_of_violent_or_curse_words.add(line.strip().replace(",", ""))

        words = text.split(" ")
        number_of_swear_words = sum(1 for word in words if word in list_of_violent_or_curse_words)

        return number_of_swear_words / len(words)
|
Pinpoint/ConfigManager.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
from pathlib import Path
|
3 |
+
|
4 |
+
|
5 |
+
class ConfigManager:
    """
    A wrapper class used to abstract Twitter config options.
    """

    @staticmethod
    def _get_config(config_path):
        """
        Loads and parses a JSON config file.

        :param config_path: path of the JSON config file to read
        :return: the parsed config as a dictionary
        :raises Exception: if the file does not exist
        """
        # `not ...` instead of the unidiomatic `== False` comparison.
        if not Path(config_path).is_file():
            raise Exception("The {} config file was not found.".format(config_path))

        with open(config_path) as json_file:
            twitter_config_dict = json.load(json_file)

        return twitter_config_dict

    @staticmethod
    def getTwitterConfig():
        """
        Returns the Twitter configuration read from twitterConfig.json in the
        current working directory.
        """
        return ConfigManager._get_config("twitterConfig.json")
|
Pinpoint/FeatureExtraction.py
ADDED
@@ -0,0 +1,795 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import ast
|
2 |
+
import base64
|
3 |
+
import codecs
|
4 |
+
import csv
|
5 |
+
import gc
|
6 |
+
import json
|
7 |
+
import os
|
8 |
+
import pickle
|
9 |
+
import re
|
10 |
+
import shutil
|
11 |
+
import time
|
12 |
+
|
13 |
+
import numpy
|
14 |
+
import pandas as pd
|
15 |
+
import uuid
|
16 |
+
from scipy.spatial import distance
|
17 |
+
|
18 |
+
from Pinpoint.Aggregator_NGram import n_gram_aggregator
|
19 |
+
from Pinpoint.Aggregator_TfIdf import tf_idf_aggregator
|
20 |
+
from Pinpoint.Aggregator_Word2Vec import word_2_vec_aggregator
|
21 |
+
from Pinpoint.Aggregator_WordingChoice import wording_choice_aggregator
|
22 |
+
from Pinpoint.Grapher import grapher
|
23 |
+
from Pinpoint.Logger import logger
|
24 |
+
from Pinpoint.Sanitizer import sanitization, sys
|
25 |
+
|
26 |
+
|
27 |
+
class feature_extraction():
|
28 |
+
"""
|
29 |
+
This class is used to wrap the functionality of aggregating tweets from CSV files and extracting features pertinent
|
30 |
+
to building a random forest extremist classifier.
|
31 |
+
"""
|
32 |
+
|
33 |
+
# A graph used to store connections between aggregated users
|
34 |
+
graph = grapher()
|
35 |
+
archived_graphs = [] # an archive of the previous graphs
|
36 |
+
# A list storing dictionaries of user ids and their features.
|
37 |
+
tweet_user_features = []
|
38 |
+
completed_tweet_user_features = [] # has centrality added
|
39 |
+
# the global TF IDF model used for the Word 2 Vec model
|
40 |
+
saved_tf_idf_model = None
|
41 |
+
# A dictionary used for the translation of actual Twitter username to UUID
|
42 |
+
dict_of_users = {}
|
43 |
+
|
44 |
+
# The max size for all data entries (i.e. baseline tweets)
|
45 |
+
MAX_RECORD_SIZE = sys.maxsize # 3050
|
46 |
+
|
47 |
+
# Datasets for training
|
48 |
+
violent_words_dataset_location = None
|
49 |
+
tf_idf_training_dataset_location = None
|
50 |
+
outputs_location = None
|
51 |
+
|
52 |
+
# Used for knowing which columns to access data from. For Twitter data.
|
53 |
+
# Summary variables
|
54 |
+
DEFAULT_USERNAME_COLUMN_ID = 0
|
55 |
+
DEFAULT_DATE_COLUMN_ID = 1
|
56 |
+
DEFAULT_MESSAGE_COLUMN_ID = 2
|
57 |
+
DEFAULT_ANALYTIC_COLUMN_ID = 4
|
58 |
+
DEFAULT_CLOUT_COLUMN_ID = 5
|
59 |
+
DEFAULT_AUTHENTIC_COLUMN_ID = 6
|
60 |
+
DEFAULT_TONE_COLUMN_ID = 7
|
61 |
+
# Emotional Analysis
|
62 |
+
DEFAULT_ANGER_COLUMN_ID = 36
|
63 |
+
DEFAULT_SADNESS_COLUMN_ID = 37
|
64 |
+
DEFAULT_ANXIETY_COLUMN_ID = 35
|
65 |
+
# Personal Drives:
|
66 |
+
DEFAULT_POWER_COLUMN_ID = 62
|
67 |
+
DEFAULT_REWARD_COLUMN_ID = 63
|
68 |
+
DEFAULT_RISK_COLUMN_ID = 64
|
69 |
+
DEFAULT_ACHIEVEMENT_COLUMN_ID = 61
|
70 |
+
DEFAULT_AFFILIATION_COLUMN_ID = 60
|
71 |
+
# Personal pronouns
|
72 |
+
DEFAULT_P_PRONOUN_COLUMN_ID = 13
|
73 |
+
DEFAULT_I_PRONOUN_COLUMN_ID = 19
|
74 |
+
|
75 |
+
# Constants for the fields in the baseline data set (i.e. ISIS magazine/ Stormfront, etc)
|
76 |
+
DEFAULT_BASELINE_MESSAGE_COLUMN_ID = 5
|
77 |
+
# Summary variables
|
78 |
+
DEFAULT_BASELINE_CLOUT_COLUMN_ID = 10
|
79 |
+
DEFAULT_BASELINE_ANALYTIC_COLUMN_ID = 9
|
80 |
+
DEFAULT_BASELINE_TONE_COLUMN_ID = 12
|
81 |
+
DEFAULT_BASELINE_AUTHENTIC_COLUMN_ID = 11
|
82 |
+
# Emotional Analysis
|
83 |
+
DEFAULT_BASELINE_ANGER_COLUMN_ID = 41
|
84 |
+
DEFAULT_BASELINE_SADNESS_COLUMN_ID = 42
|
85 |
+
DEFAULT_BASELINE_ANXIETY_COLUMN_ID = 40
|
86 |
+
# Personal Drives
|
87 |
+
DEFAULT_BASELINE_POWER_COLUMN_ID = 67
|
88 |
+
DEFAULT_BASELINE_REWARD_COLUMN_ID = 68
|
89 |
+
DEFAULT_BASELINE_RISK_COLUMN_ID = 69
|
90 |
+
DEFAULT_BASELINE_ACHIEVEMENT_COLUMN_ID = 66
|
91 |
+
DEFAULT_BASELINE_AFFILIATION_COLUMN_ID = 65
|
92 |
+
# Personal pronouns
|
93 |
+
DEFAULT_BASELINE_P_PRONOUN_COLUMN_ID = 18
|
94 |
+
DEFAULT_BASELINE_I_PRONOUN_COLUMN_ID = 24
|
95 |
+
|
96 |
+
# Used for Minkowski distance
|
97 |
+
_average_clout = 0
|
98 |
+
_average_analytic = 0
|
99 |
+
_average_tone = 0
|
100 |
+
_average_authentic = 0
|
101 |
+
_average_anger = 0
|
102 |
+
_average_sadness = 0
|
103 |
+
average_anxiety = 0
|
104 |
+
average_power = 0
|
105 |
+
average_reward = 0
|
106 |
+
average_risk = 0
|
107 |
+
average_achievement = 0
|
108 |
+
average_affiliation = 0
|
109 |
+
average_p_pronoun = 0
|
110 |
+
average_i_pronoun = 0
|
111 |
+
|
112 |
+
# Used to chache messages to free memory
|
113 |
+
MESSAGE_TMP_CACHE_LOCATION = "message_cache"
|
114 |
+
|
115 |
+
def __init__(self, violent_words_dataset_location=None
|
116 |
+
, baseline_training_dataset_location=None,
|
117 |
+
outputs_location=r"outputs"):
|
118 |
+
"""
|
119 |
+
Constructor
|
120 |
+
|
121 |
+
The feature_extraction() class can be initialised with violent_words_dataset_location,
|
122 |
+
tf_idf_training_dataset_location, and outputs_location locations. All files in the violent_words_dataset_location
|
123 |
+
will be read (one line at a time) and added to the corpus of violent and swear words. The csv file at
|
124 |
+
baseline_training_dataset_location is used to train the TFIDF model and a Minkowski distance score is calculated based on the LIWC scores present.
|
125 |
+
|
126 |
+
If the constant variable need to be changed, do this by setting the member variables.
|
127 |
+
"""
|
128 |
+
|
129 |
+
# Error if datasets not provided
|
130 |
+
if violent_words_dataset_location is None:
|
131 |
+
raise Exception("No Violent Words dir provided. Provide a directory that contains new line seperated "
|
132 |
+
"files where each line is a violent, extremist, etc word")
|
133 |
+
|
134 |
+
if baseline_training_dataset_location is None:
|
135 |
+
raise Exception("No baseline (TF-IDF/ Minkowski) dataset provided. Thus should be a csv file containing "
|
136 |
+
"extremist content and LIWC scores.")
|
137 |
+
|
138 |
+
# Set datasets to member variables
|
139 |
+
self.violent_words_dataset_location = violent_words_dataset_location
|
140 |
+
self.tf_idf_training_dataset_location = baseline_training_dataset_location
|
141 |
+
self.outputs_location = outputs_location
|
142 |
+
|
143 |
+
# Attempt to make the outputs folder if it doesn't exist
|
144 |
+
try:
|
145 |
+
os.makedirs(outputs_location)
|
146 |
+
except:
|
147 |
+
pass
|
148 |
+
|
149 |
+
def _reset_stored_feature_data(self):
|
150 |
+
"""
|
151 |
+
Resets memeber variables from a previous run. Importantly does not reset to TF IDF model.
|
152 |
+
:return:
|
153 |
+
"""
|
154 |
+
|
155 |
+
# A graph used to store connections between aggregated users
|
156 |
+
self.graph = grapher()
|
157 |
+
archived_graphs = [] # an archive of the previous graphs
|
158 |
+
# A list storing dictionaries of user ids and their features.
|
159 |
+
self.tweet_user_features = []
|
160 |
+
self.completed_tweet_user_features = [] # has centrality added
|
161 |
+
# the global TF IDF model used for the Word 2 Vec model
|
162 |
+
self.dict_of_users = {}
|
163 |
+
|
164 |
+
# Used for Minkowski distance
|
165 |
+
self._average_clout = 0
|
166 |
+
self._average_analytic = 0
|
167 |
+
self._average_tone = 0
|
168 |
+
self._average_authentic = 0
|
169 |
+
self._average_anger = 0
|
170 |
+
self._average_sadness = 0
|
171 |
+
self.average_anxiety = 0
|
172 |
+
self.average_power = 0
|
173 |
+
self.average_reward = 0
|
174 |
+
self.average_risk = 0
|
175 |
+
self.average_achievement = 0
|
176 |
+
self.average_affiliation = 0
|
177 |
+
self.average_p_pronoun = 0
|
178 |
+
self.average_i_pronoun = 0
|
179 |
+
|
180 |
+
def _get_unique_id_from_username(self, username):
|
181 |
+
"""
|
182 |
+
A function used to retrieve a UUID based on a twitter username. If a username has been used before the same UUID
|
183 |
+
will be returned as it is stored in a dictionary.
|
184 |
+
:param username:
|
185 |
+
:return: a string representation of a UUID relating to a Twitter username
|
186 |
+
"""
|
187 |
+
|
188 |
+
if username in self.dict_of_users:
|
189 |
+
# username already in dictionary
|
190 |
+
unique_id = self.dict_of_users[username]
|
191 |
+
else:
|
192 |
+
# make new UUID
|
193 |
+
unique_id = uuid.uuid4().hex
|
194 |
+
# stops uuid collisions
|
195 |
+
while unique_id in self.dict_of_users.values():
|
196 |
+
unique_id = uuid.uuid4().hex
|
197 |
+
|
198 |
+
# Add new user id to dictionary
|
199 |
+
self.dict_of_users[username] = unique_id
|
200 |
+
|
201 |
+
# todo it's less efficient writing the whole file every run
|
202 |
+
path = os.path.join(self.outputs_location, "users.json")
|
203 |
+
|
204 |
+
with open(path, 'w') as outfile:
|
205 |
+
json.dump(self.dict_of_users, outfile)
|
206 |
+
|
207 |
+
return unique_id
|
208 |
+
|
209 |
+
def _add_to_graph(self, originating_user_name, message):
|
210 |
+
"""
|
211 |
+
A wrapper function used for adding a node/ connection to the graph.
|
212 |
+
:param originating_user_name: the Twitter username
|
213 |
+
:param message: The Tweet
|
214 |
+
"""
|
215 |
+
|
216 |
+
# Adds node to graph so that if they don't interact with anyone they still have a centrality
|
217 |
+
self.graph.add_node(originating_user_name)
|
218 |
+
|
219 |
+
# Process mentions
|
220 |
+
mentions = re.findall("\@([a-zA-Z\-\_]+)", message)
|
221 |
+
|
222 |
+
# For all mentions in the tweet add them to the graph as a node
|
223 |
+
for mention in mentions:
|
224 |
+
self.graph.add_edge_wrapper(originating_user_name, mention, 1, "mention")
|
225 |
+
|
226 |
+
# process hashtags
|
227 |
+
hashtags = re.findall("\#([a-zA-Z\-\_]+)", message)
|
228 |
+
|
229 |
+
# For all hashtags in the tweet add them to the graph as a node
|
230 |
+
for hashtag in hashtags:
|
231 |
+
self.graph.add_edge_wrapper(originating_user_name, hashtag, 1, "hashtag")
|
232 |
+
|
233 |
+
def _get_capitalised_word_frequency(self, message):
|
234 |
+
"""
|
235 |
+
A wrapper function for returning the frequency of capitalised words in a message.
|
236 |
+
:param message:
|
237 |
+
:return: the frequency of capitalised words in a message.
|
238 |
+
"""
|
239 |
+
return wording_choice_aggregator().get_frequency_of_capatalised_words(
|
240 |
+
message) # NEEDS TO BE DONE before lower case
|
241 |
+
|
242 |
+
def _get_violent_word_frequency(self, message):
|
243 |
+
"""
|
244 |
+
A wrapper function used to retrieve the frequency of violent words in a message.
|
245 |
+
:param message: a string representation of a social media message
|
246 |
+
:return: The frequency of violent words in the message
|
247 |
+
"""
|
248 |
+
return wording_choice_aggregator().get_frequency_of_violent_or_curse_words(message,
|
249 |
+
self.violent_words_dataset_location)
|
250 |
+
|
251 |
+
def _get_tweet_vector(self, message):
|
252 |
+
"""
|
253 |
+
A wrapper function used retrieve the 200 size vector representation (Average and Max vector concatenated)
|
254 |
+
of that message.
|
255 |
+
:param message: a string representation of a message
|
256 |
+
:param tf_idf_model:
|
257 |
+
:return: a 200 size vector of the tweet
|
258 |
+
"""
|
259 |
+
vectors = []
|
260 |
+
tf_idf_model = self._get_tf_idf_model()
|
261 |
+
|
262 |
+
for word in message.split(" "):
|
263 |
+
# todo add back word = sanitization().sanitize(word, self.outputs_location, force_new_data_and_dont_persisit=True)
|
264 |
+
try:
|
265 |
+
vectors.append(tf_idf_model.wv[word])
|
266 |
+
logger().print_message("Word '{}' in vocabulary...".format(word))
|
267 |
+
except KeyError as e:
|
268 |
+
pass
|
269 |
+
logger().print_message(e)
|
270 |
+
logger().print_message("Word '{}' not in vocabulary...".format(word))
|
271 |
+
|
272 |
+
# Lists of the values used to store the max and average vector values
|
273 |
+
max_value_list = []
|
274 |
+
average_value_list = []
|
275 |
+
|
276 |
+
# Check for if at least one word in the message is in the vocabulary of the model
|
277 |
+
final_array_of_vectors = pd.np.zeros(100)
|
278 |
+
if len(vectors) > 0:
|
279 |
+
|
280 |
+
# Loop through the elements in the vectors
|
281 |
+
for iterator in range(vectors[0].size):
|
282 |
+
|
283 |
+
list_of_all_values = []
|
284 |
+
|
285 |
+
# Loop through each vector
|
286 |
+
for vector in vectors:
|
287 |
+
value = vector[iterator]
|
288 |
+
list_of_all_values.append(value)
|
289 |
+
|
290 |
+
average_value = sum(list_of_all_values) / len(list_of_all_values)
|
291 |
+
max_value = max(list_of_all_values)
|
292 |
+
max_value_list.append(max_value)
|
293 |
+
average_value_list.append(average_value)
|
294 |
+
|
295 |
+
final_array_of_vectors = pd.np.append(pd.np.array([max_value_list]), pd.np.array([average_value_list]))
|
296 |
+
|
297 |
+
# Convert array to list
|
298 |
+
list_of_vectors = []
|
299 |
+
for vector in final_array_of_vectors:
|
300 |
+
list_of_vectors.append(vector)
|
301 |
+
|
302 |
+
return list_of_vectors
|
303 |
+
|
304 |
+
def _process_tweet(self, user_name, message, row):
|
305 |
+
"""
|
306 |
+
Wrapper function for taking a username and tweet and extracting the features.
|
307 |
+
:param user_name:
|
308 |
+
:param message:
|
309 |
+
:return: a dictionary of all features from the message
|
310 |
+
"""
|
311 |
+
self._add_to_graph(user_name, message)
|
312 |
+
|
313 |
+
features_dict = {"cap_freq": self._get_capitalised_word_frequency(message),
|
314 |
+
"violent_freq": self._get_violent_word_frequency(message),
|
315 |
+
"message_vector": self._get_tweet_vector(message)}
|
316 |
+
|
317 |
+
|
318 |
+
return features_dict
|
319 |
+
|
320 |
+
def _get_average_liwc_scores_for_baseline_data(self):
|
321 |
+
"""
|
322 |
+
Calculate the LIWC scores for the baseline dataset and the minkowski dataset.
|
323 |
+
"""
|
324 |
+
|
325 |
+
# Checks if the values have already been set this run, if so don't calculate again
|
326 |
+
# TODO what of the edge case where average clout is 0?
|
327 |
+
if self._average_clout == 0:
|
328 |
+
logger.print_message("Opening dataset {} for LIWC feature extraction and Minkowski distance".format(
|
329 |
+
self.tf_idf_training_dataset_location))
|
330 |
+
baseline_data_set_name = self.tf_idf_training_dataset_location
|
331 |
+
|
332 |
+
clout_list = []
|
333 |
+
analytic_list = []
|
334 |
+
tone_list = []
|
335 |
+
authentic_list = []
|
336 |
+
anger_list = []
|
337 |
+
sadness_list = []
|
338 |
+
anxiety_list = []
|
339 |
+
power_list = []
|
340 |
+
reward_list = []
|
341 |
+
risk_list = []
|
342 |
+
achievement_list = []
|
343 |
+
affiliation_list = []
|
344 |
+
p_pronoun_list = []
|
345 |
+
i_pronoun_list = []
|
346 |
+
|
347 |
+
with open(baseline_data_set_name, 'r', encoding='cp1252') as file:
|
348 |
+
reader = csv.reader(file)
|
349 |
+
|
350 |
+
is_header = True
|
351 |
+
for row in reader:
|
352 |
+
|
353 |
+
if is_header:
|
354 |
+
is_header = False
|
355 |
+
continue
|
356 |
+
|
357 |
+
# Try and access columns, if can't then LIWC fields haven't been set and should be set to 0
|
358 |
+
try:
|
359 |
+
clout = row[self.DEFAULT_BASELINE_CLOUT_COLUMN_ID]
|
360 |
+
analytic = row[self.DEFAULT_BASELINE_ANALYTIC_COLUMN_ID]
|
361 |
+
tone = row[self.DEFAULT_BASELINE_TONE_COLUMN_ID]
|
362 |
+
authentic = row[self.DEFAULT_BASELINE_AUTHENTIC_COLUMN_ID]
|
363 |
+
anger = row[self.DEFAULT_BASELINE_ANGER_COLUMN_ID]
|
364 |
+
sadness = row[self.DEFAULT_BASELINE_SADNESS_COLUMN_ID]
|
365 |
+
anxiety = row[self.DEFAULT_BASELINE_ANXIETY_COLUMN_ID]
|
366 |
+
power = row[self.DEFAULT_BASELINE_POWER_COLUMN_ID]
|
367 |
+
reward = row[self.DEFAULT_BASELINE_REWARD_COLUMN_ID]
|
368 |
+
risk = row[self.DEFAULT_BASELINE_RISK_COLUMN_ID]
|
369 |
+
achievement = row[self.DEFAULT_BASELINE_ACHIEVEMENT_COLUMN_ID]
|
370 |
+
affiliation = row[self.DEFAULT_BASELINE_AFFILIATION_COLUMN_ID]
|
371 |
+
p_pronoun = row[self.DEFAULT_BASELINE_P_PRONOUN_COLUMN_ID]
|
372 |
+
i_pronoun = row[self.DEFAULT_BASELINE_I_PRONOUN_COLUMN_ID]
|
373 |
+
except:
|
374 |
+
clout = 0
|
375 |
+
analytic = 0
|
376 |
+
tone = 0
|
377 |
+
authentic = 0
|
378 |
+
anger = 0
|
379 |
+
sadness = 0
|
380 |
+
anxiety = 0
|
381 |
+
power = 0
|
382 |
+
reward = 0
|
383 |
+
risk = 0
|
384 |
+
achievement = 0
|
385 |
+
affiliation = 0
|
386 |
+
p_pronoun = 0
|
387 |
+
i_pronoun = 0
|
388 |
+
|
389 |
+
clout_list.append(float(clout))
|
390 |
+
analytic_list.append(float(analytic))
|
391 |
+
tone_list.append(float(tone))
|
392 |
+
authentic_list.append(float(authentic))
|
393 |
+
anger_list.append(float(anger))
|
394 |
+
sadness_list.append(float(sadness))
|
395 |
+
anxiety_list.append(float(anxiety))
|
396 |
+
power_list.append(float(power))
|
397 |
+
reward_list.append(float(reward))
|
398 |
+
risk_list.append(float(risk))
|
399 |
+
achievement_list.append(float(achievement))
|
400 |
+
affiliation_list.append(float(affiliation))
|
401 |
+
p_pronoun_list.append(float(p_pronoun))
|
402 |
+
i_pronoun_list.append(float(i_pronoun))
|
403 |
+
|
404 |
+
# Get average for variables, used for distance score. These are member variables so that they don't
|
405 |
+
# have to be re-calculated on later runs
|
406 |
+
self._average_clout = sum(clout_list) / len(clout_list)
|
407 |
+
self._average_analytic = sum(analytic_list) / len(analytic_list)
|
408 |
+
self._average_tone = sum(tone_list) / len(tone_list)
|
409 |
+
self._average_authentic = sum(authentic_list) / len(authentic_list)
|
410 |
+
self._average_anger = sum(anger_list) / len(anger_list)
|
411 |
+
self._average_sadness = sum(sadness_list) / len(sadness_list)
|
412 |
+
self.average_anxiety = sum(anxiety_list) / len(anxiety_list)
|
413 |
+
self.average_power = sum(power_list) / len(power_list)
|
414 |
+
self.average_reward = sum(reward_list) / len(reward_list)
|
415 |
+
self.average_risk = sum(risk_list) / len(risk_list)
|
416 |
+
self.average_achievement = sum(achievement_list) / len(achievement_list)
|
417 |
+
self.average_affiliation = sum(affiliation_list) / len(affiliation_list)
|
418 |
+
self.average_p_pronoun = sum(p_pronoun_list) / len(p_pronoun_list)
|
419 |
+
self.average_i_pronoun = sum(i_pronoun_list) / len(i_pronoun_list)
|
420 |
+
|
421 |
+
return [self._average_clout, self._average_analytic, self._average_tone, self._average_authentic,
|
422 |
+
self._average_anger, self._average_sadness, self.average_anxiety,
|
423 |
+
self.average_power, self.average_reward, self.average_risk, self.average_achievement,
|
424 |
+
self.average_affiliation,
|
425 |
+
self.average_p_pronoun, self.average_i_pronoun]
|
426 |
+
|
427 |
+
def _get_tf_idf_model(self):
|
428 |
+
"""
|
429 |
+
A function used to retrieve the TFIDF model trained on the extremist dataset. If the model has already been
|
430 |
+
created then the previously created model will be used.
|
431 |
+
:return: a TF-IDF model
|
432 |
+
"""
|
433 |
+
|
434 |
+
# if already made model, reuse
|
435 |
+
if self.saved_tf_idf_model is None:
|
436 |
+
logger.print_message("Opening dataset {} for TF-IDF".format(self.tf_idf_training_dataset_location))
|
437 |
+
baseline_data_set_name = self.tf_idf_training_dataset_location
|
438 |
+
|
439 |
+
data_set = ""
|
440 |
+
|
441 |
+
with open(baseline_data_set_name, 'r', encoding='cp1252') as file:
|
442 |
+
reader = csv.reader(file)
|
443 |
+
|
444 |
+
is_header = True
|
445 |
+
for row in reader:
|
446 |
+
|
447 |
+
if is_header:
|
448 |
+
is_header = False
|
449 |
+
continue
|
450 |
+
|
451 |
+
# take quote from dataset and add it to dataset
|
452 |
+
message = row[self.DEFAULT_BASELINE_MESSAGE_COLUMN_ID] # data column
|
453 |
+
data_set = data_set + message + "/n"
|
454 |
+
|
455 |
+
# clean data set
|
456 |
+
# todo should we be doing sanitization clean_data = sanitization().sanitize(data_set, self.outputs_location) # if so remove line below
|
457 |
+
clean_data = data_set
|
458 |
+
|
459 |
+
# get ngrams
|
460 |
+
uni_grams, bi_grams, tri_grams = n_gram_aggregator().get_ngrams(clean_data)
|
461 |
+
ngrams = uni_grams + bi_grams + tri_grams
|
462 |
+
|
463 |
+
# todo The TF_IDF most important ngrams arn't being used. Should these be used instead of the other ngrams
|
464 |
+
tf_idf_scores = tf_idf_aggregator().get_tf_idf_scores(ngrams, data_set)
|
465 |
+
number_of_most_important_ngrams = int(len(ngrams) / 2) # number is half all ngrams
|
466 |
+
list_of_most_important_ngrams = sorted(tf_idf_scores, key=tf_idf_scores.get, reverse=True)[
|
467 |
+
:number_of_most_important_ngrams]
|
468 |
+
|
469 |
+
# create a word 2 vec model
|
470 |
+
model = word_2_vec_aggregator().get_model(list_of_sentences=list_of_most_important_ngrams)
|
471 |
+
self.saved_tf_idf_model = model
|
472 |
+
else:
|
473 |
+
model = self.saved_tf_idf_model
|
474 |
+
|
475 |
+
return model
|
476 |
+
|
477 |
+
def open_wrapper(self, location, access_type, list_of_encodings=["utf-8", 'latin-1', 'cp1252']):
|
478 |
+
"""
|
479 |
+
A wrapper around the open built in function that has fallbacks for different encodings.
|
480 |
+
:return:
|
481 |
+
"""
|
482 |
+
|
483 |
+
for encoding in list_of_encodings:
|
484 |
+
try:
|
485 |
+
file = open(location, access_type, encoding=encoding)
|
486 |
+
# Attempt to read file, if fails try other encoding
|
487 |
+
file.readlines()
|
488 |
+
file.seek(0)
|
489 |
+
file.close()
|
490 |
+
file = open(location, access_type, encoding=encoding)
|
491 |
+
return file
|
492 |
+
except LookupError as e:
|
493 |
+
continue
|
494 |
+
except UnicodeDecodeError as e:
|
495 |
+
continue
|
496 |
+
|
497 |
+
raise Exception(
|
498 |
+
"No valid encoding provided for file: '{}'. Encodings provided: '{}'".format(location, list_of_encodings))
|
499 |
+
|
500 |
+
def _add_user_post_db_cache(self, user_id, dict_to_add):
|
501 |
+
"""
|
502 |
+
Used to add data to the post message db cache used to free up memory.
|
503 |
+
"""
|
504 |
+
|
505 |
+
if not os.path.isdir(self.MESSAGE_TMP_CACHE_LOCATION):
|
506 |
+
os.mkdir(self.MESSAGE_TMP_CACHE_LOCATION)
|
507 |
+
|
508 |
+
# Save file as pickle
|
509 |
+
file_name = "{}-{}.pickle".format(user_id,int(time.time()))
|
510 |
+
file_name = os.path.join(self.MESSAGE_TMP_CACHE_LOCATION, file_name)
|
511 |
+
with open(file_name, 'wb') as pickle_handle:
|
512 |
+
pickle.dump({"description":"a temporery file used for saving memory",
|
513 |
+
"data":dict_to_add}, pickle_handle, protocol=pickle.HIGHEST_PROTOCOL)
|
514 |
+
|
515 |
+
def _get_user_post_db_cache(self, file_name):
|
516 |
+
"""
|
517 |
+
Retrieves data from the cache database used to free up memory.
|
518 |
+
"""
|
519 |
+
if not os.path.isdir(self.MESSAGE_TMP_CACHE_LOCATION):
|
520 |
+
raise Exception("Attempted to access temporery cache files before files are created")
|
521 |
+
|
522 |
+
if not os.path.isfile(file_name):
|
523 |
+
raise Exception("Attempted to access cache file {}, however, it does not exist".format(file_name))
|
524 |
+
|
525 |
+
with (open(file_name, "rb")) as openfile:
|
526 |
+
cache_data = pickle.load(openfile)
|
527 |
+
|
528 |
+
return cache_data["data"]
|
529 |
+
|
530 |
+
def _delete_user_post_db_cache(self):
|
531 |
+
try:
|
532 |
+
if os.path.isdir(self.MESSAGE_TMP_CACHE_LOCATION):
|
533 |
+
shutil.rmtree(self.MESSAGE_TMP_CACHE_LOCATION)
|
534 |
+
except:
|
535 |
+
pass
|
536 |
+
|
537 |
+
    def _get_type_of_message_data(self, data_set_location, has_header=True, is_extremist=None):
        """
        Reads a dataset CSV of messages, extracts per-message features (LIWC scores,
        Minkowski distance to the baseline averages, message vector, etc.), caches each
        processed message to disk to bound memory use, then folds graph centrality into
        every cached entry and appends the results to self.completed_tweet_user_features.

        :param data_set_location: path of the dataset CSV to process
        :param has_header: True if the CSV's first row is a header row
        :param is_extremist: label stored with each message (True/False), or None to
            leave entries unlabelled (e.g. when dumping features for prediction)
        """
        # Ensure all temp files are deleted
        self._delete_user_post_db_cache()

        # Counts the total rows in the CSV. Used for progress reporting.
        print("Starting entity count. Will count '{}'".format(self.MAX_RECORD_SIZE))

        # Read one entry at a time
        max_chunksize = 1
        row_count = 0

        # NOTE(review): iterating pd.read_csv(..., iterator=True) yields chunks, not
        # individual rows — confirm this loop counts what is intended.
        for row in pd.read_csv(data_set_location, iterator=True, encoding='latin-1'):

            row_count = row_count + 1

            # Cap the number of records considered per dataset.
            if row_count >= self.MAX_RECORD_SIZE:
                break

        print("Finished entity count. Count is: '{}'".format(row_count))
        print("")
        # Loops through all rows in the dataset CSV file.
        current_processed_rows = 0
        is_header = False

        for row in pd.read_csv(data_set_location, iterator=True, encoding='latin-1'):
            # NOTE(review): .columns yields the chunk's column labels, not cell
            # values — verify this access pattern is behaving as intended.
            row = row.columns
            # Makes sure same number for each dataset
            if current_processed_rows > row_count:
                break

            # Skips the first entry, as it's the CSV header
            # NOTE(review): is_header is initialised to False above, so this branch
            # never fires — TODO confirm whether it should start as True.
            if has_header and is_header:
                is_header = False
                continue

            # Retrieve username
            try:
                username = row[self.DEFAULT_USERNAME_COLUMN_ID]
                date = row[self.DEFAULT_DATE_COLUMN_ID]
                user_unique_id = self._get_unique_id_from_username(username)
            except:
                # if empty entry
                continue
            # Attempt to get LIWC scores from csv, if not present return 0's
            try:
                # Summary variables
                clout = float(row[self.DEFAULT_CLOUT_COLUMN_ID])
                analytic = float(row[self.DEFAULT_ANALYTIC_COLUMN_ID])
                tone = float(row[self.DEFAULT_TONE_COLUMN_ID])
                authentic = float(row[self.DEFAULT_AUTHENTIC_COLUMN_ID])
                # Emotional Analysis
                anger = float(row[self.DEFAULT_ANGER_COLUMN_ID])
                sadness = float(row[self.DEFAULT_SADNESS_COLUMN_ID])
                anxiety = float(row[self.DEFAULT_ANXIETY_COLUMN_ID])
                # Personal Drives:
                power = float(row[self.DEFAULT_POWER_COLUMN_ID])
                reward = float(row[self.DEFAULT_REWARD_COLUMN_ID])
                risk = float(row[self.DEFAULT_RISK_COLUMN_ID])
                achievement = float(row[self.DEFAULT_ACHIEVEMENT_COLUMN_ID])
                affiliation = float(row[self.DEFAULT_AFFILIATION_COLUMN_ID])
                # Personal pronouns
                i_pronoun = float(row[self.DEFAULT_I_PRONOUN_COLUMN_ID])
                p_pronoun = float(row[self.DEFAULT_P_PRONOUN_COLUMN_ID])

            except:
                # LIWC columns missing or unparsable: fall back to all-zero scores.
                # Summary variables
                clout = 0
                analytic = 0
                tone = 0
                authentic = 0
                # Emotional Analysis
                anger = 0
                sadness = 0
                anxiety = 0
                # Personal Drives:
                power = 0
                reward = 0
                risk = 0
                achievement = 0
                affiliation = 0
                # Personal pronouns
                i_pronoun = 0
                p_pronoun = 0

        # All LIWC scores for this message, merged into the feature dict later.
        liwc_dict = {
            "clout": clout,
            "analytic": analytic,
            "tone": tone,
            "authentic": authentic,
            "anger": anger,
            "sadness": sadness,
            "anxiety": anxiety,
            "power": power,
            "reward": reward,
            "risk": risk,
            "achievement": achievement,
            "affiliation": affiliation,
            "i_pronoun": i_pronoun,
            "p_pronoun": p_pronoun,
        }

        # Calculate minkowski distance (order 1) between this message's LIWC scores
        # and the baseline averages.
        average_row = self._get_average_liwc_scores_for_baseline_data()

        actual_row = [clout, analytic, tone, authentic,
                      anger, sadness, anxiety,
                      power, reward, risk, achievement, affiliation,
                      p_pronoun, i_pronoun
                      ]

        try:
            liwc_dict["minkowski"] = distance.minkowski(actual_row, average_row, 1)
        except ValueError:
            continue

        # Retrieve Tweet for message
        tweet = str(row[self.DEFAULT_MESSAGE_COLUMN_ID])

        # clean/ remove markup in dataset
        sanitised_message = sanitization().sanitize(tweet, self.outputs_location,
                                                    force_new_data_and_dont_persisit=True)

        # If no message skip entry
        if not len(tweet) > 0 or not len(sanitised_message) > 0 or sanitised_message == '' or not len(
                sanitised_message.split(" ")) > 0:
            continue

        # Process Tweet and save as dict
        tweet_dict = self._process_tweet(user_unique_id, tweet, row)

        # If the message vector is not 200 skip (meaning that a blank message was processed)
        if not len(tweet_dict["message_vector"]) == 200:
            continue

        if is_extremist is not None:
            tweet_dict["is_extremist"] = is_extremist

        tweet_dict["date"] = date

        # Merge liwc dict with tweet dict
        tweet_dict = {**tweet_dict, **liwc_dict}

        #tweet_dict["user_unique_id"]= user_unique_id

        # Spill this message to the on-disk cache instead of holding it in memory.
        self._add_user_post_db_cache(user_unique_id, {user_unique_id: tweet_dict})
        #self.tweet_user_features.append()
        # TODO here save to cache json instead of list and graph

        logger().print_message("Added message from user: '{}', from dataset: '{}'. {} rows of {} completed."
                               .format(user_unique_id, data_set_location, current_processed_rows, row_count), 1)
        current_processed_rows = current_processed_rows + 1
        print("Finished reading row")

        # Add the centrality (has to be done after all users are added to graph)
        completed_tweet_user_features = []
        # Loops through each item in the list which represents each message/ tweet

        # Loop through all data in cache file
        for cached_message_file in os.listdir(self.MESSAGE_TMP_CACHE_LOCATION):
            cached_message_file = os.fsdecode(cached_message_file)
            cached_message_file = os.path.join(self.MESSAGE_TMP_CACHE_LOCATION, cached_message_file)

            # Only process pickle files
            if not cached_message_file.endswith(".pickle"):
                continue

            print("Reading cache file: '{}'".format(cached_message_file))
            cached_message_data = self._get_user_post_db_cache(cached_message_file)
            # Loops through the data in that tweet (Should only be one entry per tweet).
            for user_id in cached_message_data.keys():
                updated_entry = {}
                updated_entry[user_id] = cached_message_data[user_id]
                # Adds centrality
                updated_entry[user_id]["centrality"] = self.graph.get_degree_centrality_for_user(user_id)
                logger().print_message(
                    "Added '{}' Centrality for user '{}'".format(updated_entry[user_id]["centrality"], user_id), 1)
                completed_tweet_user_features.append(updated_entry)
                gc.collect()
                break  # Only one entry per list

        # Cache files are no longer needed once centrality has been merged in.
        self._delete_user_post_db_cache()
        self.completed_tweet_user_features = self.completed_tweet_user_features + completed_tweet_user_features
        self.tweet_user_features = []
        #self.archived_graphs.append(self.graph)
        # Reset the interaction graph so the next dataset starts fresh.
        self.graph = grapher()
        print("Finished messages")
|
725 |
+
|
726 |
+
def _get_extremist_data(self, dataset_location):
|
727 |
+
"""
|
728 |
+
This function is responsible for aggregating tweets from the extremist dataset, extracting the features, and
|
729 |
+
saving them to a file for a model to be created.
|
730 |
+
"""
|
731 |
+
|
732 |
+
self._get_type_of_message_data(data_set_location=dataset_location, is_extremist=True)
|
733 |
+
|
734 |
+
def _get_counterpoise_data(self, dataset_location):
|
735 |
+
"""
|
736 |
+
This function is responsible for aggregating tweets from the counterpoise (related to the topic but from
|
737 |
+
legitimate sources, e.g. news outlets) dataset, extracting the features, and saving them to a file for a
|
738 |
+
model to be created.
|
739 |
+
"""
|
740 |
+
|
741 |
+
self._get_type_of_message_data(data_set_location=dataset_location, is_extremist=False)
|
742 |
+
|
743 |
+
def _get_standard_tweets(self, dataset_location):
|
744 |
+
"""
|
745 |
+
This function is responsible for aggregating tweets from the baseline (random sample of twitter posts)
|
746 |
+
dataset, extracting the features, and saving them to a file for a model to be created.
|
747 |
+
"""
|
748 |
+
|
749 |
+
self._get_type_of_message_data(data_set_location=dataset_location, is_extremist=False)
|
750 |
+
|
751 |
+
def dump_features_for_list_of_datasets(self, feature_file_path_to_save_to, list_of_dataset_locations,
|
752 |
+
force_new_dataset=True):
|
753 |
+
"""
|
754 |
+
Saves features representing a provided dataset to a json file. Designed to be used for testing after a
|
755 |
+
model has been created.
|
756 |
+
:param feature_file_path_to_save_to:
|
757 |
+
:param dataset_location:
|
758 |
+
:return:
|
759 |
+
"""
|
760 |
+
|
761 |
+
self._reset_stored_feature_data()
|
762 |
+
|
763 |
+
if force_new_dataset or not os.path.isfile(feature_file_path_to_save_to):
|
764 |
+
for dataset in list_of_dataset_locations:
|
765 |
+
self._get_type_of_message_data(data_set_location=dataset, is_extremist=None)
|
766 |
+
|
767 |
+
with open(feature_file_path_to_save_to, 'w') as outfile:
|
768 |
+
json.dump(self.completed_tweet_user_features, outfile, indent=4)
|
769 |
+
|
770 |
+
else:
|
771 |
+
with open(feature_file_path_to_save_to, 'r') as file:
|
772 |
+
data = file.read()
|
773 |
+
|
774 |
+
# parse file
|
775 |
+
self.completed_tweet_user_features = json.loads(data)
|
776 |
+
|
777 |
+
    def dump_training_data_features(self, feature_file_path_to_save_to, extremist_data_location,
                                    baseline_data_location, force_new_dataset=True):
        """
        The entrypoint function, used to dump all features, for all users in the extremist and baseline
        datasets, to a json file.

        :param feature_file_path_to_save_to: The filepath to save the datasets to
        :param extremist_data_location: path to the extremist dataset CSV
        :param baseline_data_location: path to the baseline dataset CSV
        :param force_new_dataset: if True, features are re-extracted even when the output file already exists
        """

        self._reset_stored_feature_data()

        if force_new_dataset or not os.path.isfile(feature_file_path_to_save_to):
            print("Starting baseline messages")
            self._get_standard_tweets(baseline_data_location)
            print("Starting extremist messages")
            self._get_extremist_data(extremist_data_location)

            with open(feature_file_path_to_save_to, 'w') as outfile:
                json.dump(self.completed_tweet_user_features, outfile, indent=4)
|
Pinpoint/Grapher.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import networkx as nx
|
2 |
+
|
3 |
+
|
4 |
+
class grapher():
    """
    A wrapper class used for generating a graph for interactions between users
    """
    graph = None

    def __init__(self):
        """
        Constructor. Initialises an empty directed graph.
        """
        self.graph = nx.DiGraph()

    def add_edge_wrapper(self, node_1_name, node_2_name, weight, relationship):
        """
        Adds a directed, weighted edge to the graph, implicitly creating either
        endpoint node if it is not already present.
        :param node_1_name: from
        :param node_2_name: to
        :param weight: numeric weight stored on the edge
        :param relationship: label stored under the edge's "relation" attribute
        :return:
        """
        self.graph.add_edge(node_1_name, node_2_name, weight=weight, relation=relationship)

    def add_node(self, node_name):
        """
        Adds a lone node (no edges) to the graph.
        :param node_name:
        """
        self.graph.add_node(node_name)

    def get_info(self):
        """
        Returns a short human-readable summary of the graph.
        :return:
        """
        return nx.info(self.graph)

    def show_graph(self):
        """
        Computes a spring layout for the graph.
        :return:
        """
        nx.spring_layout(self.graph)

    def get_degree_centrality_for_user(self, user_name):
        """
        Returns the Degree of Centrality for a given user present in the graph.
        Note: centrality is recomputed for the whole graph on every call.
        :param user_name:
        :return: the Degree of Centrality for a given user present in the graph
        """
        degree_scores = nx.degree_centrality(self.graph)
        return degree_scores[user_name]

    # todo implement
    # def get_eigenvector_centrality_for_user(self, user_name):
    #     centrality = nx.eigenvector_centrality(self.graph)
    #     return centrality[user_name]
|
Pinpoint/Logger.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datetime import datetime
|
2 |
+
|
3 |
+
|
4 |
+
class logger():
    """
    A wrapper class around the Python print function, used so that messages are only
    printed when debugging is enabled or the message's logging level demands it.
    """
    # When True, every message is printed regardless of its logging level.
    DEBUG = False

    @staticmethod
    def print_message(message, logging_level=0):
        """
        A wrapper function around the Python print function that prints a
        timestamped message only when appropriate.
        :param message: the message to print
        :param logging_level: messages with a level >= 1 are always printed;
            level 0 messages are only printed when logger.DEBUG is True.
        """
        if logging_level >= 1 or logger.DEBUG:
            now = datetime.now()
            current_time = now.strftime("%H:%M:%S")
            print("{} | {}".format(current_time, message))
|
Pinpoint/RandomForest.py
ADDED
@@ -0,0 +1,374 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import csv
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
import pickle
|
5 |
+
from datetime import datetime
|
6 |
+
|
7 |
+
import pandas
|
8 |
+
import pandas as pd
|
9 |
+
from sklearn import metrics
|
10 |
+
from sklearn.ensemble import RandomForestClassifier
|
11 |
+
from sklearn.model_selection import train_test_split
|
12 |
+
|
13 |
+
from Pinpoint import Logger
|
14 |
+
|
15 |
+
|
16 |
+
class random_forest():
    """
    A class used for creating a random forest binary classifier.
    """

    # Trained classifier and its evaluation metrics (populated when a model is
    # trained or loaded).
    model = None
    accuracy = None
    precision = None
    recall = None
    f_measure = None

    # Model variables populated on creation or reading of file

    original_name = None
    creation_date = None

    _FRAMEWORK_VERSION = 0.2  # Used when creating a new model file
    # v0.1 - versioning added.
    # v0.2 - Added more LIWC scores and minkowski distance

    model_version = _FRAMEWORK_VERSION  # can be updated if reading and using a model file of a different version

    # Output / model directories, set by the constructor.
    _outputs_folder = None
    _model_folder = None

    # Categories of features used in the model (feature-group toggles)
    RADICAL_LANGUAGE_ENABLED = True  # RF-IDF Scores, Word Embeddings
    PSYCHOLOGICAL_SIGNALS_ENABLED = True  # LIWC Dictionaries, Minkowski distance
    BEHAVIOURAL_FEATURES_ENABLED = True  # frequency of tweets, followers / following ratio, centrality
|
45 |
+
|
46 |
+
def __init__(self, outputs_folder="outputs", model_folder=None):
|
47 |
+
"""
|
48 |
+
Constructor
|
49 |
+
|
50 |
+
The random_forest() class can be initialised with outputs_folder() and model_folder(). The outputs folder is
|
51 |
+
where output files are stored and the model folder is where the model will be created if not overwritten.
|
52 |
+
"""
|
53 |
+
|
54 |
+
if model_folder is None:
|
55 |
+
model_folder = outputs_folder
|
56 |
+
|
57 |
+
self._outputs_folder = outputs_folder
|
58 |
+
self._model_folder = model_folder
|
59 |
+
|
60 |
+
    def get_features_as_df(self, features_file, force_new_dataset=True):
        """
        Reads a JSON features file and converts it to a Pandas dataframe that can be used to train and test the
        classifier. The dataframe is also persisted as "<features_file>.csv" and re-read from there on later runs.
        :param features_file: the location of the JSON features file to convert to a dataframe
        :param force_new_dataset: if true a new CSV file will be created even if one already exists.
        :return: a Pandas dataframe with the features.
        """

        with open(features_file) as json_features_file:
            csv_file = "{}.csv".format(features_file)

            if force_new_dataset or not os.path.isfile(csv_file):
                features = json.load(json_features_file)

                # todo remove the data for the features not being used.
                filtered_list_after_filters_applied = []

                # If any of the filters are not true remove the features not requested
                column_names = []

                if self.PSYCHOLOGICAL_SIGNALS_ENABLED:
                    column_names = column_names + ["clout", "analytic", "tone", "authentic",
                                                   "anger", "sadness", "anxiety",
                                                   "power", "reward", "risk", "achievement", "affiliation",
                                                   "i_pronoun", "p_pronoun",
                                                   "minkowski"]
                if self.BEHAVIOURAL_FEATURES_ENABLED:
                    column_names = column_names + ['centrality']

                if self.RADICAL_LANGUAGE_ENABLED:
                    # Add column names
                    column_names = column_names + ["cap_freq", "violent_freq"]
                    # Add the two hundred vectors columns
                    for iterator in range(1, 201):
                        column_names.append("message_vector_{}".format(iterator))

                column_names = column_names + ['is_extremist']

                # NOTE(review): the final "or self.RADICAL_LANGUAGE_ENABLED" is not
                # negated like the other two terms — confirm whether "not" is missing.
                if not self.BEHAVIOURAL_FEATURES_ENABLED or not self.PSYCHOLOGICAL_SIGNALS_ENABLED or self.RADICAL_LANGUAGE_ENABLED:

                    # Loops through list of dicts (messages)
                    number_of_processed_messages = 0
                    for message in features:
                        number_of_processed_messages = number_of_processed_messages + 1
                        Logger.logger.print_message(
                            "Extracting information from message {} of {} in file {}".format(
                                number_of_processed_messages,
                                len(features),
                                features_file),
                            logging_level=1)

                        # Loops through dict keys (usernames)
                        for user in message.keys():

                            message_features = message[user]

                            feature_dict = {}

                            if self.PSYCHOLOGICAL_SIGNALS_ENABLED:
                                # Summary variables
                                feature_dict["clout"] = message_features["clout"]
                                feature_dict["analytic"] = message_features["analytic"]
                                feature_dict["tone"] = message_features["tone"]
                                feature_dict["authentic"] = message_features["authentic"]

                                # Emotional Analysis
                                feature_dict["anger"] = message_features["anger"]
                                feature_dict["sadness"] = message_features["sadness"]
                                feature_dict["anxiety"] = message_features["anxiety"]

                                # Personal Drives
                                feature_dict["power"] = message_features["power"]
                                feature_dict["reward"] = message_features["reward"]
                                feature_dict["risk"] = message_features["risk"]
                                feature_dict["achievement"] = message_features["achievement"]
                                feature_dict["affiliation"] = message_features["affiliation"]

                                # Personal Pronouns
                                feature_dict["i_pronoun"] = message_features["i_pronoun"]
                                feature_dict["p_pronoun"] = message_features["p_pronoun"]

                                # Minkowski distance
                                feature_dict["minkowski"] = message_features["minkowski"]

                            if self.BEHAVIOURAL_FEATURES_ENABLED:
                                #feature_dict['post_freq'] = message_features['post_freq']
                                #feature_dict['follower_freq'] = message_features['follower_freq']
                                feature_dict['centrality'] = message_features['centrality']

                            if self.RADICAL_LANGUAGE_ENABLED:
                                feature_dict["message_vector"] = message_features["message_vector"]
                                feature_dict["violent_freq"] = message_features["violent_freq"]
                                feature_dict["cap_freq"] = message_features["cap_freq"]

                            feature_dict['is_extremist'] = message_features['is_extremist']

                            user = {user: feature_dict}
                            filtered_list_after_filters_applied.append(user)

                # NOTE(review): filtered_list_after_filters_applied is only used for
                # this count; the dataframe below is built from the raw `features`
                # list — confirm that is intentional.
                number_of_features = len(filtered_list_after_filters_applied)

                # Creates the columns for the data frame
                df = pd.DataFrame(
                    columns=column_names)

                completed_features = 0
                iterator = 0
                error_count = 0
                for message in features:
                    # should only be one user per entry
                    for user_id in message:
                        feature_data = message[user_id]
                        # ID is not included as it's hexidecimal and not float

                        row = []

                        if self.PSYCHOLOGICAL_SIGNALS_ENABLED:
                            clout = feature_data['clout']
                            analytic = feature_data['analytic']
                            tone = feature_data['tone']
                            authentic = feature_data['authentic']

                            anger = feature_data["anger"]
                            sadness = feature_data["sadness"]
                            anxiety = feature_data["anxiety"]
                            power = feature_data["power"]
                            reward = feature_data["reward"]
                            risk = feature_data["risk"]
                            achievement = feature_data["achievement"]
                            affiliation = feature_data["affiliation"]
                            i_pronoun = feature_data["i_pronoun"]
                            p_pronoun = feature_data["p_pronoun"]
                            minkowski = feature_data["minkowski"]

                            row = row + [clout, analytic, tone, authentic, anger, sadness, anxiety, power,
                                         reward, risk, achievement, affiliation, i_pronoun, p_pronoun, minkowski]

                        if self.BEHAVIOURAL_FEATURES_ENABLED:
                            #post_freq = feature_data['post_freq']
                            #follower_freq = feature_data['follower_freq']
                            centrality = feature_data['centrality']

                            row = row + [#post_freq, follower_freq,
                                centrality]

                        if self.RADICAL_LANGUAGE_ENABLED:
                            cap_freq = feature_data['cap_freq']
                            violent_freq = feature_data['violent_freq']
                            message_vector = feature_data['message_vector']

                            # message_vector is a list and is spread across the 200
                            # message_vector_N columns.
                            row = row + [cap_freq, violent_freq] + message_vector

                        is_extremist = feature_data['is_extremist']

                        row = row + [is_extremist]
                        try:
                            df.loc[iterator] = row
                        except ValueError as e:
                            print(e)
                            error_count = error_count + 1
                            pass # if error with value probably column mismatch which is down to taking a mesage with no data

                        iterator = iterator + 1
                        completed_features = completed_features + 1
                        user_name = list(message.keys())[0]
                        Logger.logger.print_message(
                            "Added a message from user {} to data frame - {} messages of {} completed".format(user_name,
                                                                                                              completed_features,
                                                                                                              number_of_features),
                            logging_level=1)

                Logger.logger.print_message("Total errors when creating data frame: {}".format(error_count),
                                            logging_level=1)

                # Replace boolean with float
                df.replace({False: 0, True: 1}, inplace=True)

                # Sets ID field
                df.index.name = "ID"
                df.to_csv("{}.csv".format(features_file))

            else:
                # Cached CSV exists and a rebuild was not forced: load it directly.
                df = pandas.read_csv(csv_file)

        return df
|
245 |
+
|
246 |
+
    def create_model_info_output_file(self, location_of_output_file = None, training_data_csv_location = None):
        """
        If the model has been loaded or trained this function will create a summary text file with information relating to
        the model (name, version, creation date, evaluation metrics, and per-feature importance scores).
        :param location_of_output_file: The location to save the output file to. Defaults to a timestamped
            "model-output-*.txt" file in the outputs folder.
        :param training_data_csv_location: The location of the training data csv. This is used to retrieve the name of the
        feature columns; when omitted, features are reported by index instead of by name.
        """

        # Check if model has been created
        if not self.creation_date:
            Logger.logger.print_message("Model has not been trained, created, or loaded. Cannot output model data in this state.",logging_level=1)
        else:
            Logger.logger.print_message("Creating model info text file")
            output_text = ""

            # Add summary information
            output_text += "Model {}, version {}, created at {} \n".format(self.original_name, self.model_version, self.creation_date)
            output_text += "\nAccuracy: {}\nRecall: {} \nPrecision: {}\nF-Measure: {}\n".format(self.accuracy, self.recall,
                                                                                                self.precision, self.f_measure)

            # Retrieve the header names if available
            if training_data_csv_location:
                with open(training_data_csv_location, "r") as csv_file:
                    reader = csv.reader(csv_file)
                    headers = next(reader)

            # Loop through all feature importance scores
            for iterator in range(len(self.model.feature_importances_)):
                if training_data_csv_location:
                    # Plus one to ignore ID field
                    output_text += "\n{}: {}".format(headers[iterator+1], self.model.feature_importances_[iterator])
                else:
                    # No header names available: fall back to positional feature labels.
                    output_text += "\nFeature {}: {}".format(iterator,self.model.feature_importances_[iterator])

            # If no name has been set write to outputs folder
            if location_of_output_file:
                file_name = location_of_output_file
            else:
                file_name = os.path.join(self._outputs_folder,"model-output-{}.txt".format(datetime.today().strftime('%Y-%m-%d-%H%M%S')))

            # Write to file
            with open(file_name, "w") as output_file:
                output_file.write(output_text)
|
290 |
+
|
291 |
+
def train_model(self, features_file, force_new_dataset=True, model_location=None):
|
292 |
+
"""
|
293 |
+
Trains the model of the proveded data unless the model file already exists or if the force new dataset flag is True.
|
294 |
+
:param features_file: the location of the feature file to be used to train the model
|
295 |
+
:param force_new_dataset: If True a new dataset will be created and new model created even if a model already exists.
|
296 |
+
:param model_location: the location to save the model file to
|
297 |
+
"""
|
298 |
+
|
299 |
+
# Sets model location based on default folder location and placeholder name if none was given
|
300 |
+
if model_location is None:
|
301 |
+
model_location = os.path.join(self._model_folder, "predictor.model")
|
302 |
+
|
303 |
+
# if told to force the creation of a new dataset to train off or the model location does not exist then make a new model
|
304 |
+
if force_new_dataset or not os.path.isfile(model_location):
|
305 |
+
|
306 |
+
# Import train_test_split function
|
307 |
+
feature_data = self.get_features_as_df(features_file, force_new_dataset)
|
308 |
+
|
309 |
+
# Removes index column
|
310 |
+
if "ID" in feature_data.keys():
|
311 |
+
feature_data.drop(feature_data.columns[0], axis=1, inplace=True)
|
312 |
+
feature_data.reset_index(drop=True, inplace=True)
|
313 |
+
|
314 |
+
y = feature_data[['is_extremist']] # Labels
|
315 |
+
X = feature_data.drop(axis=1, labels=['is_extremist']) # Features
|
316 |
+
|
317 |
+
# Split dataset into training set and test set
|
318 |
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # 80% training and 20% test
|
319 |
+
|
320 |
+
# Create a Gaussian Classifier
|
321 |
+
random_forest = RandomForestClassifier(n_estimators=100, max_depth=50, oob_score=True
|
322 |
+
) # class_weight={0:1,1:5} # A higher weight for the minority class (is_extreamist)
|
323 |
+
|
324 |
+
# Train the model using the training sets y_pred=random_forest.predict(X_test)
|
325 |
+
random_forest.fit(X_train, y_train.values.ravel())
|
326 |
+
|
327 |
+
y_pred = random_forest.predict(X_test)
|
328 |
+
|
329 |
+
# Model Accuracy, how often is the classifier correct?
|
330 |
+
self.accuracy = metrics.accuracy_score(y_test, y_pred)
|
331 |
+
self.recall = metrics.recall_score(y_test, y_pred)
|
332 |
+
self.precision = metrics.precision_score(y_test, y_pred)
|
333 |
+
self.f_measure = metrics.f1_score(y_test, y_pred)
|
334 |
+
|
335 |
+
Logger.logger.print_message("Accuracy: {}".format(self.accuracy), logging_level=1)
|
336 |
+
Logger.logger.print_message("Recall: {}".format(self.recall), logging_level=1)
|
337 |
+
Logger.logger.print_message("Precision: {}".format(self.precision), logging_level=1)
|
338 |
+
Logger.logger.print_message("F-Measure: {}".format(self.f_measure), logging_level=1)
|
339 |
+
|
340 |
+
self.model = random_forest
|
341 |
+
self.original_name = model_location
|
342 |
+
self.creation_date = datetime.today().strftime('%Y-%m-%d')
|
343 |
+
|
344 |
+
# write model and accuracy to file to file
|
345 |
+
model_data = {"model": self.model,
|
346 |
+
"original_name": self.original_name,
|
347 |
+
"creation_date": self.creation_date,
|
348 |
+
"accuracy": self.accuracy,
|
349 |
+
"recall": self.recall,
|
350 |
+
"precision": self.precision,
|
351 |
+
"f1": self.f_measure,
|
352 |
+
"version": self._FRAMEWORK_VERSION
|
353 |
+
}
|
354 |
+
|
355 |
+
pickle.dump(model_data, open(model_location, "wb"))
|
356 |
+
|
357 |
+
else:
|
358 |
+
# Read model and accuracy from file
|
359 |
+
saved_file = pickle.load(open(model_location, "rb"))
|
360 |
+
|
361 |
+
self.accuracy = saved_file["accuracy"]
|
362 |
+
self.recall = saved_file["recall"]
|
363 |
+
self.precision = saved_file["precision"]
|
364 |
+
self.f_measure = saved_file["f1"]
|
365 |
+
self.model = saved_file["model"]
|
366 |
+
self.model_version = saved_file["version"]
|
367 |
+
self.original_name = saved_file["original_name"]
|
368 |
+
self.creation_date = saved_file["creation_date"]
|
369 |
+
|
370 |
+
# A check to identify if the loaded model is of the same version as the tooling
|
371 |
+
if self.model_version is not self._FRAMEWORK_VERSION:
|
372 |
+
Logger.logger.print_message("Model provided is of version {}, tooling is of "
|
373 |
+
"version {}. Using the model may not work as expected."
|
374 |
+
.format(self.model_version, self._FRAMEWORK_VERSION))
|
Pinpoint/Sanitizer.py
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os.path
|
2 |
+
|
3 |
+
from nltk import *
|
4 |
+
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
|
5 |
+
|
6 |
+
from Pinpoint.Logger import *
|
7 |
+
|
8 |
+
# If the NLTK POS-tagger data doesn't exist locally, download just that package.
# Calling download() with no arguments opens the interactive downloader, which
# blocks in headless/automated environments (the old inline todo noted this).
try:
    tagged = pos_tag(["test"])
except LookupError:
    download('averaged_perceptron_tagger')
|
16 |
+
|
17 |
+
class sanitization():
    """
    This class is used to sanitize a given corpus of data: removing stop words,
    stemming words, removing small words, removing non-alphabet characters, and
    lower-casing. To save on repeat runs a local copy of the sanitized corpus is
    cached and re-used unless this feature is overridden.
    """

    def sanitize(self, text, output_folder, force_new_data_and_dont_persisit=False):
        """
        Entry function for sanitizing text.

        :param text: the raw text to sanitize
        :param output_folder: folder used to cache sanitized output
        :param force_new_data_and_dont_persisit: when True, always re-sanitize
            and do not write a cache file
        :return: sanitized text
        """
        import hashlib

        # The cache file name is derived from the text itself so identical input
        # re-uses the cached result. (Previously a random uuid4 name was generated
        # on every call, so the cache-hit branch below could never trigger.)
        text_digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
        sanitize_file_name = os.path.join(output_folder, "{}-sanitized_text.txt".format(text_digest))

        # If a cached file exists don't sanitize the given text again
        if os.path.isfile(sanitize_file_name) and not force_new_data_and_dont_persisit:
            logger.print_message("Sanitized file exists. Using data")

            with open(sanitize_file_name, 'r', encoding="utf8") as cache_file:
                final_text = cache_file.read()

        else:
            words = text.split(" ")
            logger.print_message("Starting sanitization... {} words to go".format(len(words)))

            cleaned_words = []
            for word in words:
                word = self.remove_non_alpha(word)
                word = self.lower(word)
                word = self.stemmer(word)
                word = self.remove_stop_words(word)
                word = self.remove_small_words(word)

                # Skip words removed by the pipeline (previously empty strings were
                # appended anyway, leaving stray double spaces in the output)
                if word:
                    cleaned_words.append(word)

            final_text = " ".join(cleaned_words)

            if not force_new_data_and_dont_persisit:
                with open(sanitize_file_name, 'w', encoding="utf8") as cache_file:
                    cache_file.write(final_text)

        return final_text.strip()

    def stemmer(self, word):
        """
        Get the stem of a word.
        :param word: the word to stem
        :return: the stemmed word using the Porter stemmer
        """
        # todo: should another stemmer (e.g. Lancaster) be assessed?
        # Re-use a single PorterStemmer instance; it was rebuilt for every word.
        if not hasattr(self, "_porter"):
            self._porter = PorterStemmer()
        return self._porter.stem(word)

    def lower(self, word):
        """
        Get the lower case representation of a word.
        :param word: the word to convert
        :return: the lowercase representation of the word
        """
        return word.lower()

    def remove_stop_words(self, text):
        """
        Remove English stop words from the given text.
        :param text: one or more whitespace-separated words
        :return: the text without stop words, words joined by single spaces
        """
        return " ".join(word for word in text.split() if word not in ENGLISH_STOP_WORDS)

    def remove_non_alpha(self, word):
        """
        Removes non-alphabet characters (excluding spaces).
        :param word: the word to clean
        :return: the word with non-alpha characters removed
        """
        word = word.replace("\n", " ").replace("\t", " ").replace("  ", " ")
        return re.sub('[^a-zA-Z ]', '', word)

    def remove_small_words(self, word, length_to_remove_if_not_equal=4):
        """
        Removes words that are too small; by default words of 3 characters or
        fewer are removed.
        :param word: the word to check
        :param length_to_remove_if_not_equal: minimum length a word must have to be kept
        :return: "" if the word is below the minimum length, otherwise the word
        """
        # Compare against the stripped word so trailing whitespace doesn't let
        # short words through (previously "cat " counted as length 4 and was kept,
        # contradicting this method's documented contract).
        if len(word.strip()) >= length_to_remove_if_not_equal:
            return word
        return ""
|
Pinpoint/Serializer.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# TODO: this file should hold the serialisation helpers shared across the data aggregators.
|
2 |
+
|
3 |
+
def createPostDict(date, post_text, likes, comments, shares, source="self"):
    '''
    Bundle the pertinent fields of a single social-media post into a dictionary.
    The result is intended to be appended to a list of the account's other posts,
    which in turn is added to a master per-user dictionary.
    :param date: date the post was made
    :param post_text: body text of the post
    :param likes: like count
    :param comments: comment count
    :param shares: share/retweet count
    :param source: origin of the post (defaults to "self")
    :return: a dictionary containing pertinent post information
    '''
    return {
        "text": post_text,
        "likes": likes,
        "comments": comments,
        "shares": shares,
        "source": source,
        "date": date,
    }
|
16 |
+
|
17 |
+
|
18 |
+
def createWholeUserDict(unique_id, reddit_list, instagram_list, twitter_list, survey_data):
    '''
    Assemble one user's collected data (per-platform post lists plus survey
    answers) into a single dictionary keyed by a unique identifier.
    :param unique_id: unique identifier for the user
    :param reddit_list: list of the user's Reddit posts
    :param instagram_list: list of the user's Instagram posts
    :param twitter_list: list of the user's Twitter posts
    :param survey_data: the user's survey responses
    :return: a dictionary combining all of the above
    '''
    user_record = {"id": unique_id}
    user_record["reddit"] = reddit_list
    user_record["instagram"] = instagram_list
    user_record["twitter"] = twitter_list
    user_record["survey"] = survey_data
    return user_record
|
Pinpoint/__pycache__/Aggregator_NGram.cpython-310.pyc
ADDED
Binary file (3.13 kB). View file
|
Pinpoint/__pycache__/Aggregator_NGram.cpython-36.pyc
ADDED
Binary file (3.09 kB). View file
|
Pinpoint/__pycache__/Aggregator_NGram.cpython-38.pyc
ADDED
Binary file (3.08 kB). View file
|
Pinpoint/__pycache__/Aggregator_TfIdf.cpython-310.pyc
ADDED
Binary file (1.73 kB). View file
|
Pinpoint/__pycache__/Aggregator_TfIdf.cpython-36.pyc
ADDED
Binary file (1.7 kB). View file
|
Pinpoint/__pycache__/Aggregator_TfIdf.cpython-38.pyc
ADDED
Binary file (1.69 kB). View file
|
Pinpoint/__pycache__/Aggregator_Word2Vec.cpython-310.pyc
ADDED
Binary file (1.05 kB). View file
|
Pinpoint/__pycache__/Aggregator_Word2Vec.cpython-36.pyc
ADDED
Binary file (1.03 kB). View file
|
Pinpoint/__pycache__/Aggregator_Word2Vec.cpython-38.pyc
ADDED
Binary file (1.02 kB). View file
|
Pinpoint/__pycache__/Aggregator_WordingChoice.cpython-310.pyc
ADDED
Binary file (1.86 kB). View file
|
Pinpoint/__pycache__/Aggregator_WordingChoice.cpython-36.pyc
ADDED
Binary file (1.83 kB). View file
|
Pinpoint/__pycache__/Aggregator_WordingChoice.cpython-38.pyc
ADDED
Binary file (1.81 kB). View file
|
Pinpoint/__pycache__/FeatureExtraction.cpython-310.pyc
ADDED
Binary file (19.7 kB). View file
|
Pinpoint/__pycache__/FeatureExtraction.cpython-36.pyc
ADDED
Binary file (19.5 kB). View file
|
Pinpoint/__pycache__/FeatureExtraction.cpython-38.pyc
ADDED
Binary file (19.4 kB). View file
|
Pinpoint/__pycache__/Grapher.cpython-310.pyc
ADDED
Binary file (2.17 kB). View file
|
Pinpoint/__pycache__/Grapher.cpython-36.pyc
ADDED
Binary file (2.13 kB). View file
|
Pinpoint/__pycache__/Grapher.cpython-38.pyc
ADDED
Binary file (2.14 kB). View file
|
Pinpoint/__pycache__/Logger.cpython-310.pyc
ADDED
Binary file (1.07 kB). View file
|
Pinpoint/__pycache__/Logger.cpython-36.pyc
ADDED
Binary file (1.05 kB). View file
|
Pinpoint/__pycache__/Logger.cpython-38.pyc
ADDED
Binary file (1.04 kB). View file
|
Pinpoint/__pycache__/RandomForest.cpython-310.pyc
ADDED
Binary file (8.12 kB). View file
|
Pinpoint/__pycache__/RandomForest.cpython-36.pyc
ADDED
Binary file (7.97 kB). View file
|
Pinpoint/__pycache__/RandomForest.cpython-38.pyc
ADDED
Binary file (7.98 kB). View file
|
Pinpoint/__pycache__/Sanitizer.cpython-310.pyc
ADDED
Binary file (3.99 kB). View file
|
Pinpoint/__pycache__/Sanitizer.cpython-36.pyc
ADDED
Binary file (3.91 kB). View file
|
Pinpoint/__pycache__/Sanitizer.cpython-38.pyc
ADDED
Binary file (3.92 kB). View file
|
Pinpoint/__pycache__/predictor.cpython-38.pyc
ADDED
Binary file (2.39 kB). View file
|
Pinpoint/far-right-core.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Example of training a model using this package.
|
3 |
+
"""
|
4 |
+
|
5 |
+
from Pinpoint.FeatureExtraction import *
|
6 |
+
from Pinpoint.RandomForest import *
|
7 |
+
|
8 |
+
# Performs feature extraction from the provided Extremist, Counterpoise, and Baseline datasets.
|
9 |
+
extractor = feature_extraction(violent_words_dataset_location=r"datasets/swears",
|
10 |
+
baseline_training_dataset_location=r"datasets/far-right/LIWC2015 Results (Storm_Front_Posts).csv")
|
11 |
+
|
12 |
+
extractor.MAX_RECORD_SIZE = 50000
|
13 |
+
|
14 |
+
extractor.dump_training_data_features(
|
15 |
+
feature_file_path_to_save_to=r"outputs/training_features.json",
|
16 |
+
extremist_data_location=r"datasets/far-right/LIWC2015 Results (extreamist-messages.csv).csv",
|
17 |
+
baseline_data_location=r"datasets/far-right/LIWC2015 Results (non-extreamist-messages.csv).csv")
|
18 |
+
|
19 |
+
# Trains a model off the features file created in the previous stage
|
20 |
+
model = random_forest()
|
21 |
+
|
22 |
+
model.RADICAL_LANGUAGE_ENABLED = True
|
23 |
+
model.BEHAVIOURAL_FEATURES_ENABLED = True
|
24 |
+
model.PSYCHOLOGICAL_SIGNALS_ENABLED = True
|
25 |
+
|
26 |
+
model.train_model(features_file= r"outputs/training_features.json",
|
27 |
+
force_new_dataset=True, model_location=r"outputs/far-right-radical-language.model") # , model_location=r"Pinpoint/model/my.model"
|
28 |
+
|
29 |
+
model.create_model_info_output_file(location_of_output_file="outputs/far-right-radical-language-output.txt",
|
30 |
+
training_data_csv_location=r"outputs/training_features.json.csv")
|
31 |
+
|
32 |
+
#############################################################################################
|
33 |
+
model.RADICAL_LANGUAGE_ENABLED = False
|
34 |
+
model.BEHAVIOURAL_FEATURES_ENABLED = True
|
35 |
+
model.PSYCHOLOGICAL_SIGNALS_ENABLED = False
|
36 |
+
|
37 |
+
model.train_model(features_file= r"outputs/training_features.json",
|
38 |
+
force_new_dataset=True, model_location=r"outputs/far-right-behavioural.model") # , model_location=r"Pinpoint/model/my.model"
|
39 |
+
|
40 |
+
model.create_model_info_output_file(location_of_output_file="outputs/far-right-behavioural-output.txt",
|
41 |
+
training_data_csv_location=r"outputs/training_features.json.csv")
|
42 |
+
|
43 |
+
############################################################################
|
44 |
+
model.RADICAL_LANGUAGE_ENABLED = False
|
45 |
+
model.BEHAVIOURAL_FEATURES_ENABLED = False
|
46 |
+
model.PSYCHOLOGICAL_SIGNALS_ENABLED = True
|
47 |
+
|
48 |
+
model.train_model(features_file= r"outputs/training_features.json",
|
49 |
+
force_new_dataset=True, model_location=r"outputs/far-right-psychological.model") # , model_location=r"Pinpoint/model/my.model"
|
50 |
+
|
51 |
+
model.create_model_info_output_file(location_of_output_file="outputs/far-right-psychological-output.txt",
|
52 |
+
training_data_csv_location=r"outputs/training_features.json.csv")
|
53 |
+
|
54 |
+
##############################################################################################
|
55 |
+
model.RADICAL_LANGUAGE_ENABLED = True
|
56 |
+
model.BEHAVIOURAL_FEATURES_ENABLED = False
|
57 |
+
model.PSYCHOLOGICAL_SIGNALS_ENABLED = False
|
58 |
+
|
59 |
+
model.train_model(features_file= r"outputs/training_features.json",
|
60 |
+
force_new_dataset=True, model_location=r"outputs/far-right-baseline.model") # , model_location=r"Pinpoint/model/my.model"
|
61 |
+
|
62 |
+
model.create_model_info_output_file(location_of_output_file="outputs/far-right-baseline-output.txt",
|
63 |
+
training_data_csv_location=r"outputs/training_features.json.csv")
|
64 |
+
|
65 |
+
print("Finished")
|
app.py
ADDED
@@ -0,0 +1,356 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# coding: utf-8
|
3 |
+
import json
|
4 |
+
import os
|
5 |
+
import re
|
6 |
+
import time
|
7 |
+
from random import random
|
8 |
+
import socket
|
9 |
+
|
10 |
+
from threading import Thread
|
11 |
+
from time import sleep
|
12 |
+
|
13 |
+
test_html = '''
|
14 |
+
<!-- Header -->
|
15 |
+
<header class="w3-display-container w3-content w3-wide" style="max-width:1500px;" id="home">
|
16 |
+
<img class="w3-image" src="https://cdn.pixabay.com/photo/2018/12/10/16/22/city-3867295_960_720.png" alt="Architecture" width="1500" height="800">
|
17 |
+
<div class="w3-display-middle w3-margin-top w3-center">
|
18 |
+
<h1 class="w3-xxlarge w3-text-white"><span class="w3-padding w3-black w3-opacity-min"><b>WATCH</b></span> <span class="w3-hide-small w3-text-dark-grey">Tower</span></h1>
|
19 |
+
</div>
|
20 |
+
</header>
|
21 |
+
|
22 |
+
<!-- Container (About Section) -->
|
23 |
+
<div class="w3-content w3-container w3-padding-64" id="about">
|
24 |
+
<h3 class="w3-center">Block Violent Content Before It Reaches Your Feed</h3>
|
25 |
+
<p class="w3-center"><em>WatchTower identifies, blocks, and filters out violent and radical content before it reaches your Twitter feed.
|
26 |
+
</em></p>
|
27 |
+
<br>
|
28 |
+
<p>WatchTower works to protect you from violent, misinformation, hate speech and other malicious communication by using a suite of machine learning models to identify user accounts that post content that commonly falls into these categories. WatchTower is broken down into two components, the first utilises the Twitter streaming API and applies a suite of machine learning models to identify users that commonly post malicious information, while the second element provides a web UI where users can authenticaate with Twitter and tailor the types and thresholds for the accounts they block. </p>
|
29 |
+
<br>
|
30 |
+
<p> WatchTower was developed solely by James Stevenson and primarily uses Pinpoint, a machine learning model also developed by James. The future roadmap sees WatchTower incoperate other models for identifying contrent such as misinformation and hate speech. More on Pinpoint and the model WatchTower uses to identify violent extremism can be seen below.</p>
|
31 |
+
|
32 |
+
<p class="w3-large w3-center w3-padding-16">Model Accuracy:</p>
|
33 |
+
<p class="w3-center"><em>Machine learning models can be validated based on several statistics. These statistics for Pinpoint the main ML model used by WatchTower can be seen below. </p>
|
34 |
+
<br>
|
35 |
+
<p class="w3-wide"><i class="fa fa-camera"></i>Accuracy</p>
|
36 |
+
<div class="w3-light-grey">
|
37 |
+
<div class="w3-container w3-padding-small w3-dark-grey w3-center" style="width:73%">73%</div>
|
38 |
+
</div>
|
39 |
+
<p class="w3-wide"><i class="fa fa-laptop"></i>Recall</p>
|
40 |
+
<div class="w3-light-grey">
|
41 |
+
<div class="w3-container w3-padding-small w3-dark-grey w3-center" style="width:62%">62%</div>
|
42 |
+
</div>
|
43 |
+
<p class="w3-wide"><i class="fa fa-photo"></i>Precision</p>
|
44 |
+
<div class="w3-light-grey">
|
45 |
+
<div class="w3-container w3-padding-small w3-dark-grey w3-center" style="width:78%">78%</div>
|
46 |
+
</div>
|
47 |
+
<p class="w3-wide"><i class="fa fa-photo"></i>F-Measure</p>
|
48 |
+
<div class="w3-light-grey">
|
49 |
+
<div class="w3-container w3-padding-small w3-dark-grey w3-center" style="width:69%">69%</div>
|
50 |
+
</div>
|
51 |
+
</div>
|
52 |
+
|
53 |
+
<div class="w3-row w3-center w3-dark-grey w3-padding-16">
|
54 |
+
<div class="w3-quarter w3-section">
|
55 |
+
<span class="w3-xlarge">14+</span><br>
|
56 |
+
Partners
|
57 |
+
</div>
|
58 |
+
<div class="w3-quarter w3-section">
|
59 |
+
<span class="w3-xlarge">55+</span><br>
|
60 |
+
Projects Done
|
61 |
+
</div>
|
62 |
+
<div class="w3-quarter w3-section">
|
63 |
+
<span class="w3-xlarge">89+</span><br>
|
64 |
+
Happy Clients
|
65 |
+
</div>
|
66 |
+
<div class="w3-quarter w3-section">
|
67 |
+
<span class="w3-xlarge">150+</span><br>
|
68 |
+
Meetings
|
69 |
+
</div>
|
70 |
+
</div>
|
71 |
+
<br>
|
72 |
+
<!-- Container (Portfolio Section) -->
|
73 |
+
<div class="w3-content w3-container w3-padding-64" id="portfolio">
|
74 |
+
<h3 class="w3-center">Chirp Development Challenge 2022</h3>
|
75 |
+
<p class="w3-center"><em>WatchTower was developed for the Chirp 2022 Twitter API Developer Challenge</em></p>
|
76 |
+
</div><p> Watchtower was developed solely by James Stevenson for the Chirp 2022 Twitter API Developer Challenge. More infomration of this can be found below.</p>
|
77 |
+
<br>
|
78 |
+
<img class="w3-image" src="https://cdn.cms-twdigitalassets.com/content/dam/developer-twitter/redesign-2021-images/blog2022/chirp/Chirp-Hero-Banner.jpg.twimg.1920.jpg" alt="Architecture" width="1500" height="800">
|
79 |
+
<br>
|
80 |
+
<!-- Modal for full size images on click-->
|
81 |
+
<div id="modal01" class="w3-modal w3-black" onclick="this.style.display='none'">
|
82 |
+
<span class="w3-button w3-large w3-black w3-display-topright" title="Close Modal Image"><i class="fa fa-remove"></i></span>
|
83 |
+
<div class="w3-modal-content w3-animate-zoom w3-center w3-transparent w3-padding-64">
|
84 |
+
<img id="img01" class="w3-image">
|
85 |
+
<p id="caption" class="w3-opacity w3-large"></p>
|
86 |
+
</div>
|
87 |
+
</div>
|
88 |
+
|
89 |
+
<script>
|
90 |
+
// Modal Image Gallery
|
91 |
+
function onClick(element) {
|
92 |
+
document.getElementById("img01").src = element.src;
|
93 |
+
document.getElementById("modal01").style.display = "block";
|
94 |
+
var captionText = document.getElementById("caption");
|
95 |
+
captionText.innerHTML = element.alt;
|
96 |
+
}
|
97 |
+
|
98 |
+
// Change style of navbar on scroll
|
99 |
+
window.onscroll = function() {myFunction()};
|
100 |
+
function myFunction() {
|
101 |
+
var navbar = document.getElementById("myNavbar");
|
102 |
+
if (document.body.scrollTop > 100 || document.documentElement.scrollTop > 100) {
|
103 |
+
navbar.className = "w3-bar" + " w3-card" + " w3-animate-top" + " w3-white";
|
104 |
+
} else {
|
105 |
+
navbar.className = navbar.className.replace(" w3-card w3-animate-top w3-white", "");
|
106 |
+
}
|
107 |
+
}
|
108 |
+
|
109 |
+
// Used to toggle the menu on small screens when clicking on the menu button
|
110 |
+
function toggleFunction() {
|
111 |
+
var x = document.getElementById("navDemo");
|
112 |
+
if (x.className.indexOf("w3-show") == -1) {
|
113 |
+
x.className += " w3-show";
|
114 |
+
} else {
|
115 |
+
x.className = x.className.replace(" w3-show", "");
|
116 |
+
}
|
117 |
+
}
|
118 |
+
</script>
|
119 |
+
|
120 |
+
</body>
|
121 |
+
</html>
|
122 |
+
|
123 |
+
|
124 |
+
|
125 |
+
'''
|
126 |
+
|
127 |
+
import gradio as gr
|
128 |
+
import tweepy
|
129 |
+
from fastapi import FastAPI, Request
|
130 |
+
|
131 |
+
# Twitter API credentials are supplied via environment variables (never hard-coded).
consumer_token = os.getenv('CONSUMER_TOKEN')
consumer_secret = os.getenv('CONSUMER_SECRET')
my_access_token = os.getenv('ACCESS_TOKEN')
my_access_secret = os.getenv('ACCESS_SECRET')
global_oauth1_user_handler = None
bearer = os.getenv('BEARER')

# OAuth1 handler used only to generate the "sign in with Twitter" URL; the
# callback points at the locally-served gradio app.
oauth1_user_handler = tweepy.OAuth1UserHandler(
    consumer_token, consumer_secret,
    callback="http://127.0.0.1:7860/"
)
target_website = oauth1_user_handler.get_authorization_url(signin_with_twitter=True)

# Root gradio Blocks app; referenced throughout (e.g. get_oath_headers scans
# block.server connections for the OAuth callback).
block = gr.Blocks(css=".container { max-width: 800px; margin: auto; }")

# Shared chatbot transcript: a list of [user message, bot message] pairs.
chat_history = []
|
147 |
+
|
148 |
+
def get_client_from_tokens(oauth_verifier, oauth_token):
    """
    Exchange the OAuth callback parameters for an authenticated tweepy Client
    acting on behalf of the signed-in user.

    :param oauth_verifier: the oauth_verifier query parameter from Twitter's callback
    :param oauth_token: the oauth_token query parameter from Twitter's callback
    :return: a tweepy.Client authenticated with the user's access tokens
    """
    new_oauth1_user_handler = tweepy.OAuth1UserHandler(
        consumer_token, consumer_secret,
        callback="http://127.0.0.1:7860/"
    )
    # NOTE(review): oauth_token_secret is set to the app's consumer_secret here,
    # not a per-request token secret — looks suspicious but may be a deliberate
    # workaround since the original request token is not persisted; confirm
    # against tweepy's 3-legged OAuth flow before changing.
    new_oauth1_user_handler.request_token = {
        "oauth_token": oauth_token,
        "oauth_token_secret": consumer_secret
    }

    # Completes the 3-legged flow: trade the verifier for user access tokens.
    access_token, access_token_secret = new_oauth1_user_handler.get_access_token(
        oauth_verifier
    )

    their_client = tweepy.Client(
        bearer_token=bearer,
        consumer_key=consumer_token,
        consumer_secret=consumer_secret,
        access_token=access_token,
        access_token_secret=access_token_secret
    )

    return their_client
|
171 |
+
|
172 |
+
def get_oath_headers():
    """
    Scan the gradio server's active connections for a request whose headers carry
    the Twitter OAuth callback parameters.

    :return: (oauth_verifier, oauth_token) tuple; (None, None) when not found
    """
    if hasattr(block, "server"):
        for connection in block.server.server_state.connections:
            # connection_app_id = connection.app.app.blocks.app_id
            # if active_app_id == connection_app_id:
            #     print("Its a match")
            if connection.headers is None:  # fixed: was "!= None"
                continue
            for header in connection.headers:
                header = header[1].decode()
                if "oauth_verifier" in header:
                    oauth_verifier = re.search(r"oauth_verifier=(.+)", header).group(1)
                    oauth_token = re.search(r"oauth_token=(.+)&", header).group(1)
                    # Early return replaces the original flag-and-double-break;
                    # both regex groups are non-empty when matched, so the
                    # reachable end states are unchanged.
                    if oauth_token and oauth_verifier:
                        return oauth_verifier, oauth_token
    return None, None
|
193 |
+
|
194 |
+
def block_users(client, threshold, dataset):
    """
    Block every stored user whose recorded threshold is at or below the
    caller-supplied threshold, retrying through Twitter rate limits.

    :param client: an authenticated tweepy.Client acting as the blocking user
    :param threshold: certainty tolerance; users at or below it are blocked
    :param dataset: currently unused — TODO confirm whether per-dataset user
        files were intended here (kept for interface compatibility)
    :return: the number of accounts blocked
    """
    num_users_blocked = 0

    for filename in os.listdir("users"):
        filename = os.path.join("users", filename)

        # Context manager fixes the previously-leaked file handle
        with open(filename, "r") as user_file:
            users = json.load(user_file)

        for user in users:
            if threshold >= user["threshold"]:

                # Separate name avoids rebinding the loop's dict variable
                username = user["username"].strip()
                user_id = client.get_user(username=username)

                # Retry until the block sticks; back off on rate limiting
                finished = False
                while not finished:
                    try:
                        client.block(target_user_id=user_id.data.id)
                    except tweepy.errors.TooManyRequests as e:
                        print(e)
                        time.sleep(240)
                        continue
                    finished = True
                me = client.get_me()
                print("{} blocked {}".format(me.data["username"], username))
                num_users_blocked = num_users_blocked + 1

    return num_users_blocked
|
223 |
+
|
224 |
+
def has_oath_header():
    """
    Report whether an active connection carries the Twitter OAuth callback
    parameters.

    :return: True when an oauth_verifier was found, otherwise False
    """
    # Single expression replaces the original "== None" if/else
    # (identity comparison with "is", per PEP 8).
    oauth_verifier, _ = get_oath_headers()
    return oauth_verifier is not None
|
230 |
+
|
231 |
+
username_populated = False
def chat(radio_score = None, selected_option = None):
    """
    Drive one round of the chatbot: when both tuning options are supplied and a
    logged-in client exists, run account blocking and report the outcome through
    the shared chat history.

    :param radio_score: threshold certainty tolerance chosen in the UI
    :param selected_option: the dataset/behaviour selected in the UI
    :return: the updated chat history (list of [user, bot] message pairs)
    """
    global client

    # (removed: unused locals "history" and "response"; "!= None" → "is not None")
    if radio_score is not None and selected_option is not None:
        # Both options set — only act when a signed-in client is available
        if client is not None:
            chat_history.append(["Model tuned to a '{}%' threshold and is using the '{}' dataset.".format(radio_score, selected_option),
                                 "{} Account blocking initialised".format(selected_option.capitalize())])
            num_users_blocked = block_users(client, radio_score, selected_option)
            chat_history.append(["Blocked {} user account(s).".format(num_users_blocked), "Thank you for using Watchtower."])
    elif radio_score is not None or selected_option is not None:
        # Exactly one of the two options was supplied
        chat_history.append(["Initialisation error!", "Please tune the model by using the above options"])

    return chat_history
|
249 |
+
|
250 |
+
def infer(prompt):
    """Placeholder inference hook; intentionally a no-op that returns None."""
    pass
|
252 |
+
|
253 |
+
# Module-level session state shared between the gradio callbacks.
have_initialised = False
client = None
name = None

def changed_tab():
    """
    Tab-change callback: on first visit after the Twitter OAuth redirect, build
    the authenticated client from the callback headers and refresh the chatbot
    greeting; otherwise (re)display the default welcome transcript.
    """
    global have_initialised
    global chatbot
    global chat_history
    global client
    global name

    name = "no username"

    # NOTE(review): .format(name) on a string with no "{}" placeholder is a no-op.
    chat_history = [["Welcome to Watchtower.".format(name), "Log in via Twitter and configure your blocking options above."]]

    # NOTE(review): this branch is dead — name was assigned "no username" two
    # lines above, so the second condition is always False. Possibly the check
    # was meant to run before the reassignment; confirm intent before fixing.
    if client != None and name != "no username":
        chat_history = [["Welcome {}".format(name), "Initialising WatchTower"]]

        print("changed tabs - {}".format(name))
        chatbot.value = chat_history
        chatbot.update(value=chat_history)
    elif has_oath_header() and client==None:

        # OAuth callback params present and no client yet: finish the login
        tokens = get_oath_headers()
        if tokens[0] and client==None:
            client = get_client_from_tokens(tokens[0],tokens[1])
            name = client.get_me().data.name
            have_initialised = True
            chat_history = [["Welcome {}".format(name), "Initialising WatchTower"]]

        chatbot.value = chat_history
        chatbot.update(value=chat_history)

    elif not has_oath_header() and not have_initialised:
        # Not signed in and nothing initialised yet: show the default greeting
        chatbot.value = chat_history
        chatbot.update(value=chat_history)
|
289 |
+
|
290 |
+
# --- Gradio UI layout -------------------------------------------------------
# Builds the Watchtower interface inside the previously created `block`
# (gr.Blocks) context: a static W3.CSS navbar, an "Introduction" tab, and a
# "Getting Started" tab with the Twitter login link, blocking options, the
# chatbot output pane and the run button.
with block:
    # Static top navigation bar (plain HTML, W3.CSS styling).
    gr.HTML('''
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <link rel="stylesheet" href="https://www.w3schools.com/w3css/4/w3.css">

    <!-- Navbar (sit on top) -->
    <div class="w3-top">
    <div class="w3-bar w3-white w3-wide w3-padding w3-card">
    <p class="w3-bar-item w3-button"><b>WATCH</b> Tower</p>
    </div>
    </div>
    ''')
    # Spacer under the fixed navbar.
    gr.HTML("<center><p><br></p></center>")

    #todo check if user signed in

    user_message = "Log in via Twitter and configure your blocking options above."

    # Seed the conversation pane with the login prompt.
    chat_history.append(["Welcome to Watchtower.",user_message])
    tabs = gr.Tabs()
    with tabs:
        intro_tab = gr.TabItem("Introduction")
        with intro_tab:
            # Static landing-page HTML defined earlier in the file.
            gr.HTML(test_html)

        prediction_tab = gr.TabItem("Getting Started")
        with prediction_tab:
            # Hero banner for the tab.
            gr.HTML('''
            <header class="w3-display-container w3-content w3-wide" style="max-height:250px;" id="home">
            <img class="w3-image" src="https://cdn.pixabay.com/photo/2018/12/10/16/22/city-3867295_960_720.png" alt="Architecture" width="1500" height="800">
            <div class="w3-display-middle w3-margin-top w3-center">
            <h1 class="w3-xxlarge w3-text-white"><span class="w3-padding w3-black w3-opacity-min"><b>WATCH</b></span> <span class="w3-hide-small w3-text-dark-grey">Tower</span></h1>
            </div>
            </header>
            ''')
            with gr.Group():
                with gr.Box():
                    # "Sign in with Twitter" button linking to the OAuth URL
                    # (target_website) computed earlier in the file.
                    with gr.Row().style(mobile_collapse=False, equal_height=True):
                        gr.HTML(
                            value='<a href={}><img src="https://cdn.cms-twdigitalassets.com/content/dam/developer-twitter/auth-docs/sign-in-with-twitter-gray.png.twimg.1920.png" alt="Log In With Twitter"></a><br>'.format(
                                target_website))
                    # Blocking configuration: category checkboxes (display
                    # only — interactive=False) and a certainty threshold.
                    with gr.Row().style(mobile_collapse=False, equal_height=True):
                        radio = gr.CheckboxGroup(value="Violent", choices=["Violent", "Hate Speech", "Misinformation"],
                                                 interactive=False, label="Behaviour To Block")

                        slider = gr.Slider(value=80, label="Threshold Certainty Tolerance")

            # Output pane and the button that kicks off a run.
            chatbot = gr.Chatbot(value=chat_history, label="Watchtower Output").style()
            btn = gr.Button("Run WatchTower").style(full_width=True)
            #radio.change(fn=chat, inputs=[radio], outputs=chatbot)
            #slider.change(fn=chat, inputs=[slider], outputs=chatbot)
            #text.submit(fn=chat, inputs=[text,text], outputs=chatbot)
            btn.click(fn=chat, inputs=[slider,radio], outputs=chatbot)
            # Refresh the chat pane whenever the user changes tabs.
            tabs.change(fn=changed_tab, inputs=None, outputs=None)

    # Footer attribution (carried over from the template this app is based on).
    gr.Markdown(
        """___
        <p style='text-align: center'>
        Created by <a href="https://twitter.com/borisdayma" target="_blank">Boris Dayma</a> et al. 2021-2022
        <br/>
        <a href="https://github.com/borisdayma/dalle-mini" target="_blank">GitHub</a> | <a href="https://wandb.ai/dalle-mini/dalle-mini/reports/DALL-E-mini-Generate-images-from-any-text-prompt--VmlldzoyMDE4NDAy" target="_blank">Project Report</a>
        </p>"""
    )

# Serve the UI; queueing disabled so clicks call handlers synchronously.
block.launch(enable_queue=False)
|
outputs/sanitized_text.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
outputs/users.json
ADDED
@@ -0,0 +1 @@
|
|
|
1 |
+
{"unknown": "aa60c20c4b0742069665b5c7d6bbff82"}
|
predictor.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import csv
|
2 |
+
import time
|
3 |
+
import uuid
|
4 |
+
from pprint import pprint
|
5 |
+
|
6 |
+
import Pinpoint.FeatureExtraction
|
7 |
+
from Pinpoint.RandomForest import *
|
8 |
+
|
9 |
+
class predictor():
    """Wrapper around the Pinpoint random-forest model for one-off predictions.

    Loads the pre-trained far-right radical-language model once at
    construction time; each ``predict`` call then extracts features for a
    single message via temporary scratch files and classifies it.
    """

    def __init__(self):
        # Load the pre-trained model; features requiring LIWC markup or
        # behavioural history are disabled because they are unavailable here.
        self.model = random_forest()
        self.model.PSYCHOLOGICAL_SIGNALS_ENABLED = False  # Needs LIWC markup
        self.model.BEHAVIOURAL_FEATURES_ENABLED = False
        self.model.train_model(features_file=None, force_new_dataset=False,
                               model_location=r"far-right-radical-language.model")
        self.dict_of_users_all = {}
        # Feature extractor backed by the bundled violent-words / LIWC datasets.
        self.feature_extractor = Pinpoint.FeatureExtraction.feature_extraction(
            violent_words_dataset_location="swears",
            baseline_training_dataset_location="LIWC2015 Results (Storm_Front_Posts).csv")

    def predict(self, string_to_predict = None, username = "unknown"):
        """Classify a single message as extremist or not.

        :param string_to_predict: the message text to classify.
        :param username: optional author name recorded alongside the message.
        :return: True if the model flags the message as extremist, False
            otherwise (including when feature extraction fails).
        :raises Exception: if no message text is supplied.
        """
        if string_to_predict is None:
            raise Exception("No prediction material given...")

        # Unique per-call ID so concurrent predictions never share scratch files.
        extended_prediction_uuid = str(uuid.uuid1()) + "-" + str(uuid.uuid1())
        self.model.model_folder = "{}-output".format(extended_prediction_uuid)
        self.feature_extractor.MESSAGE_TMP_CACHE_LOCATION = "{}-message-cache".format(extended_prediction_uuid)
        print("Starting prediction for {}".format(extended_prediction_uuid))

        users_posts = [{"username": "{}".format(username), "timestamp": "tmp",
                        "message": "{}".format(string_to_predict)}]

        # Remove any stale scratch file left over from a previous run.
        try:
            os.remove("./{}-messages.json".format(extended_prediction_uuid))
        except OSError:
            pass

        # Write the single message out as a CSV for the feature extractor.
        with open('{}-all-messages.csv'.format(extended_prediction_uuid), 'w',
                  encoding='utf8', newline='') as output_file:
            writer = csv.DictWriter(output_file, fieldnames=["username", "timestamp", "message"])
            for users_post in users_posts:
                writer.writerow(users_post)

        try:
            self.feature_extractor._get_standard_tweets("{}-all-messages.csv".format(extended_prediction_uuid))
        except FileNotFoundError:
            # Extraction could not read its inputs; treat as non-extremist.
            return False

        # Persist the extracted features so the model can read them back.
        with open("./{}-messages.json".format(extended_prediction_uuid), 'w') as outfile:
            features = self.feature_extractor.completed_tweet_user_features
            json.dump(features, outfile, indent=4)

        rows = self.model.get_features_as_df("./{}-messages.json".format(extended_prediction_uuid), True)
        rows.pop("is_extremist")

        try:
            features = rows.loc[0]
            is_extremist = self.model.model.predict([features])
        except FileNotFoundError as e:
            is_extremist = False
            print("Message cache error, next - {}".format(e))

        print("Ending prediction for {}".format(extended_prediction_uuid))

        # Clean up all scratch files; tolerate ones a failed run never created.
        for scratch_file in ("{}-all-messages.csv".format(extended_prediction_uuid),
                             "{}-messages.json.csv".format(extended_prediction_uuid),
                             "{}-messages.json".format(extended_prediction_uuid)):
            try:
                os.remove(scratch_file)
            except OSError:
                pass

        # predict() returns an array-like; collapse to a plain bool as before.
        if is_extremist == True:
            return True
        else:
            return False
|
78 |
+
|
python-streamer.py
ADDED
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gc
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
from datetime import date
|
5 |
+
from pathlib import Path
|
6 |
+
|
7 |
+
import unicodedata
|
8 |
+
|
9 |
+
consumer_token = os.getenv('CONSUMER_TOKEN')
|
10 |
+
consumer_secret = os.getenv('CONSUMER_SECRET')
|
11 |
+
my_access_token = os.getenv('ACCESS_TOKEN')
|
12 |
+
my_access_secret = os.getenv('ACCESS_SECRET')
|
13 |
+
bearer = os.getenv('BEARER')
|
14 |
+
|
15 |
+
import time
|
16 |
+
import tweepy
|
17 |
+
from googletrans import Translator
|
18 |
+
|
19 |
+
from predictor import predictor
|
20 |
+
|
21 |
+
class grapher():
    """
    A wrapper class used for generating a graph for interactions between users
    """
    graph = None

    def __init__(self):
        """
        Constructor.
        """
        # NOTE(review): `Graph` is not imported anywhere in this file — it
        # looks like igraph.Graph; confirm the import before running this
        # module standalone.
        self.graph = Graph()

    def _get_or_create_vertex(self, label):
        """Return the vertex whose label matches ``label`` (capitalised),
        creating a new vertex with a sequential ``id`` if none exists yet.

        :param label: user name to look up (matched case-insensitively via
            ``str.capitalize``).
        :return: the matching (or newly created) igraph vertex.
        """
        wanted = label.capitalize()
        found = None
        for vertex in self.graph.vs:
            if vertex["label"] == wanted:
                found = vertex

        if found is None:
            self.graph.add_vertices(1)
            node_count = self.graph.vcount()
            self.graph.vs[node_count - 1]["id"] = node_count - 1
            self.graph.vs[node_count - 1]["label"] = wanted
            found = self.graph.vs[node_count - 1]

        return found

    def add_edge_wrapper(self, node_1_name, node_2_name, weight=1, relationship=None):
        """
        A wrapper function used to add an edge connection or node.
        :param node_1_name: from
        :param node_2_name: to
        :param weight: currently unused; kept for interface compatibility
        :param relationship: currently unused; kept for interface compatibility
        :return:
        """
        # Look up (or create) both endpoints, then connect them.
        node_1 = self._get_or_create_vertex(node_1_name)
        node_2 = self._get_or_create_vertex(node_2_name)
        self.graph.add_edges([(node_1["id"], node_2["id"])])

    def add_node(self, node_name):
        """
        A wrapper function that adds a node with no edges to the graph
        :param node_name:
        """
        self._get_or_create_vertex(node_name)
|
94 |
+
|
95 |
+
# Unused placeholder for a shared OAuth1 handler.
global_oauth1_user_handler = None

# Twitter API v1.1 client (user-context OAuth 1.0a); credentials come from
# the CONSUMER_*/ACCESS_* environment variables read at the top of the file.
auth = tweepy.OAuth1UserHandler(
    consumer_token, consumer_secret,
    my_access_token, my_access_secret
)
api = tweepy.API(auth)

# Twitter API v2 client used for user/tweet lookups in the stream handler.
client = tweepy.Client(
    bearer_token= bearer,
    consumer_key=consumer_token,
    consumer_secret=consumer_secret,
    access_token=my_access_token,
    access_token_secret=my_access_secret
)
|
110 |
+
|
111 |
+
|
112 |
+
|
113 |
+
|
114 |
+
class IDPrinter(tweepy.StreamingClient):
    """Streaming client that classifies sampled tweets and records authors
    whose recent timeline crosses an extremism threshold."""

    def on_tweet(self, tweet):
        """Handle one sampled tweet: language-gate it, classify it, and — if
        flagged — re-score the author's last 10 tweets, logging the author to
        a dated file in ``users/`` when the flagged proportion is high enough.

        :param tweet: the incoming tweepy Tweet object from the sample stream.
        """
        self.translator = Translator()
        gc.collect()
        # Only consider reasonably long tweets.
        if len(tweet.data["text"]) > 100:
            if tweet and tweet.data:
                if tweet.data["author_id"]:
                    tweet_data = tweet.data["text"].strip().replace("@", "").replace("\n", "")
                    # Bug fix: the original condition was
                    # `tweet_data is not None or tweet != ""`, which compared
                    # the tweet *object* to "" and was always true; test the
                    # cleaned text itself instead.
                    if tweet_data:
                        username = client.get_user(id=tweet.author_id).data
                        lang = self.translator.detect(tweet_data).lang

                        if lang == "en":
                            # Strip non-ASCII characters before classification.
                            tweet_data = unicodedata.normalize('NFKD', tweet_data).encode('ascii', 'ignore').decode()
                            # NOTE(review): constructing predictor() per tweet
                            # re-initialises the whole model every call — very
                            # expensive; consider hoisting to a shared instance.
                            is_extremist = predictor().predict(tweet_data)
                            print("user {} post extremist {} - message: {}".format(username, is_extremist, str(tweet_data)))
                            if is_extremist is not None and is_extremist == 1:
                                # The single tweet was flagged; score the
                                # author's recent timeline for a ratio.
                                tweets = client.get_users_tweets(id=tweet.author_id, max_results=10)

                                number_extreme = 0
                                tweets = tweets[0]
                                for users_tweet in tweets:
                                    if users_tweet.text is not None:
                                        if predictor().predict(users_tweet.text) == True:
                                            number_extreme = number_extreme + 1

                                print(number_extreme)
                                # Bug fix: `tweets` was reassigned to the list
                                # of timeline tweets above, so the denominator
                                # is len(tweets); the original's len(tweets[0])
                                # measured the first tweet object instead.
                                threshold = number_extreme / len(tweets) * 100
                                print("Threshold {}".format(threshold))
                                if threshold > 1:
                                    file_name = os.path.join("users", "{}-radical_users.txt".format(date.today().strftime("%b-%d-%Y")))
                                    print("User {} was found to be extremist".format(username))
                                    file_path = Path(file_name)
                                    file_path.touch(exist_ok=True)

                                    # Append this author to today's report file.
                                    with open(file_name, 'a+') as outfile:
                                        json_to_dump = [{"username": username.id, "threshold": threshold, "date": date.today().strftime("%b-%d-%Y")}]
                                        json.dump(json_to_dump, outfile, indent=4)
                                    print("Got user {}".format(username))

        gc.collect()
|
162 |
+
# calling the api
|
163 |
+
|
164 |
+
|
165 |
+
# Run the sample stream forever; on failure (typically a rate limit or a
# transient network error) back off for 15 minutes before reconnecting.
while True:
    try:
        printer = IDPrinter(bearer_token=bearer, wait_on_rate_limit=True, chunk_size=10000)
        printer.add_rules(tweepy.StreamRule(value="en", tag="lang", id="lang-rule"))
        # Blocking call: consumes the sampled-tweet stream via on_tweet().
        printer.sample(expansions=["author_id", "geo.place_id"], threaded=False)
        print("-" * 20)
        gc.collect()
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit can
        # still terminate the process instead of being swallowed by the retry.
        time.sleep(900)
|
sign-in.png
ADDED
swears/VIOLENT_TERRORIST_WORDS.txt
ADDED
@@ -0,0 +1 @@
|
|
|
1 |
+
["Alert","Aim","Automatic","Anguish","Agitator","Apartheid","Agency","Aircraft","Airplane","Acid","Airport","Aerial","Assassinate","Account","Arms","Assault","Ambush","Anarchy","Authority","Aggressor","Allies","Alarm","Ashore","Atrocity","Artillery","Airfield","Annihilate","Appeasement","Arsenal","Attrition","Aggression","Armory","Ammunition","Advance","Assassin","Armedforces","Alliance","Attack","Armament","Bloodletting","Bulletproof","Brutal","Betray","Betrayal","Blood(y)","Boobytrap","Bombardment","Battalion","Bullet","Brute","Burn","Brutality","Bully","Blowup","Bunker","Booby trap","Blast","Bomb","Breach","Belligerent","Battle","Bury","Bloody","Blood","Blindside","Burning","Barrage","Barricade","Battlefield","Break","Conspiracy","Clash","Conspire","Coordinate","Civilian","Cautionary","Chief","Coalition","Camouflage","Captive","Coordinates","Corps","Carrier","Control","Concentration","Carnage","Conquer","Clamor","Compassion","Compliance","Crash","Checkpoint","Clandestine","Chopper","Confrontation","Causes","Countermand","Conflict","Crime","Counterattack","Courageous","Chaos","Commandos","Casualties","Confrontation(al)","Cautious","Consequences","Consolidate","Convoy","Checking","Crisis","Confusion","Cataclysm","Careen","Command(or)","Combat","Charred","Collapse","Cross-hairs","Capture","Culpability","Corpse","Cargo","Cadaver","Charge","Concussion","Campaign","Conflagration","Deliberate","Devastation","Discipline","Disperse","Dispatch","Dead","Death","Defensive","Dominate","Drone","Detect","Danger","Detection","Deploy","Detonate","Destruction","Demolish","Demoralize","Damage","Defend","Deception","Drama","Disaster","Dictator","Despot","Disease","Device","Domination","Duck","Duty","Debris","Dash","Decline","Defiant","Dictatorship","Defect","Doom","Disastrous","Division","Die","Downfall","Dispute","Desert","Disruption","Disarray","Dissonance","Dread","Defense","Dismantle","Dangerous","Deadly","Destroy","Demoralization","Debacle","Disarmament","Enemy","Expunge","Evac
uate","Escalate","Explosion","Execute","Excess","Extremism","Evacuee","Explosive","Execution","Epithet","Exploitation","Enforce","Exercise","Explode","Expectations","Encounter","Engagement","Escape","Escalation","Enforcement","Endurance","Force(s)","Faction","Force","Fierce","Flight","Fortification","Flank","Ferment","Frenzy","Feud","Front lines","Fray","Fear","Fearless","Felon","Fugitive","Fright","Forceful","Furtive","Fuel","Fighter","Fanatic","Fiery","Fearful","Forces","Flee","Fatal","Frontlines","Foxhole","Ferocious","Fight","Gas","Germ warfare","Grenade","Guided bombs","Grave","Gang up on","Garrison","Guard","Generator","Germwarfare","Groans","Gunship","Government","Gang","Genocide","Grievous","Guerrillas","Guidedbombs","Guns","Hazard","Harass","Heroic","Hide","Hostility","Horses","Horror","Horrific","Harsh","Hit","Hiding","Helicopter","Heroism","Hijack","Hostile","Hijacker","Hatred","Hit-and-run","Howitzer","Hurt","Hatch","Holocaust","Hammering","Hate","Involvement","International","Interdiction","Infanticide","Ire","Invasion","Incident","Interrogation","Ignite","Instructions","Intimidate","Insurrection","Inflame","Inferred","Intense","Incontrovertible","Impact","Informant","Investigate","Intelligence","Improvise","Incite","Intercept","Infantry","Investigations","Infiltrate","Injuries","Inmate","Intervene","Insurgent","Jail","Join","Jets","Jeer","Knock-out","Keening","Knife","Kamikaze","Kidnap","Knives","Keen","Kill","Killing","Lamentation","Legacy","Liaison","Loathsome","Loyalty","Landmines","Laser-activated","Liberation","Linksto","Launcher","Liberators","Launch","Method","Militaristic","Mobile","Militant","Massacre","Menace","Malicious","Military","Momentum","Mines","Militancy","Maim","Militia","Mob","Mobilization","Machines","Mortars","Machineguns","March","Megalomania","Mission","Mayhem","Muscle","Murder","Missile","Mistreatment","Malevolent","Munitions","Maraud","Notorious","Nationalist","Negotiation","Nightmare","Nitrate","Neutralize","Overthrow","Onero
us","Out of control","Operation","Officials","Offensive","Order","Overrun","Opposition","Outbreak","Planes","Prisoner","Pilot","Prowl","Post-traumatic","Pugnacious","Partisan","Premeditate","Prey","Patriotism","Plunder","Paramedics","Platoon","Potent","Powder","Power","Pacify","Persecute","Penetration","Pound","Provocation","Pistol","Performance","Patriot","Proliferation","Penetrate","Pushing","Pulverize","Preemptive","Petrify","Prison","Perform","Position","Photos","Patrol","Powerful","Quarrel","Quail","Quiver","Quell","Rally","Refugee","Revenge","Radical","Reputation","Retreat","Ravish","Revolution","Retribution","Radiation","Relentless","Rift","Rule","Resistance","Rounds","Recovery","Rebellion","Reparation","Retaliation","Reaction","Readiness","Recruitment","Reconnaissance","Regiment","Rot","Recruit","Reinforcements","Reprisal","Rival","Ricochet","Ravage","Rocket","Ruthless","Rescue","Rage","Rebel","Rifle","Riot","Regime","Shot","Strategy","Smash","Survival","Survivor","Showdown","Supplies","Sacrifice","Stronghold","Surrender","Storage","Salvage","Sanction","Strength","Surprise","Security","Seize","Secrecy","Seizure","Strife","Siege","Sensor","Secret","Stash","Scramble","Storm","Shock","Shells","Sedition","Skirmish","Strip","Suppression","Strangle","Special-ops","Shoot","Smuggle","Slaughter","Score","Sabotage","Spokesman","Soldier","Savage","Superstition","Suffering","Squad","Strategist","Specialized","Stalk","Struggle","Straggler","Subversive","Support","Stealth","Spysatellite","Strategic","Shelling","Spy","Screening","Strike","Setback","Spotter","Scare","Spy 
satellite","Submarine","Tsunami","Tactics","Triumph","Training","Tragic","Trauma","Torch","Terrorism","Threat","Terrorize","Thug","Torpedo","Tension","Turbulent","Tornado","Trigger","Trench","Tank","Terror","Topple","Tourniquet","Target","Terrain","Thwart","Treachery","Transportation","Trample","Trap","Terrorist","Threaten","Uprising","Urgency","Unruly","Unite","Unleash","Unify","Unit","Unexpected","Unbelievable","Uniform","Unconventional","Vociferous","Virulence","Violence","Vulnerability","Vow","Venomous","Victory","Vanguard","Vehicular","Vital","Vicious","Violation","Vanish","Veteran","Vehicle","Void","Vile","Vitriol","Vagrant","Vilify","Vendetta","Watchful","Warnings","Weather","Watchlist","Wince","Warplane","Watchdog","Weapon","Well-trained","Worldwide","Wreckage","Wage","Wound","Warrior","Wounds","Whiz","Warrant","Warheads","War","Wisdom","X-ray","Yearn","Yelling","Zigzag","Zeal","Zealot","Zone","pedophile","child molester","demonic","scumbag","fucking","demon-god","daemon"]
|
swears/bad_Words_list.txt
ADDED
@@ -0,0 +1,547 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
buttmuch
|
2 |
+
snatch
|
3 |
+
titfuck
|
4 |
+
motherfucker
|
5 |
+
s.o.b.
|
6 |
+
knob end
|
7 |
+
clitty litter
|
8 |
+
nobhead
|
9 |
+
fags
|
10 |
+
booobs
|
11 |
+
cum
|
12 |
+
ejaculation
|
13 |
+
fook
|
14 |
+
damn
|
15 |
+
piss
|
16 |
+
motherfuckin
|
17 |
+
fingerfucked
|
18 |
+
fingerfuckers
|
19 |
+
beef curtain
|
20 |
+
xrated
|
21 |
+
a55
|
22 |
+
fatass
|
23 |
+
fcuking
|
24 |
+
pricks
|
25 |
+
nob
|
26 |
+
mothafucka
|
27 |
+
blowjobs
|
28 |
+
shitings
|
29 |
+
t1tt1e5
|
30 |
+
b!tch
|
31 |
+
pimpis
|
32 |
+
wtf
|
33 |
+
boner
|
34 |
+
gangbang
|
35 |
+
numbnuts
|
36 |
+
need the dick
|
37 |
+
testicle
|
38 |
+
50 yard cunt punt
|
39 |
+
booooooobs
|
40 |
+
shittings
|
41 |
+
fist fuck
|
42 |
+
cuntlick
|
43 |
+
ass-fucker
|
44 |
+
muthafuckker
|
45 |
+
sh1t
|
46 |
+
fistfucker
|
47 |
+
goddamn
|
48 |
+
porn
|
49 |
+
bang (one's) box
|
50 |
+
pisses
|
51 |
+
cop some wood
|
52 |
+
dinks
|
53 |
+
master-bate
|
54 |
+
son-of-a-bitch
|
55 |
+
pussies
|
56 |
+
f u c k e r
|
57 |
+
bum
|
58 |
+
cum dumpster
|
59 |
+
cunts
|
60 |
+
niggers
|
61 |
+
carpetmuncher
|
62 |
+
coksucka
|
63 |
+
cyberfuck
|
64 |
+
fuckme
|
65 |
+
masterb8
|
66 |
+
nigga
|
67 |
+
fucks
|
68 |
+
fuckhead
|
69 |
+
fag
|
70 |
+
mof0
|
71 |
+
birdlock
|
72 |
+
clit licker
|
73 |
+
niggaz
|
74 |
+
fuckwhit
|
75 |
+
shitey
|
76 |
+
m0fo
|
77 |
+
fukwit
|
78 |
+
fanyy
|
79 |
+
autoerotic
|
80 |
+
cocksucking
|
81 |
+
mothafucker
|
82 |
+
lusting
|
83 |
+
vagina
|
84 |
+
tits
|
85 |
+
ejaculates
|
86 |
+
arsehole
|
87 |
+
cocksuka
|
88 |
+
fux0r
|
89 |
+
cunt
|
90 |
+
facial
|
91 |
+
w00se
|
92 |
+
phuking
|
93 |
+
pussy fart
|
94 |
+
cumshot
|
95 |
+
jiz
|
96 |
+
nobjokey
|
97 |
+
bellend
|
98 |
+
motherfuckings
|
99 |
+
scroat
|
100 |
+
assfucker
|
101 |
+
heshe
|
102 |
+
rectum
|
103 |
+
knob
|
104 |
+
phukking
|
105 |
+
knobhead
|
106 |
+
fcuk
|
107 |
+
queaf
|
108 |
+
fucka
|
109 |
+
donkeyribber
|
110 |
+
nazi
|
111 |
+
sadism
|
112 |
+
cum freak
|
113 |
+
lust
|
114 |
+
mafugly
|
115 |
+
kondum
|
116 |
+
amateur
|
117 |
+
carpet muncher
|
118 |
+
nigg4h
|
119 |
+
tw4t
|
120 |
+
asses
|
121 |
+
mothafuckings
|
122 |
+
kums
|
123 |
+
shite
|
124 |
+
duche
|
125 |
+
cockmunch
|
126 |
+
anilingus
|
127 |
+
shitted
|
128 |
+
shitty
|
129 |
+
masterbations
|
130 |
+
dink
|
131 |
+
cummer
|
132 |
+
jism
|
133 |
+
bastard
|
134 |
+
fuckheads
|
135 |
+
shagger
|
136 |
+
coon
|
137 |
+
feck
|
138 |
+
scrotum
|
139 |
+
cyberfucked
|
140 |
+
kawk
|
141 |
+
v1gra
|
142 |
+
muthafecker
|
143 |
+
fudge packer
|
144 |
+
twat
|
145 |
+
a_s_s
|
146 |
+
how to kill
|
147 |
+
kwif
|
148 |
+
jack-off
|
149 |
+
fagots
|
150 |
+
kinky jesus
|
151 |
+
horniest
|
152 |
+
jerk-off
|
153 |
+
mo-fo
|
154 |
+
phuk
|
155 |
+
pissin
|
156 |
+
god damn
|
157 |
+
fukkin
|
158 |
+
cock pocket
|
159 |
+
schlong
|
160 |
+
ejaculatings
|
161 |
+
nutsack
|
162 |
+
bitch tit
|
163 |
+
cocks
|
164 |
+
c0cksucker
|
165 |
+
cuntlicker
|
166 |
+
4r5e
|
167 |
+
dick
|
168 |
+
jap
|
169 |
+
cyberfucker
|
170 |
+
cock snot
|
171 |
+
cyalis
|
172 |
+
knobend
|
173 |
+
cox
|
174 |
+
fuck yo mama
|
175 |
+
gangbangs
|
176 |
+
crap
|
177 |
+
mother fucker
|
178 |
+
retard
|
179 |
+
hell
|
180 |
+
whoar
|
181 |
+
gang-bang
|
182 |
+
cunilingus
|
183 |
+
slut bucket
|
184 |
+
muther
|
185 |
+
fukker
|
186 |
+
d1ck
|
187 |
+
dick shy
|
188 |
+
fellate
|
189 |
+
fuk
|
190 |
+
shitfuck
|
191 |
+
phukked
|
192 |
+
clits
|
193 |
+
fooker
|
194 |
+
ham flap
|
195 |
+
p0rn
|
196 |
+
a2m
|
197 |
+
fuck hole
|
198 |
+
jizz
|
199 |
+
pissers
|
200 |
+
fuck puppet
|
201 |
+
orgasms
|
202 |
+
titties
|
203 |
+
cornhole
|
204 |
+
bugger
|
205 |
+
sh!t
|
206 |
+
bollock
|
207 |
+
wanky
|
208 |
+
nobjocky
|
209 |
+
twunt
|
210 |
+
cum guzzler
|
211 |
+
cl1t
|
212 |
+
felching
|
213 |
+
dlck
|
214 |
+
bunny fucker
|
215 |
+
spunk
|
216 |
+
fukwhit
|
217 |
+
tittywank
|
218 |
+
hoer
|
219 |
+
masterbat3
|
220 |
+
bitching
|
221 |
+
nigger
|
222 |
+
shaggin
|
223 |
+
god-dam
|
224 |
+
sluts
|
225 |
+
arse
|
226 |
+
biatch
|
227 |
+
fellatio
|
228 |
+
boiolas
|
229 |
+
mutha
|
230 |
+
fanny
|
231 |
+
ar5e
|
232 |
+
nob jokey
|
233 |
+
hoare
|
234 |
+
dyke
|
235 |
+
tittyfuck
|
236 |
+
buttplug
|
237 |
+
doggin
|
238 |
+
twunter
|
239 |
+
niggah
|
240 |
+
motherfucked
|
241 |
+
masterbation
|
242 |
+
fucker
|
243 |
+
mothafucking
|
244 |
+
skank
|
245 |
+
pissoff
|
246 |
+
sandbar
|
247 |
+
flange
|
248 |
+
dildos
|
249 |
+
choade
|
250 |
+
pawn
|
251 |
+
buceta
|
252 |
+
cocksucker
|
253 |
+
ass
|
254 |
+
dick hole
|
255 |
+
fingerfucks
|
256 |
+
wank
|
257 |
+
butt
|
258 |
+
bitcher
|
259 |
+
cockface
|
260 |
+
shi+
|
261 |
+
m0f0
|
262 |
+
pissing
|
263 |
+
motherfucking
|
264 |
+
bestiality
|
265 |
+
pissed
|
266 |
+
slut
|
267 |
+
blumpkin
|
268 |
+
shemale
|
269 |
+
niggas
|
270 |
+
asshole
|
271 |
+
xxx
|
272 |
+
mothafuck
|
273 |
+
mothafuckin
|
274 |
+
teez
|
275 |
+
fecker
|
276 |
+
lmfao
|
277 |
+
fistfuckers
|
278 |
+
clit
|
279 |
+
c0ck
|
280 |
+
shitter
|
281 |
+
fingerfucker
|
282 |
+
fuckwit
|
283 |
+
boobs
|
284 |
+
bestial
|
285 |
+
adult
|
286 |
+
masturbate
|
287 |
+
gaylord
|
288 |
+
b1tch
|
289 |
+
mothafuckers
|
290 |
+
sh!+
|
291 |
+
cokmuncher
|
292 |
+
tittiefucker
|
293 |
+
pigfucker
|
294 |
+
cockhead
|
295 |
+
vulva
|
296 |
+
shitfull
|
297 |
+
turd
|
298 |
+
shag
|
299 |
+
dog-fucker
|
300 |
+
fucktoy
|
301 |
+
kunilingus
|
302 |
+
l3itch
|
303 |
+
fuckingshitmotherfucker
|
304 |
+
f u c k
|
305 |
+
mothafucked
|
306 |
+
bi+ch
|
307 |
+
fuckings
|
308 |
+
blow job
|
309 |
+
willies
|
310 |
+
god
|
311 |
+
bitches
|
312 |
+
phuck
|
313 |
+
cuntlicking
|
314 |
+
knobead
|
315 |
+
jizm
|
316 |
+
penis
|
317 |
+
shit
|
318 |
+
bareback
|
319 |
+
breasts
|
320 |
+
balls
|
321 |
+
fingerfuck
|
322 |
+
erotic
|
323 |
+
motherfuckers
|
324 |
+
mutherfucker
|
325 |
+
phonesex
|
326 |
+
screwing
|
327 |
+
assmucus
|
328 |
+
bangbros
|
329 |
+
cocksucks
|
330 |
+
chink
|
331 |
+
ejakulate
|
332 |
+
gassy ass
|
333 |
+
tosser
|
334 |
+
fucking
|
335 |
+
m45terbate
|
336 |
+
horny
|
337 |
+
assholes
|
338 |
+
fuckmeat
|
339 |
+
fux
|
340 |
+
hardcoresex
|
341 |
+
pussy
|
342 |
+
anus
|
343 |
+
mothafucks
|
344 |
+
dickhead
|
345 |
+
t1tties
|
346 |
+
cunillingus
|
347 |
+
cuntbag
|
348 |
+
bitchers
|
349 |
+
boooobs
|
350 |
+
pube
|
351 |
+
hoar
|
352 |
+
n1gger
|
353 |
+
phuks
|
354 |
+
pecker
|
355 |
+
hotsex
|
356 |
+
cum chugger
|
357 |
+
scrote
|
358 |
+
rimjaw
|
359 |
+
pisser
|
360 |
+
homo
|
361 |
+
fagot
|
362 |
+
goatse
|
363 |
+
phuq
|
364 |
+
tit wank
|
365 |
+
testical
|
366 |
+
busty
|
367 |
+
blow me
|
368 |
+
bitchin
|
369 |
+
how to murdep
|
370 |
+
ma5terb8
|
371 |
+
5hit
|
372 |
+
cocksukka
|
373 |
+
tittie5
|
374 |
+
faggs
|
375 |
+
eat hair pie
|
376 |
+
fuker
|
377 |
+
blowjob
|
378 |
+
b17ch
|
379 |
+
cok
|
380 |
+
shagging
|
381 |
+
doggie style
|
382 |
+
prick
|
383 |
+
goddamned
|
384 |
+
labia
|
385 |
+
eat a dick
|
386 |
+
kummer
|
387 |
+
pusse
|
388 |
+
fucked
|
389 |
+
smegma
|
390 |
+
anal leakage
|
391 |
+
cocksucked
|
392 |
+
teets
|
393 |
+
penisfucker
|
394 |
+
cawk
|
395 |
+
knobjokey
|
396 |
+
l3i+ch
|
397 |
+
arrse
|
398 |
+
jerk
|
399 |
+
beastial
|
400 |
+
muff
|
401 |
+
pussi
|
402 |
+
cums
|
403 |
+
shitters
|
404 |
+
knobed
|
405 |
+
v14gra
|
406 |
+
cunt-struck
|
407 |
+
fingerfucking
|
408 |
+
anal impaler
|
409 |
+
len
|
410 |
+
blue waffle
|
411 |
+
kumming
|
412 |
+
doosh
|
413 |
+
fagging
|
414 |
+
fuck-bitch
|
415 |
+
pussys
|
416 |
+
fuck-ass
|
417 |
+
f4nny
|
418 |
+
cyberfucking
|
419 |
+
shitting
|
420 |
+
faggot
|
421 |
+
hore
|
422 |
+
cumming
|
423 |
+
assfukka
|
424 |
+
asswhole
|
425 |
+
fannyflaps
|
426 |
+
orgasim
|
427 |
+
fuck
|
428 |
+
n1gga
|
429 |
+
pornography
|
430 |
+
shits
|
431 |
+
poop
|
432 |
+
masochist
|
433 |
+
ejaculate
|
434 |
+
s hit
|
435 |
+
ass fuck
|
436 |
+
cyberfuc
|
437 |
+
motherfucks
|
438 |
+
cock
|
439 |
+
dirsa
|
440 |
+
whore
|
441 |
+
willy
|
442 |
+
dirty sanchez
|
443 |
+
god-damned
|
444 |
+
cunnilingus
|
445 |
+
fistfucked
|
446 |
+
mofo
|
447 |
+
clitoris
|
448 |
+
dildo
|
449 |
+
twathead
|
450 |
+
sex
|
451 |
+
homoerotic
|
452 |
+
cyberfuckers
|
453 |
+
sausage queen
|
454 |
+
titt
|
455 |
+
boob
|
456 |
+
cipa
|
457 |
+
tit
|
458 |
+
queer
|
459 |
+
kock
|
460 |
+
mothafuckas
|
461 |
+
mothafuckaz
|
462 |
+
gaysex
|
463 |
+
motherfuck
|
464 |
+
beastiality
|
465 |
+
ma5terbate
|
466 |
+
clusterfuck
|
467 |
+
muff puff
|
468 |
+
kum
|
469 |
+
dogging
|
470 |
+
cut rope
|
471 |
+
smut
|
472 |
+
b00bs
|
473 |
+
ballsack
|
474 |
+
chota bags
|
475 |
+
5h1t
|
476 |
+
bloody
|
477 |
+
slope
|
478 |
+
masterbate
|
479 |
+
fistfuckings
|
480 |
+
semen
|
481 |
+
cnut
|
482 |
+
wang
|
483 |
+
cockmuncher
|
484 |
+
masterbat*
|
485 |
+
lmao
|
486 |
+
bust a load
|
487 |
+
fuckers
|
488 |
+
cuntsicle
|
489 |
+
fistfuck
|
490 |
+
fuck trophy
|
491 |
+
pornos
|
492 |
+
sadist
|
493 |
+
bollok
|
494 |
+
cocksuck
|
495 |
+
flog the log
|
496 |
+
fistfucks
|
497 |
+
ejaculated
|
498 |
+
f_u_c_k
|
499 |
+
porno
|
500 |
+
kondums
|
501 |
+
booooobs
|
502 |
+
fannyfucker
|
503 |
+
phuked
|
504 |
+
fuckin
|
505 |
+
shithead
|
506 |
+
fcuker
|
507 |
+
motherfuckka
|
508 |
+
pron
|
509 |
+
s_h_i_t
|
510 |
+
knobjocky
|
511 |
+
shiting
|
512 |
+
ejaculating
|
513 |
+
cock-sucker
|
514 |
+
cunt hair
|
515 |
+
viagra
|
516 |
+
bimbos
|
517 |
+
shit fucker
|
518 |
+
ballbag
|
519 |
+
assmunch
|
520 |
+
shited
|
521 |
+
doggiestyle
|
522 |
+
wanker
|
523 |
+
orgasims
|
524 |
+
twatty
|
525 |
+
titwank
|
526 |
+
omg
|
527 |
+
butt fuck
|
528 |
+
fudgepacker
|
529 |
+
nut butter
|
530 |
+
shitdick
|
531 |
+
pissflaps
|
532 |
+
fistfucking
|
533 |
+
blow mud
|
534 |
+
rimming
|
535 |
+
orgasm
|
536 |
+
corp whore
|
537 |
+
faggitt
|
538 |
+
cumdump
|
539 |
+
butthole
|
540 |
+
jackoff
|
541 |
+
nigg3r
|
542 |
+
spac
|
543 |
+
fuks
|
544 |
+
pussy palace
|
545 |
+
gangbanged
|
546 |
+
anal
|
547 |
+
bitch
|
swears/badwords.txt
ADDED
@@ -0,0 +1,451 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
4r5e
|
2 |
+
5h1t
|
3 |
+
5hit
|
4 |
+
a55
|
5 |
+
anal
|
6 |
+
anus
|
7 |
+
ar5e
|
8 |
+
arrse
|
9 |
+
arse
|
10 |
+
ass
|
11 |
+
ass-fucker
|
12 |
+
asses
|
13 |
+
assfucker
|
14 |
+
assfukka
|
15 |
+
asshole
|
16 |
+
assholes
|
17 |
+
asswhole
|
18 |
+
a_s_s
|
19 |
+
b!tch
|
20 |
+
b00bs
|
21 |
+
b17ch
|
22 |
+
b1tch
|
23 |
+
ballbag
|
24 |
+
balls
|
25 |
+
ballsack
|
26 |
+
bastard
|
27 |
+
beastial
|
28 |
+
beastiality
|
29 |
+
bellend
|
30 |
+
bestial
|
31 |
+
bestiality
|
32 |
+
bi+ch
|
33 |
+
biatch
|
34 |
+
bitch
|
35 |
+
bitcher
|
36 |
+
bitchers
|
37 |
+
bitches
|
38 |
+
bitchin
|
39 |
+
bitching
|
40 |
+
bloody
|
41 |
+
blow job
|
42 |
+
blowjob
|
43 |
+
blowjobs
|
44 |
+
boiolas
|
45 |
+
bollock
|
46 |
+
bollok
|
47 |
+
boner
|
48 |
+
boob
|
49 |
+
boobs
|
50 |
+
booobs
|
51 |
+
boooobs
|
52 |
+
booooobs
|
53 |
+
booooooobs
|
54 |
+
breasts
|
55 |
+
buceta
|
56 |
+
bugger
|
57 |
+
bum
|
58 |
+
bunny fucker
|
59 |
+
butt
|
60 |
+
butthole
|
61 |
+
buttmuch
|
62 |
+
buttplug
|
63 |
+
c0ck
|
64 |
+
c0cksucker
|
65 |
+
carpet muncher
|
66 |
+
cawk
|
67 |
+
chink
|
68 |
+
cipa
|
69 |
+
cl1t
|
70 |
+
clit
|
71 |
+
clitoris
|
72 |
+
clits
|
73 |
+
cnut
|
74 |
+
cock
|
75 |
+
cock-sucker
|
76 |
+
cockface
|
77 |
+
cockhead
|
78 |
+
cockmunch
|
79 |
+
cockmuncher
|
80 |
+
cocks
|
81 |
+
cocksuck
|
82 |
+
cocksucked
|
83 |
+
cocksucker
|
84 |
+
cocksucking
|
85 |
+
cocksucks
|
86 |
+
cocksuka
|
87 |
+
cocksukka
|
88 |
+
cok
|
89 |
+
cokmuncher
|
90 |
+
coksucka
|
91 |
+
coon
|
92 |
+
cox
|
93 |
+
crap
|
94 |
+
cum
|
95 |
+
cummer
|
96 |
+
cumming
|
97 |
+
cums
|
98 |
+
cumshot
|
99 |
+
cunilingus
|
100 |
+
cunillingus
|
101 |
+
cunnilingus
|
102 |
+
cunt
|
103 |
+
cuntlick
|
104 |
+
cuntlicker
|
105 |
+
cuntlicking
|
106 |
+
cunts
|
107 |
+
cyalis
|
108 |
+
cyberfuc
|
109 |
+
cyberfuck
|
110 |
+
cyberfucked
|
111 |
+
cyberfucker
|
112 |
+
cyberfuckers
|
113 |
+
cyberfucking
|
114 |
+
d1ck
|
115 |
+
damn
|
116 |
+
dick
|
117 |
+
dickhead
|
118 |
+
dildo
|
119 |
+
dildos
|
120 |
+
dink
|
121 |
+
dinks
|
122 |
+
dirsa
|
123 |
+
dlck
|
124 |
+
dog-fucker
|
125 |
+
doggin
|
126 |
+
dogging
|
127 |
+
donkeyribber
|
128 |
+
doosh
|
129 |
+
duche
|
130 |
+
dyke
|
131 |
+
ejaculate
|
132 |
+
ejaculated
|
133 |
+
ejaculates
|
134 |
+
ejaculating
|
135 |
+
ejaculatings
|
136 |
+
ejaculation
|
137 |
+
ejakulate
|
138 |
+
f u c k
|
139 |
+
f u c k e r
|
140 |
+
f4nny
|
141 |
+
fag
|
142 |
+
fagging
|
143 |
+
faggitt
|
144 |
+
faggot
|
145 |
+
faggs
|
146 |
+
fagot
|
147 |
+
fagots
|
148 |
+
fags
|
149 |
+
fanny
|
150 |
+
fannyflaps
|
151 |
+
fannyfucker
|
152 |
+
fanyy
|
153 |
+
fatass
|
154 |
+
fcuk
|
155 |
+
fcuker
|
156 |
+
fcuking
|
157 |
+
feck
|
158 |
+
fecker
|
159 |
+
felching
|
160 |
+
fellate
|
161 |
+
fellatio
|
162 |
+
fingerfuck
|
163 |
+
fingerfucked
|
164 |
+
fingerfucker
|
165 |
+
fingerfuckers
|
166 |
+
fingerfucking
|
167 |
+
fingerfucks
|
168 |
+
fistfuck
|
169 |
+
fistfucked
|
170 |
+
fistfucker
|
171 |
+
fistfuckers
|
172 |
+
fistfucking
|
173 |
+
fistfuckings
|
174 |
+
fistfucks
|
175 |
+
flange
|
176 |
+
fook
|
177 |
+
fooker
|
178 |
+
fuck
|
179 |
+
fucka
|
180 |
+
fucked
|
181 |
+
fucker
|
182 |
+
fuckers
|
183 |
+
fuckhead
|
184 |
+
fuckheads
|
185 |
+
fuckin
|
186 |
+
fucking
|
187 |
+
fuckings
|
188 |
+
fuckingshitmotherfucker
|
189 |
+
fuckme
|
190 |
+
fucks
|
191 |
+
fuckwhit
|
192 |
+
fuckwit
|
193 |
+
fudge packer
|
194 |
+
fudgepacker
|
195 |
+
fuk
|
196 |
+
fuker
|
197 |
+
fukker
|
198 |
+
fukkin
|
199 |
+
fuks
|
200 |
+
fukwhit
|
201 |
+
fukwit
|
202 |
+
fux
|
203 |
+
fux0r
|
204 |
+
f_u_c_k
|
205 |
+
gangbang
|
206 |
+
gangbanged
|
207 |
+
gangbangs
|
208 |
+
gaylord
|
209 |
+
gaysex
|
210 |
+
goatse
|
211 |
+
God
|
212 |
+
god-dam
|
213 |
+
god-damned
|
214 |
+
goddamn
|
215 |
+
goddamned
|
216 |
+
hardcoresex
|
217 |
+
hell
|
218 |
+
heshe
|
219 |
+
hoar
|
220 |
+
hoare
|
221 |
+
hoer
|
222 |
+
homo
|
223 |
+
hore
|
224 |
+
horniest
|
225 |
+
horny
|
226 |
+
hotsex
|
227 |
+
jack-off
|
228 |
+
jackoff
|
229 |
+
jap
|
230 |
+
jerk-off
|
231 |
+
jism
|
232 |
+
jiz
|
233 |
+
jizm
|
234 |
+
jizz
|
235 |
+
kawk
|
236 |
+
knob
|
237 |
+
knobead
|
238 |
+
knobed
|
239 |
+
knobend
|
240 |
+
knobhead
|
241 |
+
knobjocky
|
242 |
+
knobjokey
|
243 |
+
kock
|
244 |
+
kondum
|
245 |
+
kondums
|
246 |
+
kum
|
247 |
+
kummer
|
248 |
+
kumming
|
249 |
+
kums
|
250 |
+
kunilingus
|
251 |
+
l3i+ch
|
252 |
+
l3itch
|
253 |
+
labia
|
254 |
+
lmfao
|
255 |
+
lust
|
256 |
+
lusting
|
257 |
+
m0f0
|
258 |
+
m0fo
|
259 |
+
m45terbate
|
260 |
+
ma5terb8
|
261 |
+
ma5terbate
|
262 |
+
masochist
|
263 |
+
master-bate
|
264 |
+
masterb8
|
265 |
+
masterbat*
|
266 |
+
masterbat3
|
267 |
+
masterbate
|
268 |
+
masterbation
|
269 |
+
masterbations
|
270 |
+
masturbate
|
271 |
+
mo-fo
|
272 |
+
mof0
|
273 |
+
mofo
|
274 |
+
mothafuck
|
275 |
+
mothafucka
|
276 |
+
mothafuckas
|
277 |
+
mothafuckaz
|
278 |
+
mothafucked
|
279 |
+
mothafucker
|
280 |
+
mothafuckers
|
281 |
+
mothafuckin
|
282 |
+
mothafucking
|
283 |
+
mothafuckings
|
284 |
+
mothafucks
|
285 |
+
mother fucker
|
286 |
+
motherfuck
|
287 |
+
motherfucked
|
288 |
+
motherfucker
|
289 |
+
motherfuckers
|
290 |
+
motherfuckin
|
291 |
+
motherfucking
|
292 |
+
motherfuckings
|
293 |
+
motherfuckka
|
294 |
+
motherfucks
|
295 |
+
muff
|
296 |
+
mutha
|
297 |
+
muthafecker
|
298 |
+
muthafuckker
|
299 |
+
muther
|
300 |
+
mutherfucker
|
301 |
+
n1gga
|
302 |
+
n1gger
|
303 |
+
nazi
|
304 |
+
nigg3r
|
305 |
+
nigg4h
|
306 |
+
nigga
|
307 |
+
niggah
|
308 |
+
niggas
|
309 |
+
niggaz
|
310 |
+
nigger
|
311 |
+
niggers
|
312 |
+
nob
|
313 |
+
nob jokey
|
314 |
+
nobhead
|
315 |
+
nobjocky
|
316 |
+
nobjokey
|
317 |
+
numbnuts
|
318 |
+
nutsack
|
319 |
+
orgasim
|
320 |
+
orgasims
|
321 |
+
orgasm
|
322 |
+
orgasms
|
323 |
+
p0rn
|
324 |
+
pawn
|
325 |
+
pecker
|
326 |
+
penis
|
327 |
+
penisfucker
|
328 |
+
phonesex
|
329 |
+
phuck
|
330 |
+
phuk
|
331 |
+
phuked
|
332 |
+
phuking
|
333 |
+
phukked
|
334 |
+
phukking
|
335 |
+
phuks
|
336 |
+
phuq
|
337 |
+
pigfucker
|
338 |
+
pimpis
|
339 |
+
piss
|
340 |
+
pissed
|
341 |
+
pisser
|
342 |
+
pissers
|
343 |
+
pisses
|
344 |
+
pissflaps
|
345 |
+
pissin
|
346 |
+
pissing
|
347 |
+
pissoff
|
348 |
+
poop
|
349 |
+
porn
|
350 |
+
porno
|
351 |
+
pornography
|
352 |
+
pornos
|
353 |
+
prick
|
354 |
+
pricks
|
355 |
+
pron
|
356 |
+
pube
|
357 |
+
pusse
|
358 |
+
pussi
|
359 |
+
pussies
|
360 |
+
pussy
|
361 |
+
pussys
|
362 |
+
rectum
|
363 |
+
retard
|
364 |
+
rimjaw
|
365 |
+
rimming
|
366 |
+
s hit
|
367 |
+
s.o.b.
|
368 |
+
sadist
|
369 |
+
schlong
|
370 |
+
screwing
|
371 |
+
scroat
|
372 |
+
scrote
|
373 |
+
scrotum
|
374 |
+
semen
|
375 |
+
sex
|
376 |
+
sh!+
|
377 |
+
sh!t
|
378 |
+
sh1t
|
379 |
+
shag
|
380 |
+
shagger
|
381 |
+
shaggin
|
382 |
+
shagging
|
383 |
+
shemale
|
384 |
+
shi+
|
385 |
+
shit
|
386 |
+
shitdick
|
387 |
+
shite
|
388 |
+
shited
|
389 |
+
shitey
|
390 |
+
shitfuck
|
391 |
+
shitfull
|
392 |
+
shithead
|
393 |
+
shiting
|
394 |
+
shitings
|
395 |
+
shits
|
396 |
+
shitted
|
397 |
+
shitter
|
398 |
+
shitters
|
399 |
+
shitting
|
400 |
+
shittings
|
401 |
+
shitty
|
402 |
+
skank
|
403 |
+
slut
|
404 |
+
sluts
|
405 |
+
smegma
|
406 |
+
smut
|
407 |
+
snatch
|
408 |
+
son-of-a-bitch
|
409 |
+
spac
|
410 |
+
spunk
|
411 |
+
s_h_i_t
|
412 |
+
t1tt1e5
|
413 |
+
t1tties
|
414 |
+
teets
|
415 |
+
teez
|
416 |
+
testical
|
417 |
+
testicle
|
418 |
+
tit
|
419 |
+
titfuck
|
420 |
+
tits
|
421 |
+
titt
|
422 |
+
tittie5
|
423 |
+
tittiefucker
|
424 |
+
titties
|
425 |
+
tittyfuck
|
426 |
+
tittywank
|
427 |
+
titwank
|
428 |
+
tosser
|
429 |
+
turd
|
430 |
+
tw4t
|
431 |
+
twat
|
432 |
+
twathead
|
433 |
+
twatty
|
434 |
+
twunt
|
435 |
+
twunter
|
436 |
+
v14gra
|
437 |
+
v1gra
|
438 |
+
vagina
|
439 |
+
viagra
|
440 |
+
vulva
|
441 |
+
w00se
|
442 |
+
wang
|
443 |
+
wank
|
444 |
+
wanker
|
445 |
+
wanky
|
446 |
+
whoar
|
447 |
+
whore
|
448 |
+
willies
|
449 |
+
willy
|
450 |
+
xrated
|
451 |
+
xxx
|
swears/cmu-bad-words.txt
ADDED
@@ -0,0 +1,1383 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
abbo
|
2 |
+
abo
|
3 |
+
abortion
|
4 |
+
abuse
|
5 |
+
addict
|
6 |
+
addicts
|
7 |
+
adult
|
8 |
+
africa
|
9 |
+
african
|
10 |
+
alla
|
11 |
+
allah
|
12 |
+
alligatorbait
|
13 |
+
amateur
|
14 |
+
american
|
15 |
+
anal
|
16 |
+
analannie
|
17 |
+
analsex
|
18 |
+
angie
|
19 |
+
angry
|
20 |
+
anus
|
21 |
+
arab
|
22 |
+
arabs
|
23 |
+
areola
|
24 |
+
argie
|
25 |
+
aroused
|
26 |
+
arse
|
27 |
+
arsehole
|
28 |
+
asian
|
29 |
+
ass
|
30 |
+
assassin
|
31 |
+
assassinate
|
32 |
+
assassination
|
33 |
+
assault
|
34 |
+
assbagger
|
35 |
+
assblaster
|
36 |
+
assclown
|
37 |
+
asscowboy
|
38 |
+
asses
|
39 |
+
assfuck
|
40 |
+
assfucker
|
41 |
+
asshat
|
42 |
+
asshole
|
43 |
+
assholes
|
44 |
+
asshore
|
45 |
+
assjockey
|
46 |
+
asskiss
|
47 |
+
asskisser
|
48 |
+
assklown
|
49 |
+
asslick
|
50 |
+
asslicker
|
51 |
+
asslover
|
52 |
+
assman
|
53 |
+
assmonkey
|
54 |
+
assmunch
|
55 |
+
assmuncher
|
56 |
+
asspacker
|
57 |
+
asspirate
|
58 |
+
asspuppies
|
59 |
+
assranger
|
60 |
+
asswhore
|
61 |
+
asswipe
|
62 |
+
athletesfoot
|
63 |
+
attack
|
64 |
+
australian
|
65 |
+
babe
|
66 |
+
babies
|
67 |
+
backdoor
|
68 |
+
backdoorman
|
69 |
+
backseat
|
70 |
+
badfuck
|
71 |
+
balllicker
|
72 |
+
balls
|
73 |
+
ballsack
|
74 |
+
banging
|
75 |
+
baptist
|
76 |
+
barelylegal
|
77 |
+
barf
|
78 |
+
barface
|
79 |
+
barfface
|
80 |
+
bast
|
81 |
+
bastard
|
82 |
+
bazongas
|
83 |
+
bazooms
|
84 |
+
beaner
|
85 |
+
beast
|
86 |
+
beastality
|
87 |
+
beastial
|
88 |
+
beastiality
|
89 |
+
beatoff
|
90 |
+
beat-off
|
91 |
+
beatyourmeat
|
92 |
+
beaver
|
93 |
+
bestial
|
94 |
+
bestiality
|
95 |
+
bi
|
96 |
+
biatch
|
97 |
+
bible
|
98 |
+
bicurious
|
99 |
+
bigass
|
100 |
+
bigbastard
|
101 |
+
bigbutt
|
102 |
+
bigger
|
103 |
+
bisexual
|
104 |
+
bi-sexual
|
105 |
+
bitch
|
106 |
+
bitcher
|
107 |
+
bitches
|
108 |
+
bitchez
|
109 |
+
bitchin
|
110 |
+
bitching
|
111 |
+
bitchslap
|
112 |
+
bitchy
|
113 |
+
biteme
|
114 |
+
black
|
115 |
+
blackman
|
116 |
+
blackout
|
117 |
+
blacks
|
118 |
+
blind
|
119 |
+
blow
|
120 |
+
blowjob
|
121 |
+
boang
|
122 |
+
bogan
|
123 |
+
bohunk
|
124 |
+
bollick
|
125 |
+
bollock
|
126 |
+
bomb
|
127 |
+
bombers
|
128 |
+
bombing
|
129 |
+
bombs
|
130 |
+
bomd
|
131 |
+
bondage
|
132 |
+
boner
|
133 |
+
bong
|
134 |
+
boob
|
135 |
+
boobies
|
136 |
+
boobs
|
137 |
+
booby
|
138 |
+
boody
|
139 |
+
boom
|
140 |
+
boong
|
141 |
+
boonga
|
142 |
+
boonie
|
143 |
+
booty
|
144 |
+
bootycall
|
145 |
+
bountybar
|
146 |
+
bra
|
147 |
+
brea5t
|
148 |
+
breast
|
149 |
+
breastjob
|
150 |
+
breastlover
|
151 |
+
breastman
|
152 |
+
brothel
|
153 |
+
bugger
|
154 |
+
buggered
|
155 |
+
buggery
|
156 |
+
bullcrap
|
157 |
+
bulldike
|
158 |
+
bulldyke
|
159 |
+
bullshit
|
160 |
+
bumblefuck
|
161 |
+
bumfuck
|
162 |
+
bunga
|
163 |
+
bunghole
|
164 |
+
buried
|
165 |
+
burn
|
166 |
+
butchbabes
|
167 |
+
butchdike
|
168 |
+
butchdyke
|
169 |
+
butt
|
170 |
+
buttbang
|
171 |
+
butt-bang
|
172 |
+
buttface
|
173 |
+
buttfuck
|
174 |
+
butt-fuck
|
175 |
+
buttfucker
|
176 |
+
butt-fucker
|
177 |
+
buttfuckers
|
178 |
+
butt-fuckers
|
179 |
+
butthead
|
180 |
+
buttman
|
181 |
+
buttmunch
|
182 |
+
buttmuncher
|
183 |
+
buttpirate
|
184 |
+
buttplug
|
185 |
+
buttstain
|
186 |
+
byatch
|
187 |
+
cacker
|
188 |
+
cameljockey
|
189 |
+
cameltoe
|
190 |
+
canadian
|
191 |
+
cancer
|
192 |
+
carpetmuncher
|
193 |
+
carruth
|
194 |
+
catholic
|
195 |
+
catholics
|
196 |
+
cemetery
|
197 |
+
chav
|
198 |
+
cherrypopper
|
199 |
+
chickslick
|
200 |
+
children's
|
201 |
+
chin
|
202 |
+
chinaman
|
203 |
+
chinamen
|
204 |
+
chinese
|
205 |
+
chink
|
206 |
+
chinky
|
207 |
+
choad
|
208 |
+
chode
|
209 |
+
christ
|
210 |
+
christian
|
211 |
+
church
|
212 |
+
cigarette
|
213 |
+
cigs
|
214 |
+
clamdigger
|
215 |
+
clamdiver
|
216 |
+
clit
|
217 |
+
clitoris
|
218 |
+
clogwog
|
219 |
+
cocaine
|
220 |
+
cock
|
221 |
+
cockblock
|
222 |
+
cockblocker
|
223 |
+
cockcowboy
|
224 |
+
cockfight
|
225 |
+
cockhead
|
226 |
+
cockknob
|
227 |
+
cocklicker
|
228 |
+
cocklover
|
229 |
+
cocknob
|
230 |
+
cockqueen
|
231 |
+
cockrider
|
232 |
+
cocksman
|
233 |
+
cocksmith
|
234 |
+
cocksmoker
|
235 |
+
cocksucer
|
236 |
+
cocksuck
|
237 |
+
cocksucked
|
238 |
+
cocksucker
|
239 |
+
cocksucking
|
240 |
+
cocktail
|
241 |
+
cocktease
|
242 |
+
cocky
|
243 |
+
cohee
|
244 |
+
coitus
|
245 |
+
color
|
246 |
+
colored
|
247 |
+
coloured
|
248 |
+
commie
|
249 |
+
communist
|
250 |
+
condom
|
251 |
+
conservative
|
252 |
+
conspiracy
|
253 |
+
coolie
|
254 |
+
cooly
|
255 |
+
coon
|
256 |
+
coondog
|
257 |
+
copulate
|
258 |
+
cornhole
|
259 |
+
corruption
|
260 |
+
cra5h
|
261 |
+
crabs
|
262 |
+
crack
|
263 |
+
crackpipe
|
264 |
+
crackwhore
|
265 |
+
crack-whore
|
266 |
+
crap
|
267 |
+
crapola
|
268 |
+
crapper
|
269 |
+
crappy
|
270 |
+
crash
|
271 |
+
creamy
|
272 |
+
crime
|
273 |
+
crimes
|
274 |
+
criminal
|
275 |
+
criminals
|
276 |
+
crotch
|
277 |
+
crotchjockey
|
278 |
+
crotchmonkey
|
279 |
+
crotchrot
|
280 |
+
cum
|
281 |
+
cumbubble
|
282 |
+
cumfest
|
283 |
+
cumjockey
|
284 |
+
cumm
|
285 |
+
cummer
|
286 |
+
cumming
|
287 |
+
cumquat
|
288 |
+
cumqueen
|
289 |
+
cumshot
|
290 |
+
cunilingus
|
291 |
+
cunillingus
|
292 |
+
cunn
|
293 |
+
cunnilingus
|
294 |
+
cunntt
|
295 |
+
cunt
|
296 |
+
cunteyed
|
297 |
+
cuntfuck
|
298 |
+
cuntfucker
|
299 |
+
cuntlick
|
300 |
+
cuntlicker
|
301 |
+
cuntlicking
|
302 |
+
cuntsucker
|
303 |
+
cybersex
|
304 |
+
cyberslimer
|
305 |
+
dago
|
306 |
+
dahmer
|
307 |
+
dammit
|
308 |
+
damn
|
309 |
+
damnation
|
310 |
+
damnit
|
311 |
+
darkie
|
312 |
+
darky
|
313 |
+
datnigga
|
314 |
+
dead
|
315 |
+
deapthroat
|
316 |
+
death
|
317 |
+
deepthroat
|
318 |
+
defecate
|
319 |
+
dego
|
320 |
+
demon
|
321 |
+
deposit
|
322 |
+
desire
|
323 |
+
destroy
|
324 |
+
deth
|
325 |
+
devil
|
326 |
+
devilworshipper
|
327 |
+
dick
|
328 |
+
dickbrain
|
329 |
+
dickforbrains
|
330 |
+
dickhead
|
331 |
+
dickless
|
332 |
+
dicklick
|
333 |
+
dicklicker
|
334 |
+
dickman
|
335 |
+
dickwad
|
336 |
+
dickweed
|
337 |
+
diddle
|
338 |
+
die
|
339 |
+
died
|
340 |
+
dies
|
341 |
+
dike
|
342 |
+
dildo
|
343 |
+
dingleberry
|
344 |
+
dink
|
345 |
+
dipshit
|
346 |
+
dipstick
|
347 |
+
dirty
|
348 |
+
disease
|
349 |
+
diseases
|
350 |
+
disturbed
|
351 |
+
dive
|
352 |
+
dix
|
353 |
+
dixiedike
|
354 |
+
dixiedyke
|
355 |
+
doggiestyle
|
356 |
+
doggystyle
|
357 |
+
dong
|
358 |
+
doodoo
|
359 |
+
doo-doo
|
360 |
+
doom
|
361 |
+
dope
|
362 |
+
dragqueen
|
363 |
+
dragqween
|
364 |
+
dripdick
|
365 |
+
drug
|
366 |
+
drunk
|
367 |
+
drunken
|
368 |
+
dumb
|
369 |
+
dumbass
|
370 |
+
dumbbitch
|
371 |
+
dumbfuck
|
372 |
+
dyefly
|
373 |
+
dyke
|
374 |
+
easyslut
|
375 |
+
eatballs
|
376 |
+
eatme
|
377 |
+
eatpussy
|
378 |
+
ecstacy
|
379 |
+
ejaculate
|
380 |
+
ejaculated
|
381 |
+
ejaculating
|
382 |
+
ejaculation
|
383 |
+
enema
|
384 |
+
enemy
|
385 |
+
erect
|
386 |
+
erection
|
387 |
+
ero
|
388 |
+
escort
|
389 |
+
ethiopian
|
390 |
+
ethnic
|
391 |
+
european
|
392 |
+
evl
|
393 |
+
excrement
|
394 |
+
execute
|
395 |
+
executed
|
396 |
+
execution
|
397 |
+
executioner
|
398 |
+
explosion
|
399 |
+
facefucker
|
400 |
+
faeces
|
401 |
+
fag
|
402 |
+
fagging
|
403 |
+
faggot
|
404 |
+
fagot
|
405 |
+
failed
|
406 |
+
failure
|
407 |
+
fairies
|
408 |
+
fairy
|
409 |
+
faith
|
410 |
+
fannyfucker
|
411 |
+
fart
|
412 |
+
farted
|
413 |
+
farting
|
414 |
+
farty
|
415 |
+
fastfuck
|
416 |
+
fat
|
417 |
+
fatah
|
418 |
+
fatass
|
419 |
+
fatfuck
|
420 |
+
fatfucker
|
421 |
+
fatso
|
422 |
+
fckcum
|
423 |
+
fear
|
424 |
+
feces
|
425 |
+
felatio
|
426 |
+
felch
|
427 |
+
felcher
|
428 |
+
felching
|
429 |
+
fellatio
|
430 |
+
feltch
|
431 |
+
feltcher
|
432 |
+
feltching
|
433 |
+
fetish
|
434 |
+
fight
|
435 |
+
filipina
|
436 |
+
filipino
|
437 |
+
fingerfood
|
438 |
+
fingerfuck
|
439 |
+
fingerfucked
|
440 |
+
fingerfucker
|
441 |
+
fingerfuckers
|
442 |
+
fingerfucking
|
443 |
+
fire
|
444 |
+
firing
|
445 |
+
fister
|
446 |
+
fistfuck
|
447 |
+
fistfucked
|
448 |
+
fistfucker
|
449 |
+
fistfucking
|
450 |
+
fisting
|
451 |
+
flange
|
452 |
+
flasher
|
453 |
+
flatulence
|
454 |
+
floo
|
455 |
+
flydie
|
456 |
+
flydye
|
457 |
+
fok
|
458 |
+
fondle
|
459 |
+
footaction
|
460 |
+
footfuck
|
461 |
+
footfucker
|
462 |
+
footlicker
|
463 |
+
footstar
|
464 |
+
fore
|
465 |
+
foreskin
|
466 |
+
forni
|
467 |
+
fornicate
|
468 |
+
foursome
|
469 |
+
fourtwenty
|
470 |
+
fraud
|
471 |
+
freakfuck
|
472 |
+
freakyfucker
|
473 |
+
freefuck
|
474 |
+
fu
|
475 |
+
fubar
|
476 |
+
fuc
|
477 |
+
fucck
|
478 |
+
fuck
|
479 |
+
fucka
|
480 |
+
fuckable
|
481 |
+
fuckbag
|
482 |
+
fuckbuddy
|
483 |
+
fucked
|
484 |
+
fuckedup
|
485 |
+
fucker
|
486 |
+
fuckers
|
487 |
+
fuckface
|
488 |
+
fuckfest
|
489 |
+
fuckfreak
|
490 |
+
fuckfriend
|
491 |
+
fuckhead
|
492 |
+
fuckher
|
493 |
+
fuckin
|
494 |
+
fuckina
|
495 |
+
fucking
|
496 |
+
fuckingbitch
|
497 |
+
fuckinnuts
|
498 |
+
fuckinright
|
499 |
+
fuckit
|
500 |
+
fuckknob
|
501 |
+
fuckme
|
502 |
+
fuckmehard
|
503 |
+
fuckmonkey
|
504 |
+
fuckoff
|
505 |
+
fuckpig
|
506 |
+
fucks
|
507 |
+
fucktard
|
508 |
+
fuckwhore
|
509 |
+
fuckyou
|
510 |
+
fudgepacker
|
511 |
+
fugly
|
512 |
+
fuk
|
513 |
+
fuks
|
514 |
+
funeral
|
515 |
+
funfuck
|
516 |
+
fungus
|
517 |
+
fuuck
|
518 |
+
gangbang
|
519 |
+
gangbanged
|
520 |
+
gangbanger
|
521 |
+
gangsta
|
522 |
+
gatorbait
|
523 |
+
gay
|
524 |
+
gaymuthafuckinwhore
|
525 |
+
gaysex
|
526 |
+
geez
|
527 |
+
geezer
|
528 |
+
geni
|
529 |
+
genital
|
530 |
+
german
|
531 |
+
getiton
|
532 |
+
gin
|
533 |
+
ginzo
|
534 |
+
gipp
|
535 |
+
girls
|
536 |
+
givehead
|
537 |
+
glazeddonut
|
538 |
+
gob
|
539 |
+
god
|
540 |
+
godammit
|
541 |
+
goddamit
|
542 |
+
goddammit
|
543 |
+
goddamn
|
544 |
+
goddamned
|
545 |
+
goddamnes
|
546 |
+
goddamnit
|
547 |
+
goddamnmuthafucker
|
548 |
+
goldenshower
|
549 |
+
gonorrehea
|
550 |
+
gonzagas
|
551 |
+
gook
|
552 |
+
gotohell
|
553 |
+
goy
|
554 |
+
goyim
|
555 |
+
greaseball
|
556 |
+
gringo
|
557 |
+
groe
|
558 |
+
gross
|
559 |
+
grostulation
|
560 |
+
gubba
|
561 |
+
gummer
|
562 |
+
gun
|
563 |
+
gyp
|
564 |
+
gypo
|
565 |
+
gypp
|
566 |
+
gyppie
|
567 |
+
gyppo
|
568 |
+
gyppy
|
569 |
+
hamas
|
570 |
+
handjob
|
571 |
+
hapa
|
572 |
+
harder
|
573 |
+
hardon
|
574 |
+
harem
|
575 |
+
headfuck
|
576 |
+
headlights
|
577 |
+
hebe
|
578 |
+
heeb
|
579 |
+
hell
|
580 |
+
henhouse
|
581 |
+
heroin
|
582 |
+
herpes
|
583 |
+
heterosexual
|
584 |
+
hijack
|
585 |
+
hijacker
|
586 |
+
hijacking
|
587 |
+
hillbillies
|
588 |
+
hindoo
|
589 |
+
hiscock
|
590 |
+
hitler
|
591 |
+
hitlerism
|
592 |
+
hitlerist
|
593 |
+
hiv
|
594 |
+
ho
|
595 |
+
hobo
|
596 |
+
hodgie
|
597 |
+
hoes
|
598 |
+
hole
|
599 |
+
holestuffer
|
600 |
+
homicide
|
601 |
+
homo
|
602 |
+
homobangers
|
603 |
+
homosexual
|
604 |
+
honger
|
605 |
+
honk
|
606 |
+
honkers
|
607 |
+
honkey
|
608 |
+
honky
|
609 |
+
hook
|
610 |
+
hooker
|
611 |
+
hookers
|
612 |
+
hooters
|
613 |
+
hore
|
614 |
+
hork
|
615 |
+
horn
|
616 |
+
horney
|
617 |
+
horniest
|
618 |
+
horny
|
619 |
+
horseshit
|
620 |
+
hosejob
|
621 |
+
hoser
|
622 |
+
hostage
|
623 |
+
hotdamn
|
624 |
+
hotpussy
|
625 |
+
hottotrot
|
626 |
+
hummer
|
627 |
+
husky
|
628 |
+
hussy
|
629 |
+
hustler
|
630 |
+
hymen
|
631 |
+
hymie
|
632 |
+
iblowu
|
633 |
+
idiot
|
634 |
+
ikey
|
635 |
+
illegal
|
636 |
+
incest
|
637 |
+
insest
|
638 |
+
intercourse
|
639 |
+
interracial
|
640 |
+
intheass
|
641 |
+
inthebuff
|
642 |
+
israel
|
643 |
+
israeli
|
644 |
+
israel's
|
645 |
+
italiano
|
646 |
+
itch
|
647 |
+
jackass
|
648 |
+
jackoff
|
649 |
+
jackshit
|
650 |
+
jacktheripper
|
651 |
+
jade
|
652 |
+
jap
|
653 |
+
japanese
|
654 |
+
japcrap
|
655 |
+
jebus
|
656 |
+
jeez
|
657 |
+
jerkoff
|
658 |
+
jesus
|
659 |
+
jesuschrist
|
660 |
+
jew
|
661 |
+
jewish
|
662 |
+
jiga
|
663 |
+
jigaboo
|
664 |
+
jigg
|
665 |
+
jigga
|
666 |
+
jiggabo
|
667 |
+
jigger
|
668 |
+
jiggy
|
669 |
+
jihad
|
670 |
+
jijjiboo
|
671 |
+
jimfish
|
672 |
+
jism
|
673 |
+
jiz
|
674 |
+
jizim
|
675 |
+
jizjuice
|
676 |
+
jizm
|
677 |
+
jizz
|
678 |
+
jizzim
|
679 |
+
jizzum
|
680 |
+
joint
|
681 |
+
juggalo
|
682 |
+
jugs
|
683 |
+
junglebunny
|
684 |
+
kaffer
|
685 |
+
kaffir
|
686 |
+
kaffre
|
687 |
+
kafir
|
688 |
+
kanake
|
689 |
+
kid
|
690 |
+
kigger
|
691 |
+
kike
|
692 |
+
kill
|
693 |
+
killed
|
694 |
+
killer
|
695 |
+
killing
|
696 |
+
kills
|
697 |
+
kink
|
698 |
+
kinky
|
699 |
+
kissass
|
700 |
+
kkk
|
701 |
+
knife
|
702 |
+
knockers
|
703 |
+
kock
|
704 |
+
kondum
|
705 |
+
koon
|
706 |
+
kotex
|
707 |
+
krap
|
708 |
+
krappy
|
709 |
+
kraut
|
710 |
+
kum
|
711 |
+
kumbubble
|
712 |
+
kumbullbe
|
713 |
+
kummer
|
714 |
+
kumming
|
715 |
+
kumquat
|
716 |
+
kums
|
717 |
+
kunilingus
|
718 |
+
kunnilingus
|
719 |
+
kunt
|
720 |
+
ky
|
721 |
+
kyke
|
722 |
+
lactate
|
723 |
+
laid
|
724 |
+
lapdance
|
725 |
+
latin
|
726 |
+
lesbain
|
727 |
+
lesbayn
|
728 |
+
lesbian
|
729 |
+
lesbin
|
730 |
+
lesbo
|
731 |
+
lez
|
732 |
+
lezbe
|
733 |
+
lezbefriends
|
734 |
+
lezbo
|
735 |
+
lezz
|
736 |
+
lezzo
|
737 |
+
liberal
|
738 |
+
libido
|
739 |
+
licker
|
740 |
+
lickme
|
741 |
+
lies
|
742 |
+
limey
|
743 |
+
limpdick
|
744 |
+
limy
|
745 |
+
lingerie
|
746 |
+
liquor
|
747 |
+
livesex
|
748 |
+
loadedgun
|
749 |
+
lolita
|
750 |
+
looser
|
751 |
+
loser
|
752 |
+
lotion
|
753 |
+
lovebone
|
754 |
+
lovegoo
|
755 |
+
lovegun
|
756 |
+
lovejuice
|
757 |
+
lovemuscle
|
758 |
+
lovepistol
|
759 |
+
loverocket
|
760 |
+
lowlife
|
761 |
+
lsd
|
762 |
+
lubejob
|
763 |
+
lucifer
|
764 |
+
luckycammeltoe
|
765 |
+
lugan
|
766 |
+
lynch
|
767 |
+
macaca
|
768 |
+
mad
|
769 |
+
mafia
|
770 |
+
magicwand
|
771 |
+
mams
|
772 |
+
manhater
|
773 |
+
manpaste
|
774 |
+
marijuana
|
775 |
+
mastabate
|
776 |
+
mastabater
|
777 |
+
masterbate
|
778 |
+
masterblaster
|
779 |
+
mastrabator
|
780 |
+
masturbate
|
781 |
+
masturbating
|
782 |
+
mattressprincess
|
783 |
+
meatbeatter
|
784 |
+
meatrack
|
785 |
+
meth
|
786 |
+
mexican
|
787 |
+
mgger
|
788 |
+
mggor
|
789 |
+
mickeyfinn
|
790 |
+
mideast
|
791 |
+
milf
|
792 |
+
minority
|
793 |
+
mockey
|
794 |
+
mockie
|
795 |
+
mocky
|
796 |
+
mofo
|
797 |
+
moky
|
798 |
+
moles
|
799 |
+
molest
|
800 |
+
molestation
|
801 |
+
molester
|
802 |
+
molestor
|
803 |
+
moneyshot
|
804 |
+
mooncricket
|
805 |
+
mormon
|
806 |
+
moron
|
807 |
+
moslem
|
808 |
+
mosshead
|
809 |
+
mothafuck
|
810 |
+
mothafucka
|
811 |
+
mothafuckaz
|
812 |
+
mothafucked
|
813 |
+
mothafucker
|
814 |
+
mothafuckin
|
815 |
+
mothafucking
|
816 |
+
mothafuckings
|
817 |
+
motherfuck
|
818 |
+
motherfucked
|
819 |
+
motherfucker
|
820 |
+
motherfuckin
|
821 |
+
motherfucking
|
822 |
+
motherfuckings
|
823 |
+
motherlovebone
|
824 |
+
muff
|
825 |
+
muffdive
|
826 |
+
muffdiver
|
827 |
+
muffindiver
|
828 |
+
mufflikcer
|
829 |
+
mulatto
|
830 |
+
muncher
|
831 |
+
munt
|
832 |
+
murder
|
833 |
+
murderer
|
834 |
+
muslim
|
835 |
+
naked
|
836 |
+
narcotic
|
837 |
+
nasty
|
838 |
+
nastybitch
|
839 |
+
nastyho
|
840 |
+
nastyslut
|
841 |
+
nastywhore
|
842 |
+
nazi
|
843 |
+
necro
|
844 |
+
negro
|
845 |
+
negroes
|
846 |
+
negroid
|
847 |
+
negro's
|
848 |
+
nig
|
849 |
+
niger
|
850 |
+
nigerian
|
851 |
+
nigerians
|
852 |
+
nigg
|
853 |
+
nigga
|
854 |
+
niggah
|
855 |
+
niggaracci
|
856 |
+
niggard
|
857 |
+
niggarded
|
858 |
+
niggarding
|
859 |
+
niggardliness
|
860 |
+
niggardliness's
|
861 |
+
niggardly
|
862 |
+
niggards
|
863 |
+
niggard's
|
864 |
+
niggaz
|
865 |
+
nigger
|
866 |
+
niggerhead
|
867 |
+
niggerhole
|
868 |
+
niggers
|
869 |
+
nigger's
|
870 |
+
niggle
|
871 |
+
niggled
|
872 |
+
niggles
|
873 |
+
niggling
|
874 |
+
nigglings
|
875 |
+
niggor
|
876 |
+
niggur
|
877 |
+
niglet
|
878 |
+
nignog
|
879 |
+
nigr
|
880 |
+
nigra
|
881 |
+
nigre
|
882 |
+
nip
|
883 |
+
nipple
|
884 |
+
nipplering
|
885 |
+
nittit
|
886 |
+
nlgger
|
887 |
+
nlggor
|
888 |
+
nofuckingway
|
889 |
+
nook
|
890 |
+
nookey
|
891 |
+
nookie
|
892 |
+
noonan
|
893 |
+
nooner
|
894 |
+
nude
|
895 |
+
nudger
|
896 |
+
nuke
|
897 |
+
nutfucker
|
898 |
+
nymph
|
899 |
+
ontherag
|
900 |
+
oral
|
901 |
+
orga
|
902 |
+
orgasim
|
903 |
+
orgasm
|
904 |
+
orgies
|
905 |
+
orgy
|
906 |
+
osama
|
907 |
+
paki
|
908 |
+
palesimian
|
909 |
+
palestinian
|
910 |
+
pansies
|
911 |
+
pansy
|
912 |
+
panti
|
913 |
+
panties
|
914 |
+
payo
|
915 |
+
pearlnecklace
|
916 |
+
peck
|
917 |
+
pecker
|
918 |
+
peckerwood
|
919 |
+
pee
|
920 |
+
peehole
|
921 |
+
pee-pee
|
922 |
+
peepshow
|
923 |
+
peepshpw
|
924 |
+
pendy
|
925 |
+
penetration
|
926 |
+
peni5
|
927 |
+
penile
|
928 |
+
penis
|
929 |
+
penises
|
930 |
+
penthouse
|
931 |
+
period
|
932 |
+
perv
|
933 |
+
phonesex
|
934 |
+
phuk
|
935 |
+
phuked
|
936 |
+
phuking
|
937 |
+
phukked
|
938 |
+
phukking
|
939 |
+
phungky
|
940 |
+
phuq
|
941 |
+
pi55
|
942 |
+
picaninny
|
943 |
+
piccaninny
|
944 |
+
pickaninny
|
945 |
+
piker
|
946 |
+
pikey
|
947 |
+
piky
|
948 |
+
pimp
|
949 |
+
pimped
|
950 |
+
pimper
|
951 |
+
pimpjuic
|
952 |
+
pimpjuice
|
953 |
+
pimpsimp
|
954 |
+
pindick
|
955 |
+
piss
|
956 |
+
pissed
|
957 |
+
pisser
|
958 |
+
pisses
|
959 |
+
pisshead
|
960 |
+
pissin
|
961 |
+
pissing
|
962 |
+
pissoff
|
963 |
+
pistol
|
964 |
+
pixie
|
965 |
+
pixy
|
966 |
+
playboy
|
967 |
+
playgirl
|
968 |
+
pocha
|
969 |
+
pocho
|
970 |
+
pocketpool
|
971 |
+
pohm
|
972 |
+
polack
|
973 |
+
pom
|
974 |
+
pommie
|
975 |
+
pommy
|
976 |
+
poo
|
977 |
+
poon
|
978 |
+
poontang
|
979 |
+
poop
|
980 |
+
pooper
|
981 |
+
pooperscooper
|
982 |
+
pooping
|
983 |
+
poorwhitetrash
|
984 |
+
popimp
|
985 |
+
porchmonkey
|
986 |
+
porn
|
987 |
+
pornflick
|
988 |
+
pornking
|
989 |
+
porno
|
990 |
+
pornography
|
991 |
+
pornprincess
|
992 |
+
pot
|
993 |
+
poverty
|
994 |
+
premature
|
995 |
+
pric
|
996 |
+
prick
|
997 |
+
prickhead
|
998 |
+
primetime
|
999 |
+
propaganda
|
1000 |
+
pros
|
1001 |
+
prostitute
|
1002 |
+
protestant
|
1003 |
+
pu55i
|
1004 |
+
pu55y
|
1005 |
+
pube
|
1006 |
+
pubic
|
1007 |
+
pubiclice
|
1008 |
+
pud
|
1009 |
+
pudboy
|
1010 |
+
pudd
|
1011 |
+
puddboy
|
1012 |
+
puke
|
1013 |
+
puntang
|
1014 |
+
purinapricness
|
1015 |
+
puss
|
1016 |
+
pussie
|
1017 |
+
pussies
|
1018 |
+
pussy
|
1019 |
+
pussycat
|
1020 |
+
pussyeater
|
1021 |
+
pussyfucker
|
1022 |
+
pussylicker
|
1023 |
+
pussylips
|
1024 |
+
pussylover
|
1025 |
+
pussypounder
|
1026 |
+
pusy
|
1027 |
+
quashie
|
1028 |
+
queef
|
1029 |
+
queer
|
1030 |
+
quickie
|
1031 |
+
quim
|
1032 |
+
ra8s
|
1033 |
+
rabbi
|
1034 |
+
racial
|
1035 |
+
racist
|
1036 |
+
radical
|
1037 |
+
radicals
|
1038 |
+
raghead
|
1039 |
+
randy
|
1040 |
+
rape
|
1041 |
+
raped
|
1042 |
+
raper
|
1043 |
+
rapist
|
1044 |
+
rearend
|
1045 |
+
rearentry
|
1046 |
+
rectum
|
1047 |
+
redlight
|
1048 |
+
redneck
|
1049 |
+
reefer
|
1050 |
+
reestie
|
1051 |
+
refugee
|
1052 |
+
reject
|
1053 |
+
remains
|
1054 |
+
rentafuck
|
1055 |
+
republican
|
1056 |
+
rere
|
1057 |
+
retard
|
1058 |
+
retarded
|
1059 |
+
ribbed
|
1060 |
+
rigger
|
1061 |
+
rimjob
|
1062 |
+
rimming
|
1063 |
+
roach
|
1064 |
+
robber
|
1065 |
+
roundeye
|
1066 |
+
rump
|
1067 |
+
russki
|
1068 |
+
russkie
|
1069 |
+
sadis
|
1070 |
+
sadom
|
1071 |
+
samckdaddy
|
1072 |
+
sandm
|
1073 |
+
sandnigger
|
1074 |
+
satan
|
1075 |
+
scag
|
1076 |
+
scallywag
|
1077 |
+
scat
|
1078 |
+
schlong
|
1079 |
+
screw
|
1080 |
+
screwyou
|
1081 |
+
scrotum
|
1082 |
+
scum
|
1083 |
+
semen
|
1084 |
+
seppo
|
1085 |
+
servant
|
1086 |
+
sex
|
1087 |
+
sexed
|
1088 |
+
sexfarm
|
1089 |
+
sexhound
|
1090 |
+
sexhouse
|
1091 |
+
sexing
|
1092 |
+
sexkitten
|
1093 |
+
sexpot
|
1094 |
+
sexslave
|
1095 |
+
sextogo
|
1096 |
+
sextoy
|
1097 |
+
sextoys
|
1098 |
+
sexual
|
1099 |
+
sexually
|
1100 |
+
sexwhore
|
1101 |
+
sexy
|
1102 |
+
sexymoma
|
1103 |
+
sexy-slim
|
1104 |
+
shag
|
1105 |
+
shaggin
|
1106 |
+
shagging
|
1107 |
+
shat
|
1108 |
+
shav
|
1109 |
+
shawtypimp
|
1110 |
+
sheeney
|
1111 |
+
shhit
|
1112 |
+
shinola
|
1113 |
+
shit
|
1114 |
+
shitcan
|
1115 |
+
shitdick
|
1116 |
+
shite
|
1117 |
+
shiteater
|
1118 |
+
shited
|
1119 |
+
shitface
|
1120 |
+
shitfaced
|
1121 |
+
shitfit
|
1122 |
+
shitforbrains
|
1123 |
+
shitfuck
|
1124 |
+
shitfucker
|
1125 |
+
shitfull
|
1126 |
+
shithapens
|
1127 |
+
shithappens
|
1128 |
+
shithead
|
1129 |
+
shithouse
|
1130 |
+
shiting
|
1131 |
+
shitlist
|
1132 |
+
shitola
|
1133 |
+
shitoutofluck
|
1134 |
+
shits
|
1135 |
+
shitstain
|
1136 |
+
shitted
|
1137 |
+
shitter
|
1138 |
+
shitting
|
1139 |
+
shitty
|
1140 |
+
shoot
|
1141 |
+
shooting
|
1142 |
+
shortfuck
|
1143 |
+
showtime
|
1144 |
+
sick
|
1145 |
+
sissy
|
1146 |
+
sixsixsix
|
1147 |
+
sixtynine
|
1148 |
+
sixtyniner
|
1149 |
+
skank
|
1150 |
+
skankbitch
|
1151 |
+
skankfuck
|
1152 |
+
skankwhore
|
1153 |
+
skanky
|
1154 |
+
skankybitch
|
1155 |
+
skankywhore
|
1156 |
+
skinflute
|
1157 |
+
skum
|
1158 |
+
skumbag
|
1159 |
+
slant
|
1160 |
+
slanteye
|
1161 |
+
slapper
|
1162 |
+
slaughter
|
1163 |
+
slav
|
1164 |
+
slave
|
1165 |
+
slavedriver
|
1166 |
+
sleezebag
|
1167 |
+
sleezeball
|
1168 |
+
slideitin
|
1169 |
+
slime
|
1170 |
+
slimeball
|
1171 |
+
slimebucket
|
1172 |
+
slopehead
|
1173 |
+
slopey
|
1174 |
+
slopy
|
1175 |
+
slut
|
1176 |
+
sluts
|
1177 |
+
slutt
|
1178 |
+
slutting
|
1179 |
+
slutty
|
1180 |
+
slutwear
|
1181 |
+
slutwhore
|
1182 |
+
smack
|
1183 |
+
smackthemonkey
|
1184 |
+
smut
|
1185 |
+
snatch
|
1186 |
+
snatchpatch
|
1187 |
+
snigger
|
1188 |
+
sniggered
|
1189 |
+
sniggering
|
1190 |
+
sniggers
|
1191 |
+
snigger's
|
1192 |
+
sniper
|
1193 |
+
snot
|
1194 |
+
snowback
|
1195 |
+
snownigger
|
1196 |
+
sob
|
1197 |
+
sodom
|
1198 |
+
sodomise
|
1199 |
+
sodomite
|
1200 |
+
sodomize
|
1201 |
+
sodomy
|
1202 |
+
sonofabitch
|
1203 |
+
sonofbitch
|
1204 |
+
sooty
|
1205 |
+
sos
|
1206 |
+
soviet
|
1207 |
+
spaghettibender
|
1208 |
+
spaghettinigger
|
1209 |
+
spank
|
1210 |
+
spankthemonkey
|
1211 |
+
sperm
|
1212 |
+
spermacide
|
1213 |
+
spermbag
|
1214 |
+
spermhearder
|
1215 |
+
spermherder
|
1216 |
+
spic
|
1217 |
+
spick
|
1218 |
+
spig
|
1219 |
+
spigotty
|
1220 |
+
spik
|
1221 |
+
spit
|
1222 |
+
spitter
|
1223 |
+
splittail
|
1224 |
+
spooge
|
1225 |
+
spreadeagle
|
1226 |
+
spunk
|
1227 |
+
spunky
|
1228 |
+
squaw
|
1229 |
+
stagg
|
1230 |
+
stiffy
|
1231 |
+
strapon
|
1232 |
+
stringer
|
1233 |
+
stripclub
|
1234 |
+
stroke
|
1235 |
+
stroking
|
1236 |
+
stupid
|
1237 |
+
stupidfuck
|
1238 |
+
stupidfucker
|
1239 |
+
suck
|
1240 |
+
suckdick
|
1241 |
+
sucker
|
1242 |
+
suckme
|
1243 |
+
suckmyass
|
1244 |
+
suckmydick
|
1245 |
+
suckmytit
|
1246 |
+
suckoff
|
1247 |
+
suicide
|
1248 |
+
swallow
|
1249 |
+
swallower
|
1250 |
+
swalow
|
1251 |
+
swastika
|
1252 |
+
sweetness
|
1253 |
+
syphilis
|
1254 |
+
taboo
|
1255 |
+
taff
|
1256 |
+
tampon
|
1257 |
+
tang
|
1258 |
+
tantra
|
1259 |
+
tarbaby
|
1260 |
+
tard
|
1261 |
+
teat
|
1262 |
+
terror
|
1263 |
+
terrorist
|
1264 |
+
teste
|
1265 |
+
testicle
|
1266 |
+
testicles
|
1267 |
+
thicklips
|
1268 |
+
thirdeye
|
1269 |
+
thirdleg
|
1270 |
+
threesome
|
1271 |
+
threeway
|
1272 |
+
timbernigger
|
1273 |
+
tinkle
|
1274 |
+
tit
|
1275 |
+
titbitnipply
|
1276 |
+
titfuck
|
1277 |
+
titfucker
|
1278 |
+
titfuckin
|
1279 |
+
titjob
|
1280 |
+
titlicker
|
1281 |
+
titlover
|
1282 |
+
tits
|
1283 |
+
tittie
|
1284 |
+
titties
|
1285 |
+
titty
|
1286 |
+
tnt
|
1287 |
+
toilet
|
1288 |
+
tongethruster
|
1289 |
+
tongue
|
1290 |
+
tonguethrust
|
1291 |
+
tonguetramp
|
1292 |
+
tortur
|
1293 |
+
torture
|
1294 |
+
tosser
|
1295 |
+
towelhead
|
1296 |
+
trailertrash
|
1297 |
+
tramp
|
1298 |
+
trannie
|
1299 |
+
tranny
|
1300 |
+
transexual
|
1301 |
+
transsexual
|
1302 |
+
transvestite
|
1303 |
+
triplex
|
1304 |
+
trisexual
|
1305 |
+
trojan
|
1306 |
+
trots
|
1307 |
+
tuckahoe
|
1308 |
+
tunneloflove
|
1309 |
+
turd
|
1310 |
+
turnon
|
1311 |
+
twat
|
1312 |
+
twink
|
1313 |
+
twinkie
|
1314 |
+
twobitwhore
|
1315 |
+
uck
|
1316 |
+
uk
|
1317 |
+
unfuckable
|
1318 |
+
upskirt
|
1319 |
+
uptheass
|
1320 |
+
upthebutt
|
1321 |
+
urinary
|
1322 |
+
urinate
|
1323 |
+
urine
|
1324 |
+
usama
|
1325 |
+
uterus
|
1326 |
+
vagina
|
1327 |
+
vaginal
|
1328 |
+
vatican
|
1329 |
+
vibr
|
1330 |
+
vibrater
|
1331 |
+
vibrator
|
1332 |
+
vietcong
|
1333 |
+
violence
|
1334 |
+
virgin
|
1335 |
+
virginbreaker
|
1336 |
+
vomit
|
1337 |
+
vulva
|
1338 |
+
wab
|
1339 |
+
wank
|
1340 |
+
wanker
|
1341 |
+
wanking
|
1342 |
+
waysted
|
1343 |
+
weapon
|
1344 |
+
weenie
|
1345 |
+
weewee
|
1346 |
+
welcher
|
1347 |
+
welfare
|
1348 |
+
wetb
|
1349 |
+
wetback
|
1350 |
+
wetspot
|
1351 |
+
whacker
|
1352 |
+
whash
|
1353 |
+
whigger
|
1354 |
+
whiskey
|
1355 |
+
whiskeydick
|
1356 |
+
whiskydick
|
1357 |
+
whit
|
1358 |
+
whitenigger
|
1359 |
+
whites
|
1360 |
+
whitetrash
|
1361 |
+
whitey
|
1362 |
+
whiz
|
1363 |
+
whop
|
1364 |
+
whore
|
1365 |
+
whorefucker
|
1366 |
+
whorehouse
|
1367 |
+
wigger
|
1368 |
+
willie
|
1369 |
+
williewanker
|
1370 |
+
willy
|
1371 |
+
wn
|
1372 |
+
wog
|
1373 |
+
women's
|
1374 |
+
wop
|
1375 |
+
wtf
|
1376 |
+
wuss
|
1377 |
+
wuzzie
|
1378 |
+
xtc
|
1379 |
+
xxx
|
1380 |
+
yankee
|
1381 |
+
yellowman
|
1382 |
+
zigabo
|
1383 |
+
zipperhead
|