import itertools
import os
import pickle
import re
from operator import itemgetter

import easy_db
from pprint import pprint
import json
import networkx as nx
from igraph import Graph  # python-igraph backs the grapher wrapper below

from Pinpoint.RandomForest import *
import Pinpoint.FeatureExtraction
import csv

db_path = "../new-new-just-posts-and-clean-dates-parler-messages.db"

# Truncate the community log file from any previous run
with open("community_logs.txt", 'w') as log_file:
    log_file.write("")

used_names = []

SHOULD_WRITE_CSVS = False


class grapher():
    """
    A wrapper class used for generating a graph of interactions between users.
    Wraps a python-igraph Graph.
    """
    graph = None

    def __init__(self):
        """
        Constructor.
        """
        self.graph = Graph()

    def add_edge_wrapper(self, node_1_name, node_2_name, weight=1, relationship=None):
        """
        A wrapper function used to add an edge connection between two nodes,
        creating either node first if it does not already exist.
        :param node_1_name: from
        :param node_2_name: to
        :param weight:
        :param relationship:
        """

        # get node one ID, creating the node if it is not already in the graph
        node_1 = None
        for node in self.graph.vs:
            if node["label"] == node_1_name.capitalize():
                node_1 = node

        if node_1 is None:
            self.graph.add_vertices(1)
            node_count = self.graph.vcount()
            self.graph.vs[node_count - 1]["id"] = node_count - 1
            self.graph.vs[node_count - 1]["label"] = node_1_name.capitalize()
            node_1 = self.graph.vs[node_count - 1]

        # get node two ID, creating the node if it is not already in the graph
        node_2 = None
        for node in self.graph.vs:
            if node["label"] == node_2_name.capitalize():
                node_2 = node

        if node_2 is None:
            self.graph.add_vertices(1)
            node_count = self.graph.vcount()
            self.graph.vs[node_count - 1]["id"] = node_count - 1
            self.graph.vs[node_count - 1]["label"] = node_2_name.capitalize()
            node_2 = self.graph.vs[node_count - 1]

        # print("User one {} - {}, user two {} - {}".format(node_1["label"], str(node_1["id"]),
        #                                                   node_2["label"], str(node_2["id"])))
        self.graph.add_edges([(node_1["id"], node_2["id"])])
        # self.graph.add_edge(node_1_name, node_2_name, weight=weight, relation=relationship)

    def add_node(self, node_name):
        """
        A wrapper function that adds a node with no edges to the graph,
        unless a node with that label already exists.
        :param node_name:
        """
        node_1 = None
        for node in self.graph.vs:
            if node["label"] == node_name.capitalize():
                node_1 = node["id"]

        if node_1 is None:
            self.graph.add_vertices(1)
            node_count = self.graph.vcount()
            self.graph.vs[node_count - 1]["id"] = node_count - 1
            self.graph.vs[node_count - 1]["label"] = node_name.capitalize()
            node_1 = self.graph.vs[node_count - 1]


def get_database(where=None):
    """Pull all rows from the parler_messages table, optionally filtered by a WHERE clause."""
    message_db = easy_db.DataBase(db_path)

    if where is None:
        return message_db.pull("parler_messages")
    else:
        return message_db.pull_where("parler_messages", where)


def get_mentioned_usernames_from_post(post):
    """Return a list of @-mentioned usernames found in the given post body."""

    # Process mentions (the capture group already excludes the leading "@",
    # so the replace below is purely defensive)
    mentions = re.findall(r"@([a-zA-Z\-_]+)", post)

    sanitised_list = []
    for mention in mentions:
        mention = mention.replace("@", "")
        sanitised_list.append(mention)

    return sanitised_list
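
# Illustrative sketch (not called anywhere in this script): how the helpers
# above would combine to build a mention graph of the kind loaded from the
# phase-one pickle below. The "username" and "body" keys are assumptions,
# based on the database rows consumed later in this file.
def build_mention_graph(posts):
    """Build a user-to-mentioned-user graph from an iterable of post rows."""
    mention_graph = grapher()

    for post in posts:
        # one edge per mention, from the post's author to the mentioned user
        for mentioned_user in get_mentioned_usernames_from_post(post["body"]):
            mention_graph.add_edge_wrapper(post["username"], mentioned_user)

    return mention_graph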
"leisure","home","money","relig","death","informal","swear","netspeak","assent", "nonflu","filler","AllPunc","Period","Comma","Colon","SemiC","QMark","Exclam", "Dash","Quote","Apostro","Parenth","OtherP"]): if username.strip().lower() in row["A"].strip().lower() \ and month.strip().lower() in row["B"].strip().lower(): rows.append(row) return rows month_graphs = {} year_range = list(range(2017, 2022)) month_range = list(range(1, 13)) INITIAL_COMMUNITIES_FILE_NAME = "phase_one_communities_file.pickle" SECOND_COMMUNITIES_FILE_NAME = "phase_two_communities_file.pickle" print("Loading old {} file".format(INITIAL_COMMUNITIES_FILE_NAME)) pickle_file = open(INITIAL_COMMUNITIES_FILE_NAME, "rb") month_graphs = pickle.load(pickle_file) pickle_file.close() print("loaded...") # Get communities month_graph_keys = list(month_graphs.keys()) month_graph_keys.sort() list_of_community_objects = [] # get top 10 centrality users per month of parler if not os.path.isfile(SECOND_COMMUNITIES_FILE_NAME): dict_of_centrality_per_month = {} dict_of_user_count_per_month = {} dict_of_shrinkage = {} total_unique_user_list = [] total_users = [] highest_centrality = 0 highest_centrality_user = None date_of_highest_centrality = None dict_of_messages = {} number_of_users_dict = {} highest_number_of_users = 0 highest_number_of_users_month = None shrinkage_per_month = {} last_month = None all_months_centality = {} all_centralities = {} for month_key in month_graph_keys: print("Reviewing graph for date '{}'".format(month_key)) graph = month_graphs[month_key].graph user_nodes = graph.nodes.keys() print("users {}".format(len(user_nodes))) centrality_for_month = {} iterator = 0 centrality_for_month = nx.degree_centrality(graph) all_centralities[month_key] = centrality_for_month # sort if len(centrality_for_month) > 0: sorted_list = sorted(centrality_for_month, key=centrality_for_month.get, reverse=True)[:10] all_months_centality[month_key] = sorted_list unique_users = {} for month in all_months_centality: for user in all_months_centality[month]: if user not in unique_users.keys(): unique_users[user] = [{"month":month, "centrality":all_centralities[month][user]}] else: unique_users[user].append({"month":month, "centrality":all_centralities[month][user]}) pprint(unique_users) # write to csv if SHOULD_WRITE_CSVS: seen_users = [] with open('all-messages.json.csv', 'w', encoding='utf8', newline='') as output_file: writer = csv.DictWriter(output_file,fieldnames=["username","timestamp","message"]) for month in all_months_centality: graph = month_graphs[month] for user in all_months_centality[month]: if user not in seen_users: seen_users.append(user) # get from database where username == user and month == month # loop through messages. # if above threshold is extremist. 
if user != "-": print("getting posts for user '{}'".format(user)) posts = get_database("username='{}' COLLATE NOCASE".format(user)) print("Posts found: {}".format(len(posts))) if posts == None: raise Exception("no posts, 'where' failed") for post in posts: #users_mentioned = get_mentioned_usernames_from_post(post["body"]) writer.writerow({"username": post["username"], "timestamp": post["Time"], "message": post["body"]}) model = random_forest() model.train_model(features_file = None, force_new_dataset=False, model_location=r"far-right-baseline.model") dict_of_users_all = {} feature_extractor = Pinpoint.FeatureExtraction.feature_extraction(violent_words_dataset_location="swears",baseline_training_dataset_location="data/LIWC2015 Results (Storm_Front_Posts).csv") # Get the is-extremist score for users for the month they were in the highest centrality for month in all_months_centality: for user in all_months_centality[month]: print("Getting data for user {} and month {}".format(user, month)) # Get rows for this user and month rows = get_rows_from_csv_where_field_is("data/LIWC2015 Results (all-messages.csv).csv", user, month) # write these to a new (temp) csv pprint(rows) if len(rows) <= 1: print("Not enough rows for {} {}".format(user, month)) continue keys = rows[0].keys() with open('temp.csv', 'w', newline='', encoding='utf8') as output_file: dict_writer = csv.DictWriter(output_file, keys) dict_writer.writeheader() dict_writer.writerows(rows) feature_extractor._reset_stored_feature_data() feature_extractor._get_type_of_message_data(data_set_location="temp.csv") with open("messages.json", 'w') as outfile: json.dump(feature_extractor.completed_tweet_user_features, outfile, indent=4) rows = model.get_features_as_df("messages.json", True) print("Length of rows returned: {}".format(len(rows))) number_of_connections = 0 number_of_connections_extremist = 0 is_extemist_count = 0 for row in rows: post = row["C"] is_extremist = model.model.predict(post) print("Post '{}...' is extemist {}".format(post[:20], is_extremist)) if is_extremist: is_extemist_count = is_extemist_count+1 # If we were to do mentione dusers we'd need to markup with LIWC again. Could I use the less reliable version without LIWC? if is_extemist_count != 0: percentage_extremist = len(rows) /is_extemist_count else: percentage_extremist = 0 if user not in dict_of_users_all: dict_of_users_all[user] = {"months":{}} if "months" in dict_of_users_all[user].keys(): dict_of_users_all[user]["months"][month] = percentage_extremist with open('data.json', 'w') as fp: json.dump(dict_of_users_all, fp) # mark up csv with LIWC scores. # number of unique users. manual 100 max (less users), otherwise doesn't really matter. # classed as radicalised? Look at the accounts and posts, what are they up to over time. # are any posts far right, mostly extremist material, # when looking at connections - apply the same above. at time period on mention and overall. # create the csv writer # when have they been active, what monts are they extremist, how often, common words or phrases, etc '''users_of_interest[user] = { "centrality": month[user], "is_extremist":, "is_connections_extremist":, } ''' # radicalisation window? # use high centrality users that are extremist # look at the work.