import csv
import itertools
import json
import os
import pickle
import re
from operator import itemgetter
from pprint import pprint

import easy_db
import networkx as nx
from igraph import Graph  # grapher below uses the igraph API

import Pinpoint.FeatureExtraction
from Pinpoint.RandomForest import *

db_path = "../new-new-just-posts-and-clean-dates-parler-messages.db"

# Truncate the community log file at the start of each run
with open("community_logs.txt", 'w') as log_file:
    log_file.write("")

used_names = []

SHOULD_WRITE_CSVS = False

class grapher():
    """
    A wrapper class used for generating a graph of interactions between users.
    """
    graph = None

    def __init__(self):
        """
        Constructor.
        """
        self.graph = Graph()

    def _get_or_add_node(self, node_name):
        """
        Returns the vertex labelled with the capitalised username, creating it
        if it does not already exist.
        :param node_name: the username to look up
        :return: the igraph vertex for that username
        """
        label = node_name.capitalize()

        for node in self.graph.vs:
            if node["label"] == label:
                return node

        self.graph.add_vertices(1)
        node_count = self.graph.vcount()
        self.graph.vs[node_count - 1]["id"] = node_count - 1
        self.graph.vs[node_count - 1]["label"] = label
        return self.graph.vs[node_count - 1]

    def add_edge_wrapper(self, node_1_name, node_2_name, weight=1, relationship=None):
        """
        A wrapper function used to add an edge connection or node.
        :param node_1_name: from (the user making the mention)
        :param node_2_name: to (the user being mentioned)
        :param weight: edge weight (currently unused)
        :param relationship: relationship type (currently unused)
        """
        node_1 = self._get_or_add_node(node_1_name)
        node_2 = self._get_or_add_node(node_2_name)
        self.graph.add_edges([(node_1["id"], node_2["id"])])

    def add_node(self, node_name):
        """
        A wrapper function that adds a node with no edges to the graph.
        :param node_name: the username to add
        """
        self._get_or_add_node(node_name)
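
# A minimal usage sketch of grapher with hypothetical usernames; this helper
# is for illustration only and is never called by the pipeline below.
def _demo_grapher():
    g = grapher()
    g.add_edge_wrapper("alice", "bob")  # "alice" mentioned "bob"
    g.add_node("carol")                 # "carol" posted but mentioned nobody
    assert g.graph.vcount() == 3        # Alice, Bob, Carol
    assert g.graph.ecount() == 1        # the single mention edge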

def get_database(where=None):
    """
    Pulls rows from the parler_messages table, optionally filtered with a SQL
    WHERE clause.
    """
    message_db = easy_db.DataBase(db_path)

    if where is None:
        return message_db.pull("parler_messages")
    else:
        return message_db.pull_where("parler_messages", where)
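
# Example usage (mirrors the export loop below; 'someUser' is a placeholder):
#   posts = get_database("username='someUser' COLLATE NOCASE")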

def get_mentioned_usernames_from_post(post):
    """
    Returns the usernames @-mentioned in a post body. The capture group
    excludes the leading "@", so no further sanitising is needed.
    """
    return re.findall(r"@([a-zA-Z\-_]+)", post)
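
# e.g. get_mentioned_usernames_from_post("thanks @Alice and @bob_smith!")
# returns ["Alice", "bob_smith"]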

# Column headers for the LIWC 2015 results CSVs. "A", "B", and "C" appear to
# hold the username, the message date, and the message body; the rest are
# LIWC 2015 category scores.
LIWC_FIELDNAMES = ["A", "B", "C", "WC", "Analytic", "Clout", "Authentic", "Tone",
                   "WPS", "Sixltr", "Dic", "function", "pronoun", "ppron", "i",
                   "we", "you", "shehe", "they", "ipron", "article", "prep",
                   "auxverb", "adverb", "conj", "negate", "verb", "adj",
                   "compare", "interrog", "number", "quant", "affect", "posemo",
                   "negemo", "anx", "anger", "sad", "social", "family", "friend",
                   "female", "male", "cogproc", "insight", "cause", "discrep",
                   "tentat", "certain", "differ", "percept", "see", "hear",
                   "feel", "bio", "body", "health", "sexual", "ingest", "drives",
                   "affiliation", "achieve", "power", "reward", "risk",
                   "focuspast", "focuspresent", "focusfuture", "relativ",
                   "motion", "space", "time", "work", "leisure", "home", "money",
                   "relig", "death", "informal", "swear", "netspeak", "assent",
                   "nonflu", "filler", "AllPunc", "Period", "Comma", "Colon",
                   "SemiC", "QMark", "Exclam", "Dash", "Quote", "Apostro",
                   "Parenth", "OtherP"]


def get_rows_from_csv_where_field_is(csv_name, username, month):
    """
    Returns all rows in a LIWC 2015 results CSV whose first column ("A")
    contains the given username and whose second column ("B") contains the
    given month.
    """
    rows = []

    with open(csv_name, 'rt', encoding="utf8") as f:
        for row in csv.DictReader(f, fieldnames=LIWC_FIELDNAMES):
            if username.strip().lower() in row["A"].strip().lower() \
                    and month.strip().lower() in row["B"].strip().lower():
                rows.append(row)

    return rows
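
# Example usage (mirrors the call in the scoring loop below; the month string
# is whatever format the month graph keys use):
#   rows = get_rows_from_csv_where_field_is(
#       "data/LIWC2015 Results (all-messages.csv).csv", user, month)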

year_range = list(range(2017, 2022))
month_range = list(range(1, 13))

INITIAL_COMMUNITIES_FILE_NAME = "phase_one_communities_file.pickle"
SECOND_COMMUNITIES_FILE_NAME = "phase_two_communities_file.pickle"

print("Loading old {} file".format(INITIAL_COMMUNITIES_FILE_NAME))
with open(INITIAL_COMMUNITIES_FILE_NAME, "rb") as pickle_file:
    month_graphs = pickle.load(pickle_file)
print("loaded...")

# Get communities
month_graph_keys = sorted(month_graphs.keys())
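
# month_graphs maps each month key to the grapher instance built for that
# month in phase one; its .graph attribute holds the user-interaction graph.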

list_of_community_objects = []

# Get the top-10 degree-centrality users per month of Parler data
if not os.path.isfile(SECOND_COMMUNITIES_FILE_NAME):

    dict_of_centrality_per_month = {}
    dict_of_user_count_per_month = {}
    dict_of_shrinkage = {}
    total_unique_user_list = []
    total_users = []
    highest_centrality = 0
    highest_centrality_user = None
    date_of_highest_centrality = None
    dict_of_messages = {}
    number_of_users_dict = {}
    highest_number_of_users = 0
    highest_number_of_users_month = None
    shrinkage_per_month = {}
    last_month = None
    all_months_centrality = {}
    all_centralities = {}

    for month_key in month_graph_keys:
        print("Reviewing graph for date '{}'".format(month_key))
        graph = month_graphs[month_key].graph

        user_nodes = graph.nodes.keys()
        print("users {}".format(len(user_nodes)))

        centrality_for_month = nx.degree_centrality(graph)
        all_centralities[month_key] = centrality_for_month

        # Keep the ten users with the highest degree centrality this month
        if len(centrality_for_month) > 0:
            sorted_list = sorted(centrality_for_month, key=centrality_for_month.get, reverse=True)[:10]
            all_months_centrality[month_key] = sorted_list
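
    # For intuition: on a toy graph nx.Graph([("Alice", "Bob"), ("Alice", "Carol")]),
    # nx.degree_centrality returns {"Alice": 1.0, "Bob": 0.5, "Carol": 0.5}
    # (degree divided by n - 1), so Alice would top the sorted list above.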

    unique_users = {}
    for month in all_months_centrality:
        for user in all_months_centrality[month]:
            unique_users.setdefault(user, []).append(
                {"month": month, "centrality": all_centralities[month][user]})

    pprint(unique_users)
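
    # unique_users now maps each top-10 user to the months in which they made
    # the cut, e.g. {"Someuser": [{"month": <month key>, "centrality": <score>}, ...]}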

    # Write every post by each top-centrality user to CSV
    if SHOULD_WRITE_CSVS:
        seen_users = []

        with open('all-messages.json.csv', 'w', encoding='utf8', newline='') as output_file:
            writer = csv.DictWriter(output_file, fieldnames=["username", "timestamp", "message"])

            for month in all_months_centrality:
                graph = month_graphs[month]

                for user in all_months_centrality[month]:
                    if user not in seen_users:
                        seen_users.append(user)

                        # Get posts from the database where username == user;
                        # users above the threshold are classed as extremist.
                        if user != "-":
                            print("getting posts for user '{}'".format(user))
                            posts = get_database("username='{}' COLLATE NOCASE".format(user))
                            print("Posts found: {}".format(len(posts)))

                            if posts is None:
                                raise Exception("no posts, 'where' failed")

                            for post in posts:
                                writer.writerow({"username": post["username"],
                                                 "timestamp": post["Time"],
                                                 "message": post["body"]})

    model = random_forest()
    model.train_model(features_file=None, force_new_dataset=False,
                      model_location=r"far-right-baseline.model")

    dict_of_users_all = {}

    feature_extractor = Pinpoint.FeatureExtraction.feature_extraction(
        violent_words_dataset_location="swears",
        baseline_training_dataset_location="data/LIWC2015 Results (Storm_Front_Posts).csv")
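
    # random_forest and feature_extraction come from the Pinpoint package: the
    # model appears to be a classifier trained against a far-right baseline,
    # and the extractor turns LIWC-scored posts into the feature representation
    # the model expects.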

    # Get the is-extremist score for each user for the months in which they
    # had the highest centrality
    for month in all_months_centrality:
        for user in all_months_centrality[month]:
            print("Getting data for user {} and month {}".format(user, month))

            # Get the LIWC-scored rows for this user and month
            rows = get_rows_from_csv_where_field_is("data/LIWC2015 Results (all-messages.csv).csv", user, month)
            pprint(rows)

            if len(rows) <= 1:
                print("Not enough rows for {} {}".format(user, month))
                continue

            # Write these rows to a new (temporary) CSV for feature extraction
            keys = rows[0].keys()
            with open('temp.csv', 'w', newline='', encoding='utf8') as output_file:
                dict_writer = csv.DictWriter(output_file, keys)
                dict_writer.writeheader()
                dict_writer.writerows(rows)

            feature_extractor._reset_stored_feature_data()
            feature_extractor._get_type_of_message_data(data_set_location="temp.csv")

            with open("messages.json", 'w') as outfile:
                json.dump(feature_extractor.completed_tweet_user_features, outfile, indent=4)

            rows = model.get_features_as_df("messages.json", True)
            print("Length of rows returned: {}".format(len(rows)))

            number_of_connections = 0
            number_of_connections_extremist = 0
            is_extremist_count = 0

            for row in rows:
                post = row["C"]
                is_extremist = model.model.predict(post)
                print("Post '{}...' is extremist {}".format(post[:20], is_extremist))

                if is_extremist:
                    is_extremist_count = is_extremist_count + 1

            # If we were to score mentioned users we'd need to mark up with
            # LIWC again. Could I use the less reliable version without LIWC?

            # Fraction of this user's scored posts classified as extremist,
            # e.g. 3 extremist posts out of 12 rows -> 0.25
            if is_extremist_count != 0:
                percentage_extremist = is_extremist_count / len(rows)
            else:
                percentage_extremist = 0

            if user not in dict_of_users_all:
                dict_of_users_all[user] = {"months": {}}

            dict_of_users_all[user]["months"][month] = percentage_extremist

            with open('data.json', 'w') as fp:
                json.dump(dict_of_users_all, fp)
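
    # data.json ends up keyed by user, e.g.
    # {"Someuser": {"months": {<month key>: <fraction of posts classed extremist>}}}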

# TODO / notes:
# - Mark up the CSV with LIWC scores.
# - Number of unique users: manual review caps at around 100 (fewer users);
#   otherwise the count doesn't really matter.
# - Classed as radicalised? Look at the accounts and posts: what are they up
#   to over time? Are any posts far right, or mostly extremist material?
# - When looking at connections, apply the same approach as above, both at the
#   time period of the mention and overall.
# - When have they been active, what months are they extremist, how often,
#   common words or phrases, etc.
#
# Sketch of a possible per-user record:
#
#   users_of_interest[user] = {
#       "centrality": month[user],
#       "is_extremist": ...,
#       "is_connections_extremist": ...,
#   }
#
# - Radicalisation window?
# - Use high-centrality users that are extremist and look at their work.