# Pinpoint-Web / Pinpoint_Internal / centrality-v2.py
import csv
import itertools
import json
import os
import pickle
import re
from operator import itemgetter
from pprint import pprint

import easy_db
import networkx as nx
from igraph import Graph  # used by the grapher wrapper class below

import Pinpoint.FeatureExtraction
from Pinpoint.RandomForest import *
db_path = "../new-new-just-posts-and-clean-dates-parler-messages.db"

# Truncate the community log left over from any previous run
log_file = open("community_logs.txt", 'w')
log_file.write("")
log_file.close()

used_names = []

SHOULD_WRITE_CSVS = False
class grapher():
    """
    A wrapper class used for generating a graph of interactions between users.
    """
    graph = None

    def __init__(self):
        """
        Constructor.
        """
        self.graph = Graph()
    def add_edge_wrapper(self, node_1_name, node_2_name, weight=1, relationship=None):
        """
        A wrapper function used to add an edge connection or node.
        :param node_1_name: the username the interaction is from
        :param node_2_name: the username the interaction is to
        :param weight: edge weight (currently unused)
        :param relationship: interaction type (currently unused)
        :return:
        """
        # Get node one's ID, creating the vertex if it has not been seen before
        node_1 = None
        for node in self.graph.vs:
            if node["label"] == node_1_name.capitalize():
                node_1 = node

        if node_1 is None:
            self.graph.add_vertices(1)
            node_count = self.graph.vcount()
            self.graph.vs[node_count - 1]["id"] = node_count - 1
            self.graph.vs[node_count - 1]["label"] = node_1_name.capitalize()
            node_1 = self.graph.vs[node_count - 1]

        # Get node two's ID, creating the vertex if it has not been seen before
        node_2 = None
        for node in self.graph.vs:
            if node["label"] == node_2_name.capitalize():
                node_2 = node

        if node_2 is None:
            self.graph.add_vertices(1)
            node_count = self.graph.vcount()
            self.graph.vs[node_count - 1]["id"] = node_count - 1
            self.graph.vs[node_count - 1]["label"] = node_2_name.capitalize()
            node_2 = self.graph.vs[node_count - 1]

        self.graph.add_edges([(node_1["id"], node_2["id"])])
    def add_node(self, node_name):
        """
        A wrapper function that adds a node with no edges to the graph.
        :param node_name: the username to add
        """
        node_1 = None
        for node in self.graph.vs:
            if node["label"] == node_name.capitalize():
                node_1 = node["id"]

        if node_1 is None:
            self.graph.add_vertices(1)
            node_count = self.graph.vcount()
            self.graph.vs[node_count - 1]["id"] = node_count - 1
            self.graph.vs[node_count - 1]["label"] = node_name.capitalize()
            node_1 = self.graph.vs[node_count - 1]
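
# Illustrative usage of grapher (a minimal sketch, not part of the pipeline
# below; the usernames are hypothetical):
#   g = grapher()
#   g.add_edge_wrapper("alice", "bob")  # creates both vertices and one edge
#   g.add_node("carol")                 # creates an isolated vertex
#   g.graph.vcount(), g.graph.ecount()  # -> (3, 1)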
def get_database(where=None):
    """
    Pulls Parler messages from the SQLite database, optionally filtered by a
    SQL WHERE clause.
    """
    message_db = easy_db.DataBase(db_path)

    if where is None:
        return message_db.pull("parler_messages")
    else:
        return message_db.pull_where("parler_messages", where)
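
# Illustrative usage (hedged: "SomeUser" is a hypothetical username; the same
# WHERE clause shape is used later in this script):
#   posts = get_database("username='SomeUser' COLLATE NOCASE")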
def get_mentioned_usernames_from_post(post):
    """
    Returns the list of @-mentioned usernames found in a post body.
    """
    # Process mentions
    mentions = re.findall(r"@([a-zA-Z\-_]+)", post)

    sanitised_list = []
    for mention in mentions:
        mention = mention.replace("@", "")
        sanitised_list.append(mention)

    return sanitised_list
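
# Illustrative usage (hypothetical post body):
#   get_mentioned_usernames_from_post("thanks @John-Doe and @jane_d!")
#   -> ['John-Doe', 'jane_d']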
def get_rows_from_csv_where_field_is(csv_name, username, month):
    """
    Returns all rows of a LIWC results CSV whose first column ("A") contains
    the given username and whose second column ("B") contains the given month.
    """
    rows = []

    with open(csv_name, 'rt', encoding="utf8") as f:
        for row in csv.DictReader(f, fieldnames=["A", "B", "C", "WC", "Analytic", "Clout", "Authentic", "Tone", "WPS",
                                                 "Sixltr", "Dic", "function", "pronoun", "ppron", "i", "we", "you",
                                                 "shehe", "they", "ipron", "article", "prep", "auxverb", "adverb",
                                                 "conj", "negate", "verb", "adj", "compare", "interrog", "number",
                                                 "quant", "affect", "posemo", "negemo", "anx", "anger", "sad",
                                                 "social", "family", "friend", "female", "male", "cogproc", "insight",
                                                 "cause", "discrep", "tentat", "certain", "differ", "percept", "see",
                                                 "hear", "feel", "bio", "body", "health", "sexual", "ingest", "drives",
                                                 "affiliation", "achieve", "power", "reward", "risk", "focuspast",
                                                 "focuspresent", "focusfuture", "relativ", "motion", "space", "time",
                                                 "work", "leisure", "home", "money", "relig", "death", "informal",
                                                 "swear", "netspeak", "assent", "nonflu", "filler", "AllPunc",
                                                 "Period", "Comma", "Colon", "SemiC", "QMark", "Exclam", "Dash",
                                                 "Quote", "Apostro", "Parenth", "OtherP"]):
            if username.strip().lower() in row["A"].strip().lower() \
                    and month.strip().lower() in row["B"].strip().lower():
                rows.append(row)

    return rows
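
# Illustrative usage (hedged: "SomeUser" and the month key format are
# hypothetical; the real keys come from the pickled month graphs):
#   rows = get_rows_from_csv_where_field_is(
#       "data/LIWC2015 Results (all-messages.csv).csv", "SomeUser", "2020-11")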
month_graphs = {}
year_range = list(range(2017, 2022))
month_range = list(range(1, 13))

INITIAL_COMMUNITIES_FILE_NAME = "phase_one_communities_file.pickle"
SECOND_COMMUNITIES_FILE_NAME = "phase_two_communities_file.pickle"

print("Loading old {} file".format(INITIAL_COMMUNITIES_FILE_NAME))
with open(INITIAL_COMMUNITIES_FILE_NAME, "rb") as pickle_file:
    month_graphs = pickle.load(pickle_file)
print("loaded...")
# Get communities
month_graph_keys = list(month_graphs.keys())
month_graph_keys.sort()

list_of_community_objects = []

# Get the top-10 centrality users per month of Parler data
if not os.path.isfile(SECOND_COMMUNITIES_FILE_NAME):
    dict_of_centrality_per_month = {}
    dict_of_user_count_per_month = {}
    dict_of_shrinkage = {}
    total_unique_user_list = []
    total_users = []
    highest_centrality = 0
    highest_centrality_user = None
    date_of_highest_centrality = None
    dict_of_messages = {}
    number_of_users_dict = {}
    highest_number_of_users = 0
    highest_number_of_users_month = None
    shrinkage_per_month = {}
    last_month = None
    all_months_centrality = {}
    all_centralities = {}
    for month_key in month_graph_keys:
        print("Reviewing graph for date '{}'".format(month_key))
        graph = month_graphs[month_key].graph
        user_nodes = graph.nodes.keys()
        print("users {}".format(len(user_nodes)))

        centrality_for_month = nx.degree_centrality(graph)
        all_centralities[month_key] = centrality_for_month

        # Sort by centrality and keep the ten most central users for the month
        if len(centrality_for_month) > 0:
            sorted_list = sorted(centrality_for_month, key=centrality_for_month.get, reverse=True)[:10]
            all_months_centrality[month_key] = sorted_list
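
    # networkx's degree_centrality normalises by the number of other nodes, so a
    # user's score is degree / (n - 1); e.g. in a 100-node graph, a user who
    # interacted with 5 others scores 5/99 ≈ 0.051.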
    # Collate, for every user that appears in any month's top ten, the months
    # they appeared in and their centrality in each
    unique_users = {}
    for month in all_months_centrality:
        for user in all_months_centrality[month]:
            if user not in unique_users.keys():
                unique_users[user] = [{"month": month, "centrality": all_centralities[month][user]}]
            else:
                unique_users[user].append({"month": month, "centrality": all_centralities[month][user]})

    pprint(unique_users)
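
    # The resulting structure maps each top-ten user to the months they
    # appeared in, e.g. (illustrative values, not real output):
    #   {"SomeUser": [{"month": "2020-11", "centrality": 0.42}, ...]}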
    # Write every top-ten user's posts to a CSV for later LIWC markup
    if SHOULD_WRITE_CSVS:
        seen_users = []

        with open('all-messages.json.csv', 'w', encoding='utf8', newline='') as output_file:
            writer = csv.DictWriter(output_file, fieldnames=["username", "timestamp", "message"])

            for month in all_months_centrality:
                graph = month_graphs[month]

                for user in all_months_centrality[month]:
                    if user not in seen_users:
                        seen_users.append(user)

                        # Get from database where username == user and month == month,
                        # loop through messages; if above a threshold the user is extremist
                        if user != "-":
                            print("getting posts for user '{}'".format(user))
                            posts = get_database("username='{}' COLLATE NOCASE".format(user))

                            if posts is None:
                                raise Exception("no posts, 'where' failed")
                            print("Posts found: {}".format(len(posts)))

                            for post in posts:
                                # users_mentioned = get_mentioned_usernames_from_post(post["body"])
                                writer.writerow({"username": post["username"],
                                                 "timestamp": post["Time"],
                                                 "message": post["body"]})
    # Set up the random forest model and the feature extractor
    model = random_forest()
    model.train_model(features_file=None, force_new_dataset=False, model_location=r"far-right-baseline.model")

    dict_of_users_all = {}
    feature_extractor = Pinpoint.FeatureExtraction.feature_extraction(
        violent_words_dataset_location="swears",
        baseline_training_dataset_location="data/LIWC2015 Results (Storm_Front_Posts).csv")
    # Get the is-extremist score for users for the months they were in the
    # highest-centrality top ten
    for month in all_months_centrality:
        for user in all_months_centrality[month]:
            print("Getting data for user {} and month {}".format(user, month))

            # Get the LIWC rows for this user and month
            rows = get_rows_from_csv_where_field_is("data/LIWC2015 Results (all-messages.csv).csv", user, month)
            pprint(rows)

            if len(rows) <= 1:
                print("Not enough rows for {} {}".format(user, month))
                continue

            # Write these rows to a new (temporary) CSV for the feature extractor
            keys = rows[0].keys()
            with open('temp.csv', 'w', newline='', encoding='utf8') as output_file:
                dict_writer = csv.DictWriter(output_file, keys)
                dict_writer.writeheader()
                dict_writer.writerows(rows)

            feature_extractor._reset_stored_feature_data()
            feature_extractor._get_type_of_message_data(data_set_location="temp.csv")

            with open("messages.json", 'w') as outfile:
                json.dump(feature_extractor.completed_tweet_user_features, outfile, indent=4)

            rows = model.get_features_as_df("messages.json", True)
            print("Length of rows returned: {}".format(len(rows)))

            number_of_connections = 0
            number_of_connections_extremist = 0
            is_extremist_count = 0

            for row in rows:
                post = row["C"]
                is_extremist = model.model.predict(post)
                print("Post '{}...' is extremist {}".format(post[:20], is_extremist))

                if is_extremist:
                    is_extremist_count = is_extremist_count + 1
                # If we were to score mentioned users we'd need to mark them up
                # with LIWC again. Could I use the less reliable version without LIWC?

            # Fraction of this user's posts that were classified as extremist
            if is_extremist_count != 0:
                percentage_extremist = is_extremist_count / len(rows)
            else:
                percentage_extremist = 0

            if user not in dict_of_users_all:
                dict_of_users_all[user] = {"months": {}}

            if "months" in dict_of_users_all[user].keys():
                dict_of_users_all[user]["months"][month] = percentage_extremist

            # Persist the scores after every user so partial progress is kept
            with open('data.json', 'w') as fp:
                json.dump(dict_of_users_all, fp)
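
    # A minimal sketch of how data.json could be read back for the follow-up
    # analysis outlined in the notes below (hedged: the 0.5 threshold is an
    # assumption, not a value used elsewhere in this script):
    #   with open('data.json') as fp:
    #       scores = json.load(fp)
    #   for user, record in scores.items():
    #       extremist_months = {m: s for m, s in record["months"].items() if s > 0.5}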
# Follow-up notes:
# - Mark up the CSV with LIWC scores.
# - Number of unique users: manually cap at 100 (fewer users), otherwise it
#   doesn't really matter.
# - Classed as radicalised? Look at the accounts and posts: what are they up to
#   over time, are any posts far right, is it mostly extremist material?
# - When looking at connections, apply the same as above, both at the time
#   period of the mention and overall.
# - Create the CSV writer.
# - When have they been active, which months are they extremist, how often,
#   common words or phrases, etc.
'''users_of_interest[user] = {
    "centrality": month[user],
    "is_extremist":,
    "is_connections_extremist":,
}
'''
# Radicalisation window?
# Use high-centrality users that are extremist and look at the work.