import ast
import base64
import codecs
import csv
import gc
import json
import os
import pickle
import re
import shutil
import time
import numpy as np
import pandas as pd
import uuid
from scipy.spatial import distance
from Pinpoint.Aggregator_NGram import n_gram_aggregator
from Pinpoint.Aggregator_TfIdf import tf_idf_aggregator
from Pinpoint.Aggregator_Word2Vec import word_2_vec_aggregator
from Pinpoint.Aggregator_WordingChoice import wording_choice_aggregator
from Pinpoint.Grapher import grapher
from Pinpoint.Logger import logger
from Pinpoint.Sanitizer import sanitization, sys
class feature_extraction():
"""
This class is used to wrap the functionality of aggregating tweets from CSV files and extracting features pertinent
to building a random forest extremist classifier.
"""
# A graph used to store connections between aggregated users
graph = grapher()
archived_graphs = [] # an archive of the previous graphs
# A list storing dictionaries of user ids and their features.
tweet_user_features = []
completed_tweet_user_features = [] # has centrality added
# the global TF IDF model used for the Word 2 Vec model
saved_tf_idf_model = None
# A dictionary used for the translation of actual Twitter username to UUID
dict_of_users = {}
# The max size for all data entries (i.e. baseline tweets)
MAX_RECORD_SIZE = sys.maxsize # 3050
# Datasets for training
violent_words_dataset_location = None
tf_idf_training_dataset_location = None
outputs_location = None
# Used for knowing which columns to access data from. For Twitter data.
# Summary variables
DEFAULT_USERNAME_COLUMN_ID = 0
DEFAULT_DATE_COLUMN_ID = 1
DEFAULT_MESSAGE_COLUMN_ID = 2
DEFAULT_ANALYTIC_COLUMN_ID = 4
DEFAULT_CLOUT_COLUMN_ID = 5
DEFAULT_AUTHENTIC_COLUMN_ID = 6
DEFAULT_TONE_COLUMN_ID = 7
# Emotional Analysis
DEFAULT_ANGER_COLUMN_ID = 36
DEFAULT_SADNESS_COLUMN_ID = 37
DEFAULT_ANXIETY_COLUMN_ID = 35
# Personal Drives:
DEFAULT_POWER_COLUMN_ID = 62
DEFAULT_REWARD_COLUMN_ID = 63
DEFAULT_RISK_COLUMN_ID = 64
DEFAULT_ACHIEVEMENT_COLUMN_ID = 61
DEFAULT_AFFILIATION_COLUMN_ID = 60
# Personal pronouns
DEFAULT_P_PRONOUN_COLUMN_ID = 13
DEFAULT_I_PRONOUN_COLUMN_ID = 19
# Constants for the fields in the baseline data set (i.e. ISIS magazine/ Stormfront, etc)
DEFAULT_BASELINE_MESSAGE_COLUMN_ID = 5
# Summary variables
DEFAULT_BASELINE_CLOUT_COLUMN_ID = 10
DEFAULT_BASELINE_ANALYTIC_COLUMN_ID = 9
DEFAULT_BASELINE_TONE_COLUMN_ID = 12
DEFAULT_BASELINE_AUTHENTIC_COLUMN_ID = 11
# Emotional Analysis
DEFAULT_BASELINE_ANGER_COLUMN_ID = 41
DEFAULT_BASELINE_SADNESS_COLUMN_ID = 42
DEFAULT_BASELINE_ANXIETY_COLUMN_ID = 40
# Personal Drives
DEFAULT_BASELINE_POWER_COLUMN_ID = 67
DEFAULT_BASELINE_REWARD_COLUMN_ID = 68
DEFAULT_BASELINE_RISK_COLUMN_ID = 69
DEFAULT_BASELINE_ACHIEVEMENT_COLUMN_ID = 66
DEFAULT_BASELINE_AFFILIATION_COLUMN_ID = 65
# Personal pronouns
DEFAULT_BASELINE_P_PRONOUN_COLUMN_ID = 18
DEFAULT_BASELINE_I_PRONOUN_COLUMN_ID = 24
# Used for Minkowski distance
_average_clout = 0
_average_analytic = 0
_average_tone = 0
_average_authentic = 0
_average_anger = 0
_average_sadness = 0
average_anxiety = 0
average_power = 0
average_reward = 0
average_risk = 0
average_achievement = 0
average_affiliation = 0
average_p_pronoun = 0
average_i_pronoun = 0
    # Used to cache messages to free memory
MESSAGE_TMP_CACHE_LOCATION = "message_cache"
def __init__(self, violent_words_dataset_location=None
, baseline_training_dataset_location=None,
outputs_location=r"outputs"):
"""
Constructor
The feature_extraction() class can be initialised with violent_words_dataset_location,
tf_idf_training_dataset_location, and outputs_location locations. All files in the violent_words_dataset_location
will be read (one line at a time) and added to the corpus of violent and swear words. The csv file at
baseline_training_dataset_location is used to train the TFIDF model and a Minkowski distance score is calculated based on the LIWC scores present.
If the constant variable need to be changed, do this by setting the member variables.
"""
# Error if datasets not provided
if violent_words_dataset_location is None:
raise Exception("No Violent Words dir provided. Provide a directory that contains new line seperated "
"files where each line is a violent, extremist, etc word")
if baseline_training_dataset_location is None:
raise Exception("No baseline (TF-IDF/ Minkowski) dataset provided. Thus should be a csv file containing "
"extremist content and LIWC scores.")
# Set datasets to member variables
self.violent_words_dataset_location = violent_words_dataset_location
self.tf_idf_training_dataset_location = baseline_training_dataset_location
self.outputs_location = outputs_location
        # Attempt to make the outputs folder if it doesn't exist
        os.makedirs(outputs_location, exist_ok=True)
    def _reset_stored_feature_data(self):
        """
        Resets member variables from a previous run. Importantly, does not reset the TF-IDF model.
        :return:
        """
        # A graph used to store connections between aggregated users
        self.graph = grapher()
        self.archived_graphs = []  # an archive of the previous graphs
        # A list storing dictionaries of user ids and their features.
        self.tweet_user_features = []
        self.completed_tweet_user_features = []  # has centrality added
        # A dictionary used for the translation of actual Twitter usernames to UUIDs
        self.dict_of_users = {}
# Used for Minkowski distance
self._average_clout = 0
self._average_analytic = 0
self._average_tone = 0
self._average_authentic = 0
self._average_anger = 0
self._average_sadness = 0
self.average_anxiety = 0
self.average_power = 0
self.average_reward = 0
self.average_risk = 0
self.average_achievement = 0
self.average_affiliation = 0
self.average_p_pronoun = 0
self.average_i_pronoun = 0
def _get_unique_id_from_username(self, username):
"""
A function used to retrieve a UUID based on a twitter username. If a username has been used before the same UUID
will be returned as it is stored in a dictionary.
:param username:
:return: a string representation of a UUID relating to a Twitter username
"""
if username in self.dict_of_users:
# username already in dictionary
unique_id = self.dict_of_users[username]
else:
# make new UUID
unique_id = uuid.uuid4().hex
# stops uuid collisions
while unique_id in self.dict_of_users.values():
unique_id = uuid.uuid4().hex
# Add new user id to dictionary
self.dict_of_users[username] = unique_id
# todo it's less efficient writing the whole file every run
path = os.path.join(self.outputs_location, "users.json")
with open(path, 'w') as outfile:
json.dump(self.dict_of_users, outfile)
return unique_id
def _add_to_graph(self, originating_user_name, message):
"""
A wrapper function used for adding a node/ connection to the graph.
:param originating_user_name: the Twitter username
:param message: The Tweet
"""
# Adds node to graph so that if they don't interact with anyone they still have a centrality
self.graph.add_node(originating_user_name)
# Process mentions
        mentions = re.findall(r"@([a-zA-Z\-_]+)", message)
# For all mentions in the tweet add them to the graph as a node
for mention in mentions:
self.graph.add_edge_wrapper(originating_user_name, mention, 1, "mention")
# process hashtags
        hashtags = re.findall(r"#([a-zA-Z\-_]+)", message)
# For all hashtags in the tweet add them to the graph as a node
for hashtag in hashtags:
self.graph.add_edge_wrapper(originating_user_name, hashtag, 1, "hashtag")
def _get_capitalised_word_frequency(self, message):
"""
A wrapper function for returning the frequency of capitalised words in a message.
:param message:
:return: the frequency of capitalised words in a message.
"""
return wording_choice_aggregator().get_frequency_of_capatalised_words(
message) # NEEDS TO BE DONE before lower case
def _get_violent_word_frequency(self, message):
"""
A wrapper function used to retrieve the frequency of violent words in a message.
:param message: a string representation of a social media message
:return: The frequency of violent words in the message
"""
return wording_choice_aggregator().get_frequency_of_violent_or_curse_words(message,
self.violent_words_dataset_location)
def _get_tweet_vector(self, message):
"""
        A wrapper function used to retrieve the 200-element vector representation (average and max vectors
        concatenated) of a message.
        :param message: a string representation of a message
        :return: a 200-element vector of the tweet
"""
vectors = []
tf_idf_model = self._get_tf_idf_model()
for word in message.split(" "):
# todo add back word = sanitization().sanitize(word, self.outputs_location, force_new_data_and_dont_persisit=True)
try:
vectors.append(tf_idf_model.wv[word])
logger().print_message("Word '{}' in vocabulary...".format(word))
            except KeyError as e:
                logger().print_message(e)
                logger().print_message("Word '{}' not in vocabulary...".format(word))
# Lists of the values used to store the max and average vector values
max_value_list = []
average_value_list = []
        # Check if at least one word in the message is in the vocabulary of the model
        final_array_of_vectors = np.zeros(100)
if len(vectors) > 0:
# Loop through the elements in the vectors
for iterator in range(vectors[0].size):
list_of_all_values = []
# Loop through each vector
for vector in vectors:
value = vector[iterator]
list_of_all_values.append(value)
average_value = sum(list_of_all_values) / len(list_of_all_values)
max_value = max(list_of_all_values)
max_value_list.append(max_value)
average_value_list.append(average_value)
            final_array_of_vectors = np.append(np.array([max_value_list]), np.array([average_value_list]))
        # Convert the numpy array to a plain list
        list_of_vectors = list(final_array_of_vectors)
        return list_of_vectors
def _process_tweet(self, user_name, message, row):
"""
Wrapper function for taking a username and tweet and extracting the features.
:param user_name:
:param message:
:return: a dictionary of all features from the message
"""
self._add_to_graph(user_name, message)
features_dict = {"cap_freq": self._get_capitalised_word_frequency(message),
"violent_freq": self._get_violent_word_frequency(message),
"message_vector": self._get_tweet_vector(message)}
return features_dict
def _get_average_liwc_scores_for_baseline_data(self):
"""
        Calculates the average LIWC scores across the baseline dataset; these averages are used when computing
        the Minkowski distance for each message.
"""
# Checks if the values have already been set this run, if so don't calculate again
# TODO what of the edge case where average clout is 0?
if self._average_clout == 0:
logger.print_message("Opening dataset {} for LIWC feature extraction and Minkowski distance".format(
self.tf_idf_training_dataset_location))
baseline_data_set_name = self.tf_idf_training_dataset_location
clout_list = []
analytic_list = []
tone_list = []
authentic_list = []
anger_list = []
sadness_list = []
anxiety_list = []
power_list = []
reward_list = []
risk_list = []
achievement_list = []
affiliation_list = []
p_pronoun_list = []
i_pronoun_list = []
with open(baseline_data_set_name, 'r', encoding='cp1252') as file:
reader = csv.reader(file)
is_header = True
for row in reader:
if is_header:
is_header = False
continue
# Try and access columns, if can't then LIWC fields haven't been set and should be set to 0
try:
clout = row[self.DEFAULT_BASELINE_CLOUT_COLUMN_ID]
analytic = row[self.DEFAULT_BASELINE_ANALYTIC_COLUMN_ID]
tone = row[self.DEFAULT_BASELINE_TONE_COLUMN_ID]
authentic = row[self.DEFAULT_BASELINE_AUTHENTIC_COLUMN_ID]
anger = row[self.DEFAULT_BASELINE_ANGER_COLUMN_ID]
sadness = row[self.DEFAULT_BASELINE_SADNESS_COLUMN_ID]
anxiety = row[self.DEFAULT_BASELINE_ANXIETY_COLUMN_ID]
power = row[self.DEFAULT_BASELINE_POWER_COLUMN_ID]
reward = row[self.DEFAULT_BASELINE_REWARD_COLUMN_ID]
risk = row[self.DEFAULT_BASELINE_RISK_COLUMN_ID]
achievement = row[self.DEFAULT_BASELINE_ACHIEVEMENT_COLUMN_ID]
affiliation = row[self.DEFAULT_BASELINE_AFFILIATION_COLUMN_ID]
p_pronoun = row[self.DEFAULT_BASELINE_P_PRONOUN_COLUMN_ID]
i_pronoun = row[self.DEFAULT_BASELINE_I_PRONOUN_COLUMN_ID]
                    except Exception:
clout = 0
analytic = 0
tone = 0
authentic = 0
anger = 0
sadness = 0
anxiety = 0
power = 0
reward = 0
risk = 0
achievement = 0
affiliation = 0
p_pronoun = 0
i_pronoun = 0
clout_list.append(float(clout))
analytic_list.append(float(analytic))
tone_list.append(float(tone))
authentic_list.append(float(authentic))
anger_list.append(float(anger))
sadness_list.append(float(sadness))
anxiety_list.append(float(anxiety))
power_list.append(float(power))
reward_list.append(float(reward))
risk_list.append(float(risk))
achievement_list.append(float(achievement))
affiliation_list.append(float(affiliation))
p_pronoun_list.append(float(p_pronoun))
i_pronoun_list.append(float(i_pronoun))
# Get average for variables, used for distance score. These are member variables so that they don't
# have to be re-calculated on later runs
self._average_clout = sum(clout_list) / len(clout_list)
self._average_analytic = sum(analytic_list) / len(analytic_list)
self._average_tone = sum(tone_list) / len(tone_list)
self._average_authentic = sum(authentic_list) / len(authentic_list)
self._average_anger = sum(anger_list) / len(anger_list)
self._average_sadness = sum(sadness_list) / len(sadness_list)
self.average_anxiety = sum(anxiety_list) / len(anxiety_list)
self.average_power = sum(power_list) / len(power_list)
self.average_reward = sum(reward_list) / len(reward_list)
self.average_risk = sum(risk_list) / len(risk_list)
self.average_achievement = sum(achievement_list) / len(achievement_list)
self.average_affiliation = sum(affiliation_list) / len(affiliation_list)
self.average_p_pronoun = sum(p_pronoun_list) / len(p_pronoun_list)
self.average_i_pronoun = sum(i_pronoun_list) / len(i_pronoun_list)
return [self._average_clout, self._average_analytic, self._average_tone, self._average_authentic,
self._average_anger, self._average_sadness, self.average_anxiety,
self.average_power, self.average_reward, self.average_risk, self.average_achievement,
self.average_affiliation,
self.average_p_pronoun, self.average_i_pronoun]
def _get_tf_idf_model(self):
"""
A function used to retrieve the TFIDF model trained on the extremist dataset. If the model has already been
created then the previously created model will be used.
:return: a TF-IDF model
"""
# if already made model, reuse
if self.saved_tf_idf_model is None:
logger.print_message("Opening dataset {} for TF-IDF".format(self.tf_idf_training_dataset_location))
baseline_data_set_name = self.tf_idf_training_dataset_location
data_set = ""
with open(baseline_data_set_name, 'r', encoding='cp1252') as file:
reader = csv.reader(file)
is_header = True
for row in reader:
if is_header:
is_header = False
continue
# take quote from dataset and add it to dataset
message = row[self.DEFAULT_BASELINE_MESSAGE_COLUMN_ID] # data column
                    data_set = data_set + message + "\n"
# clean data set
# todo should we be doing sanitization clean_data = sanitization().sanitize(data_set, self.outputs_location) # if so remove line below
clean_data = data_set
# get ngrams
uni_grams, bi_grams, tri_grams = n_gram_aggregator().get_ngrams(clean_data)
ngrams = uni_grams + bi_grams + tri_grams
            # todo The TF-IDF most important ngrams aren't being used. Should these be used instead of the other ngrams?
tf_idf_scores = tf_idf_aggregator().get_tf_idf_scores(ngrams, data_set)
number_of_most_important_ngrams = int(len(ngrams) / 2) # number is half all ngrams
list_of_most_important_ngrams = sorted(tf_idf_scores, key=tf_idf_scores.get, reverse=True)[
:number_of_most_important_ngrams]
# create a word 2 vec model
model = word_2_vec_aggregator().get_model(list_of_sentences=list_of_most_important_ngrams)
self.saved_tf_idf_model = model
else:
model = self.saved_tf_idf_model
return model
def open_wrapper(self, location, access_type, list_of_encodings=["utf-8", 'latin-1', 'cp1252']):
"""
        A wrapper around the open() built-in that falls back through a list of encodings until one can read the file.
        :return: an open file handle created with the first encoding that successfully read the file
"""
for encoding in list_of_encodings:
try:
file = open(location, access_type, encoding=encoding)
# Attempt to read file, if fails try other encoding
file.readlines()
file.seek(0)
file.close()
file = open(location, access_type, encoding=encoding)
return file
except LookupError as e:
continue
except UnicodeDecodeError as e:
continue
raise Exception(
"No valid encoding provided for file: '{}'. Encodings provided: '{}'".format(location, list_of_encodings))
def _add_user_post_db_cache(self, user_id, dict_to_add):
"""
Used to add data to the post message db cache used to free up memory.
"""
if not os.path.isdir(self.MESSAGE_TMP_CACHE_LOCATION):
os.mkdir(self.MESSAGE_TMP_CACHE_LOCATION)
# Save file as pickle
file_name = "{}-{}.pickle".format(user_id,int(time.time()))
file_name = os.path.join(self.MESSAGE_TMP_CACHE_LOCATION, file_name)
with open(file_name, 'wb') as pickle_handle:
pickle.dump({"description":"a temporery file used for saving memory",
"data":dict_to_add}, pickle_handle, protocol=pickle.HIGHEST_PROTOCOL)
def _get_user_post_db_cache(self, file_name):
"""
Retrieves data from the cache database used to free up memory.
"""
if not os.path.isdir(self.MESSAGE_TMP_CACHE_LOCATION):
raise Exception("Attempted to access temporery cache files before files are created")
if not os.path.isfile(file_name):
raise Exception("Attempted to access cache file {}, however, it does not exist".format(file_name))
        with open(file_name, "rb") as openfile:
cache_data = pickle.load(openfile)
return cache_data["data"]
    def _delete_user_post_db_cache(self):
        """
        Deletes the temporary message cache directory, if it exists.
        """
        try:
            if os.path.isdir(self.MESSAGE_TMP_CACHE_LOCATION):
                shutil.rmtree(self.MESSAGE_TMP_CACHE_LOCATION)
        except OSError:
            pass
    def _get_type_of_message_data(self, data_set_location, has_header=True, is_extremist=None):
        """
        Reads a CSV dataset of social media posts one row at a time, extracts the features for each message
        (LIWC scores, Minkowski distance to the baseline averages, word vectors, and graph centrality), and
        appends the results to completed_tweet_user_features.
        :param data_set_location: path to the CSV dataset
        :param has_header: True if the first row of the CSV file is a header row
        :param is_extremist: the label added to each record (True/ False), or None for unlabelled data
        """
        # Ensure all temp files are deleted
        self._delete_user_post_db_cache()
        # Counts the total rows in the CSV. Used for progress reporting.
        print("Starting entity count. Will count up to '{}' rows".format(self.MAX_RECORD_SIZE))
        # Read one row at a time so that large files don't have to be held in memory
        row_count = 0
        for chunk in pd.read_csv(data_set_location, chunksize=1, header=0 if has_header else None,
                                 encoding='latin-1'):
            row_count = row_count + 1
            if row_count >= self.MAX_RECORD_SIZE:
                break
        print("Finished entity count. Count is: '{}'".format(row_count))
        print("")
        # Loops through all rows in the dataset CSV file, again one row at a time.
        # Pandas consumes the header row itself when has_header is True, so no manual header skipping is needed.
        current_processed_rows = 0
        for chunk in pd.read_csv(data_set_location, chunksize=1, header=0 if has_header else None,
                                 encoding='latin-1'):
            # Convert the single row chunk to a plain list so that columns can be accessed by position
            row = chunk.iloc[0].tolist()
            # Makes sure the same number of rows is read for each dataset
            if current_processed_rows > row_count:
                break
# Retrieve username
try:
username = row[self.DEFAULT_USERNAME_COLUMN_ID]
date = row[self.DEFAULT_DATE_COLUMN_ID]
user_unique_id = self._get_unique_id_from_username(username)
            except Exception:
                # Skip empty or malformed entries
                continue
# Attempt to get LIWC scores from csv, if not present return 0's
try:
# Summary variables
clout = float(row[self.DEFAULT_CLOUT_COLUMN_ID])
analytic = float(row[self.DEFAULT_ANALYTIC_COLUMN_ID])
tone = float(row[self.DEFAULT_TONE_COLUMN_ID])
authentic = float(row[self.DEFAULT_AUTHENTIC_COLUMN_ID])
# Emotional Analysis
anger = float(row[self.DEFAULT_ANGER_COLUMN_ID])
sadness = float(row[self.DEFAULT_SADNESS_COLUMN_ID])
anxiety = float(row[self.DEFAULT_ANXIETY_COLUMN_ID])
# Personal Drives:
power = float(row[self.DEFAULT_POWER_COLUMN_ID])
reward = float(row[self.DEFAULT_REWARD_COLUMN_ID])
risk = float(row[self.DEFAULT_RISK_COLUMN_ID])
achievement = float(row[self.DEFAULT_ACHIEVEMENT_COLUMN_ID])
affiliation = float(row[self.DEFAULT_AFFILIATION_COLUMN_ID])
# Personal pronouns
i_pronoun = float(row[self.DEFAULT_I_PRONOUN_COLUMN_ID])
p_pronoun = float(row[self.DEFAULT_P_PRONOUN_COLUMN_ID])
            except Exception:
# Summary variables
clout = 0
analytic = 0
tone = 0
authentic = 0
# Emotional Analysis
anger = 0
sadness = 0
anxiety = 0
# Personal Drives:
power = 0
reward = 0
risk = 0
achievement = 0
affiliation = 0
# Personal pronouns
i_pronoun = 0
p_pronoun = 0
liwc_dict = {
"clout": clout,
"analytic": analytic,
"tone": tone,
"authentic": authentic,
"anger": anger,
"sadness": sadness,
"anxiety": anxiety,
"power": power,
"reward": reward,
"risk": risk,
"achievement": achievement,
"affiliation": affiliation,
"i_pronoun": i_pronoun,
"p_pronoun": p_pronoun,
}
# Calculate minkowski distance
average_row = self._get_average_liwc_scores_for_baseline_data()
actual_row = [clout, analytic, tone, authentic,
anger, sadness, anxiety,
power, reward, risk, achievement, affiliation,
p_pronoun, i_pronoun
]
try:
liwc_dict["minkowski"] = distance.minkowski(actual_row, average_row, 1)
except ValueError:
continue
# Retrieve Tweet for message
tweet = str(row[self.DEFAULT_MESSAGE_COLUMN_ID])
# clean/ remove markup in dataset
sanitised_message = sanitization().sanitize(tweet, self.outputs_location,
force_new_data_and_dont_persisit=True)
            # If there is no message content, skip the entry
            if len(tweet) == 0 or len(sanitised_message.strip()) == 0:
                continue
# Process Tweet and save as dict
tweet_dict = self._process_tweet(user_unique_id, tweet, row)
# If the message vector is not 200 skip (meaning that a blank message was processed)
if not len(tweet_dict["message_vector"]) == 200:
continue
if is_extremist is not None:
tweet_dict["is_extremist"] = is_extremist
tweet_dict["date"] = date
# Merge liwc dict with tweet dict
tweet_dict = {**tweet_dict, **liwc_dict}
#tweet_dict["user_unique_id"]= user_unique_id
self._add_user_post_db_cache(user_unique_id, {user_unique_id: tweet_dict})
#self.tweet_user_features.append()
# TODO here save to cache json instead of list and graph
logger().print_message("Added message from user: '{}', from dataset: '{}'. {} rows of {} completed."
.format(user_unique_id, data_set_location, current_processed_rows, row_count), 1)
current_processed_rows = current_processed_rows + 1
print("Finished reading row")
# Add the centrality (has to be done after all users are added to graph)
completed_tweet_user_features = []
        # Loops through each cached message file; each file represents one message/ tweet
for cached_message_file in os.listdir(self.MESSAGE_TMP_CACHE_LOCATION):
cached_message_file = os.fsdecode(cached_message_file)
cached_message_file = os.path.join(self.MESSAGE_TMP_CACHE_LOCATION,cached_message_file)
# Only process pickle files
if not cached_message_file.endswith(".pickle"):
continue
print("Reading cache file: '{}'".format(cached_message_file))
cached_message_data = self._get_user_post_db_cache(cached_message_file)
# Loops through the data in that tweet (Should only be one entry per tweet).
for user_id in cached_message_data.keys():
updated_entry = {}
updated_entry[user_id] = cached_message_data[user_id]
# Adds centrality
updated_entry[user_id]["centrality"] = self.graph.get_degree_centrality_for_user(user_id)
logger().print_message(
"Added '{}' Centrality for user '{}'".format(updated_entry[user_id]["centrality"], user_id), 1)
completed_tweet_user_features.append(updated_entry)
gc.collect()
                break  # Only one entry per cache file
self._delete_user_post_db_cache()
self.completed_tweet_user_features = self.completed_tweet_user_features + completed_tweet_user_features
self.tweet_user_features = []
#self.archived_graphs.append(self.graph)
self.graph = grapher()
print("Finished messages")
def _get_extremist_data(self, dataset_location):
"""
This function is responsible for aggregating tweets from the extremist dataset, extracting the features, and
saving them to a file for a model to be created.
"""
self._get_type_of_message_data(data_set_location=dataset_location, is_extremist=True)
def _get_counterpoise_data(self, dataset_location):
"""
This function is responsible for aggregating tweets from the counterpoise (related to the topic but from
legitimate sources, e.g. news outlets) dataset, extracting the features, and saving them to a file for a
model to be created.
"""
self._get_type_of_message_data(data_set_location=dataset_location, is_extremist=False)
def _get_standard_tweets(self, dataset_location):
"""
This function is responsible for aggregating tweets from the baseline (random sample of twitter posts)
dataset, extracting the features, and saving them to a file for a model to be created.
"""
self._get_type_of_message_data(data_set_location=dataset_location, is_extremist=False)
def dump_features_for_list_of_datasets(self, feature_file_path_to_save_to, list_of_dataset_locations,
force_new_dataset=True):
"""
        Saves features representing the provided datasets to a JSON file. Designed to be used for testing after a
        model has been created.
        :param feature_file_path_to_save_to: the file path to save the features JSON to
        :param list_of_dataset_locations: a list of CSV dataset file paths to extract features from
        :param force_new_dataset: if True, the features are regenerated even if the output file already exists
        :return:
"""
self._reset_stored_feature_data()
if force_new_dataset or not os.path.isfile(feature_file_path_to_save_to):
for dataset in list_of_dataset_locations:
self._get_type_of_message_data(data_set_location=dataset, is_extremist=None)
with open(feature_file_path_to_save_to, 'w') as outfile:
json.dump(self.completed_tweet_user_features, outfile, indent=4)
else:
with open(feature_file_path_to_save_to, 'r') as file:
data = file.read()
# parse file
self.completed_tweet_user_features = json.loads(data)
def dump_training_data_features(self, feature_file_path_to_save_to, extremist_data_location,
baseline_data_location, force_new_dataset=True):
"""
        The entrypoint function, used to dump all features, for all users in the extremist and baseline datasets,
        to a JSON file.
        :param feature_file_path_to_save_to: the file path to save the features JSON to
        :param extremist_data_location: path to the extremist dataset CSV
        :param baseline_data_location: path to the baseline (random sample of Twitter posts) dataset CSV
        :param force_new_dataset: if True, the features are regenerated even if the output file already exists
"""
self._reset_stored_feature_data()
if force_new_dataset or not os.path.isfile(feature_file_path_to_save_to):
print("Starting baseline messages")
self._get_standard_tweets(baseline_data_location)
print("Starting extremist messages")
self._get_extremist_data(extremist_data_location)
with open(feature_file_path_to_save_to, 'w') as outfile:
json.dump(self.completed_tweet_user_features, outfile, indent=4)
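

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): shows how the class
# above is expected to be driven. All file paths below are hypothetical
# placeholders; substitute a directory of newline-separated violent-word lists
# and LIWC-scored CSV datasets as described in the constructor's docstring.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    extractor = feature_extraction(
        violent_words_dataset_location=r"datasets/violent-words",          # hypothetical directory of word lists
        baseline_training_dataset_location=r"datasets/baseline-liwc.csv",  # hypothetical LIWC-scored CSV
    )

    # Dump training features for a labelled extremist dataset and a baseline (random tweets) dataset
    extractor.dump_training_data_features(
        feature_file_path_to_save_to=r"outputs/training-features.json",
        extremist_data_location=r"datasets/extremist-tweets.csv",
        baseline_data_location=r"datasets/baseline-tweets.csv",
    )

    # Dump unlabelled features for one or more datasets, e.g. for later testing of a trained model
    extractor.dump_features_for_list_of_datasets(
        feature_file_path_to_save_to=r"outputs/test-features.json",
        list_of_dataset_locations=[r"datasets/unseen-tweets.csv"],
        force_new_dataset=True,
    )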