import csv
import gc
import json
import os
import pickle
import re
import shutil
import time

import numpy
import pandas as pd
import uuid
from scipy.spatial import distance

from Pinpoint.Aggregator_NGram import n_gram_aggregator
from Pinpoint.Aggregator_TfIdf import tf_idf_aggregator
from Pinpoint.Aggregator_Word2Vec import word_2_vec_aggregator
from Pinpoint.Aggregator_WordingChoice import wording_choice_aggregator
from Pinpoint.Grapher import grapher
from Pinpoint.Logger import logger
from Pinpoint.Sanitizer import sanitization, sys


class feature_extraction():
    """
    This class is used to wrap the functionality of aggregating tweets from CSV files and extracting features pertinent
    to building a random forest extremist classifier.
    """

    # A graph used to store connections between aggregated users
    graph = grapher()
    archived_graphs = []  # an archive of the previous graphs
    # A list storing dictionaries of user ids and their features.
    tweet_user_features = []
    completed_tweet_user_features = []  # has centrality added
    # The global word2vec model, built from TF-IDF-ranked n-grams, used for message vectors
    saved_tf_idf_model = None
    # A dictionary used for the translation of actual Twitter username to UUID
    dict_of_users = {}

    # The max size for all data entries  (i.e. baseline tweets)
    MAX_RECORD_SIZE = sys.maxsize  # 3050

    # Datasets for training
    violent_words_dataset_location = None
    tf_idf_training_dataset_location = None
    outputs_location = None

    # Used for knowing which columns to access data from. For Twitter data.
    # Summary variables
    DEFAULT_USERNAME_COLUMN_ID = 0
    DEFAULT_DATE_COLUMN_ID = 1
    DEFAULT_MESSAGE_COLUMN_ID = 2
    DEFAULT_ANALYTIC_COLUMN_ID = 4
    DEFAULT_CLOUT_COLUMN_ID = 5
    DEFAULT_AUTHENTIC_COLUMN_ID = 6
    DEFAULT_TONE_COLUMN_ID = 7
    # Emotional Analysis
    DEFAULT_ANGER_COLUMN_ID = 36
    DEFAULT_SADNESS_COLUMN_ID = 37
    DEFAULT_ANXIETY_COLUMN_ID = 35
    # Personal Drives:
    DEFAULT_POWER_COLUMN_ID = 62
    DEFAULT_REWARD_COLUMN_ID = 63
    DEFAULT_RISK_COLUMN_ID = 64
    DEFAULT_ACHIEVEMENT_COLUMN_ID = 61
    DEFAULT_AFFILIATION_COLUMN_ID = 60
    # Personal pronouns
    DEFAULT_P_PRONOUN_COLUMN_ID = 13
    DEFAULT_I_PRONOUN_COLUMN_ID = 19

    # Constants for the fields in the baseline data set (i.e. ISIS magazine/ Stormfront, etc)
    DEFAULT_BASELINE_MESSAGE_COLUMN_ID = 5
    # Summary variables
    DEFAULT_BASELINE_CLOUT_COLUMN_ID = 10
    DEFAULT_BASELINE_ANALYTIC_COLUMN_ID = 9
    DEFAULT_BASELINE_TONE_COLUMN_ID = 12
    DEFAULT_BASELINE_AUTHENTIC_COLUMN_ID = 11
    # Emotional Analysis
    DEFAULT_BASELINE_ANGER_COLUMN_ID = 41
    DEFAULT_BASELINE_SADNESS_COLUMN_ID = 42
    DEFAULT_BASELINE_ANXIETY_COLUMN_ID = 40
    # Personal Drives
    DEFAULT_BASELINE_POWER_COLUMN_ID = 67
    DEFAULT_BASELINE_REWARD_COLUMN_ID = 68
    DEFAULT_BASELINE_RISK_COLUMN_ID = 69
    DEFAULT_BASELINE_ACHIEVEMENT_COLUMN_ID = 66
    DEFAULT_BASELINE_AFFILIATION_COLUMN_ID = 65
    # Personal pronouns
    DEFAULT_BASELINE_P_PRONOUN_COLUMN_ID = 18
    DEFAULT_BASELINE_I_PRONOUN_COLUMN_ID = 24

    # Used for Minkowski distance
    _average_clout = 0
    _average_analytic = 0
    _average_tone = 0
    _average_authentic = 0
    _average_anger = 0
    _average_sadness = 0
    average_anxiety = 0
    average_power = 0
    average_reward = 0
    average_risk = 0
    average_achievement = 0
    average_affiliation = 0
    average_p_pronoun = 0
    average_i_pronoun = 0

    # Used to cache messages to disk to free up memory
    MESSAGE_TMP_CACHE_LOCATION = "message_cache"

    def __init__(self, violent_words_dataset_location=None
                 , baseline_training_dataset_location=None,
                 outputs_location=r"outputs"):
        """
        Constructor

        The feature_extraction() class can be initialised with violent_words_dataset_location,
        tf_idf_training_dataset_location, and outputs_location locations. All files in the violent_words_dataset_location
        will be read (one line at a time) and added to the corpus of violent and swear words. The csv file at
        baseline_training_dataset_location is used to train the TFIDF model and a Minkowski distance score is calculated based on the LIWC scores present.

        If the constant variable need to be changed, do this by setting the member variables.
        """

        # Error if datasets not provided
        if violent_words_dataset_location is None:
            raise Exception("No violent words dir provided. Provide a directory that contains newline-separated "
                            "files where each line is a violent, extremist, etc. word")

        if baseline_training_dataset_location is None:
            raise Exception("No baseline (TF-IDF/ Minkowski) dataset provided. This should be a CSV file containing "
                            "extremist content and LIWC scores.")

        # Set datasets to member variables
        self.violent_words_dataset_location = violent_words_dataset_location
        self.tf_idf_training_dataset_location = baseline_training_dataset_location
        self.outputs_location = outputs_location

        # Make the outputs folder if it doesn't exist
        os.makedirs(outputs_location, exist_ok=True)

    def _reset_stored_feature_data(self):
        """
        Resets member variables from a previous run. Importantly, does not reset the TF-IDF model.
        :return:
        """

        # A graph used to store connections between aggregated users
        self.graph = grapher()
        self.archived_graphs = []  # an archive of the previous graphs
        # A list storing dictionaries of user ids and their features.
        self.tweet_user_features = []
        self.completed_tweet_user_features = []  # has centrality added
        # A dictionary used for the translation of actual Twitter username to UUID
        self.dict_of_users = {}

        # Used for Minkowski distance
        self._average_clout = 0
        self._average_analytic = 0
        self._average_tone = 0
        self._average_authentic = 0
        self._average_anger = 0
        self._average_sadness = 0
        self.average_anxiety = 0
        self.average_power = 0
        self.average_reward = 0
        self.average_risk = 0
        self.average_achievement = 0
        self.average_affiliation = 0
        self.average_p_pronoun = 0
        self.average_i_pronoun = 0

    def _get_unique_id_from_username(self, username):
        """
        A function used to retrieve a UUID based on a twitter username. If a username has been used before the same UUID
        will be returned as it is stored in a dictionary.
        :param username:
        :return: a string representation of a UUID relating to a Twitter username
        """

        if username in self.dict_of_users:
            # username already in dictionary
            unique_id = self.dict_of_users[username]
        else:
            # make new UUID
            unique_id = uuid.uuid4().hex
            # stops uuid collisions
            while unique_id in self.dict_of_users.values():
                unique_id = uuid.uuid4().hex

            # Add new user id to dictionary
            self.dict_of_users[username] = unique_id

        # TODO: it's inefficient to rewrite the whole file on every call
        path = os.path.join(self.outputs_location, "users.json")

        with open(path, 'w') as outfile:
            json.dump(self.dict_of_users, outfile)

        return unique_id

    def _add_to_graph(self, originating_user_name, message):
        """
        A wrapper function used for adding a node/ connection to the graph.
        :param originating_user_name: the Twitter username
        :param message: The Tweet
        """

        # Adds node to graph so that if they don't interact with anyone they still have a centrality
        self.graph.add_node(originating_user_name)

        # Process mentions (e.g. "@some_user")
        mentions = re.findall(r"@([a-zA-Z\-_]+)", message)

        # For all mentions in the tweet add them to the graph as a node
        for mention in mentions:
            self.graph.add_edge_wrapper(originating_user_name, mention, 1, "mention")

        # Process hashtags (e.g. "#some_tag")
        hashtags = re.findall(r"#([a-zA-Z\-_]+)", message)

        # For all hashtags in the tweet add them to the graph as a node
        for hashtag in hashtags:
            self.graph.add_edge_wrapper(originating_user_name, hashtag, 1, "hashtag")

    def _get_capitalised_word_frequency(self, message):
        """
        A wrapper function for returning the frequency of capitalised words in a message.
        :param message:
        :return: the frequency of capitalised words in a message.
        """
        # NOTE: must be called before the message is lower-cased
        return wording_choice_aggregator().get_frequency_of_capatalised_words(message)

    def _get_violent_word_frequency(self, message):
        """
        A wrapper function used to retrieve the frequency of violent words in a message.
        :param message: a string representation of a social media message
        :return: The frequency of violent words in the message
        """
        return wording_choice_aggregator().get_frequency_of_violent_or_curse_words(message,
                                                                                   self.violent_words_dataset_location)

    def _get_tweet_vector(self, message):
        """
        A wrapper function used retrieve the 200 size vector representation (Average and Max vector concatenated)
        of that message.
        :param message: a string representation of a message
        :param tf_idf_model:
        :return: a 200 size vector of the tweet
        """
        vectors = []
        tf_idf_model = self._get_tf_idf_model()

        for word in message.split(" "):
            # TODO: add back: word = sanitization().sanitize(word, self.outputs_location, force_new_data_and_dont_persisit=True)
            try:
                vectors.append(tf_idf_model.wv[word])
                logger().print_message("Word '{}' in vocabulary...".format(word))
            except KeyError as e:
                logger().print_message(e)
                logger().print_message("Word '{}' not in vocabulary...".format(word))

        # Lists of the values used to store the max and average vector values
        max_value_list = []
        average_value_list = []

        # Check for if at least one word in the message is in the vocabulary of the model
        final_array_of_vectors = numpy.zeros(100)
        if len(vectors) > 0:

            # Loop through the elements in the vectors
            for iterator in range(vectors[0].size):

                list_of_all_values = []

                # Loop through each vector
                for vector in vectors:
                    value = vector[iterator]
                    list_of_all_values.append(value)

                average_value = sum(list_of_all_values) / len(list_of_all_values)
                max_value = max(list_of_all_values)
                max_value_list.append(max_value)
                average_value_list.append(average_value)

            final_array_of_vectors = numpy.append(numpy.array([max_value_list]), numpy.array([average_value_list]))

        # Convert the numpy array to a plain list
        list_of_vectors = list(final_array_of_vectors)

        return list_of_vectors

    def _process_tweet(self, user_name, message, row):
        """
        Wrapper function for taking a username and tweet and extracting the features.
        :param user_name:
        :param message:
        :param row: the raw CSV row (currently unused)
        :return: a dictionary of all features from the message
        """
        self._add_to_graph(user_name, message)

        features_dict = {"cap_freq": self._get_capitalised_word_frequency(message),
                         "violent_freq": self._get_violent_word_frequency(message),
                         "message_vector": self._get_tweet_vector(message)}

        return features_dict

    def _get_average_liwc_scores_for_baseline_data(self):
        """
        Calculate the LIWC scores for the baseline dataset and the minkowski dataset.
        """

        # Checks if the values have already been set this run, if so don't calculate again
        # TODO what of the edge case where average clout is 0?
        if self._average_clout == 0:
            logger().print_message("Opening dataset {} for LIWC feature extraction and Minkowski distance".format(
                self.tf_idf_training_dataset_location))
            baseline_data_set_name = self.tf_idf_training_dataset_location

            clout_list = []
            analytic_list = []
            tone_list = []
            authentic_list = []
            anger_list = []
            sadness_list = []
            anxiety_list = []
            power_list = []
            reward_list = []
            risk_list = []
            achievement_list = []
            affiliation_list = []
            p_pronoun_list = []
            i_pronoun_list = []

            with open(baseline_data_set_name, 'r', encoding='cp1252') as file:
                reader = csv.reader(file)

                is_header = True
                for row in reader:

                    if is_header:
                        is_header = False
                        continue

                    # Try and access columns, if can't then LIWC fields haven't been set and should be set to 0
                    try:
                        clout = row[self.DEFAULT_BASELINE_CLOUT_COLUMN_ID]
                        analytic = row[self.DEFAULT_BASELINE_ANALYTIC_COLUMN_ID]
                        tone = row[self.DEFAULT_BASELINE_TONE_COLUMN_ID]
                        authentic = row[self.DEFAULT_BASELINE_AUTHENTIC_COLUMN_ID]
                        anger = row[self.DEFAULT_BASELINE_ANGER_COLUMN_ID]
                        sadness = row[self.DEFAULT_BASELINE_SADNESS_COLUMN_ID]
                        anxiety = row[self.DEFAULT_BASELINE_ANXIETY_COLUMN_ID]
                        power = row[self.DEFAULT_BASELINE_POWER_COLUMN_ID]
                        reward = row[self.DEFAULT_BASELINE_REWARD_COLUMN_ID]
                        risk = row[self.DEFAULT_BASELINE_RISK_COLUMN_ID]
                        achievement = row[self.DEFAULT_BASELINE_ACHIEVEMENT_COLUMN_ID]
                        affiliation = row[self.DEFAULT_BASELINE_AFFILIATION_COLUMN_ID]
                        p_pronoun = row[self.DEFAULT_BASELINE_P_PRONOUN_COLUMN_ID]
                        i_pronoun = row[self.DEFAULT_BASELINE_I_PRONOUN_COLUMN_ID]
                    except IndexError:
                        clout = 0
                        analytic = 0
                        tone = 0
                        authentic = 0
                        anger = 0
                        sadness = 0
                        anxiety = 0
                        power = 0
                        reward = 0
                        risk = 0
                        achievement = 0
                        affiliation = 0
                        p_pronoun = 0
                        i_pronoun = 0

                    clout_list.append(float(clout))
                    analytic_list.append(float(analytic))
                    tone_list.append(float(tone))
                    authentic_list.append(float(authentic))
                    anger_list.append(float(anger))
                    sadness_list.append(float(sadness))
                    anxiety_list.append(float(anxiety))
                    power_list.append(float(power))
                    reward_list.append(float(reward))
                    risk_list.append(float(risk))
                    achievement_list.append(float(achievement))
                    affiliation_list.append(float(affiliation))
                    p_pronoun_list.append(float(p_pronoun))
                    i_pronoun_list.append(float(i_pronoun))

            #  Get average for variables, used for distance score. These are member variables so that they don't
            #  have to be re-calculated on later runs
            self._average_clout = sum(clout_list) / len(clout_list)
            self._average_analytic = sum(analytic_list) / len(analytic_list)
            self._average_tone = sum(tone_list) / len(tone_list)
            self._average_authentic = sum(authentic_list) / len(authentic_list)
            self._average_anger = sum(anger_list) / len(anger_list)
            self._average_sadness = sum(sadness_list) / len(sadness_list)
            self.average_anxiety = sum(anxiety_list) / len(anxiety_list)
            self.average_power = sum(power_list) / len(power_list)
            self.average_reward = sum(reward_list) / len(reward_list)
            self.average_risk = sum(risk_list) / len(risk_list)
            self.average_achievement = sum(achievement_list) / len(achievement_list)
            self.average_affiliation = sum(affiliation_list) / len(affiliation_list)
            self.average_p_pronoun = sum(p_pronoun_list) / len(p_pronoun_list)
            self.average_i_pronoun = sum(i_pronoun_list) / len(i_pronoun_list)

        return [self._average_clout, self._average_analytic, self._average_tone, self._average_authentic,
                self._average_anger, self._average_sadness, self.average_anxiety,
                self.average_power, self.average_reward, self.average_risk, self.average_achievement,
                self.average_affiliation,
                self.average_p_pronoun, self.average_i_pronoun]

    def _get_tf_idf_model(self):
        """
        A function used to retrieve the TFIDF model trained on the extremist dataset. If the model has already been
        created then the previously created model will be used.
        :return: a TF-IDF model
        """

        # if already made model, reuse
        if self.saved_tf_idf_model is None:
            logger().print_message("Opening dataset {} for TF-IDF".format(self.tf_idf_training_dataset_location))
            baseline_data_set_name = self.tf_idf_training_dataset_location

            data_set = ""

            with open(baseline_data_set_name, 'r', encoding='cp1252') as file:
                reader = csv.reader(file)

                is_header = True
                for row in reader:

                    if is_header:
                        is_header = False
                        continue

                    # Append this row's message to the training corpus
                    message = row[self.DEFAULT_BASELINE_MESSAGE_COLUMN_ID]  # data column
                    data_set = data_set + message + "\n"

            # clean data set
            # todo should we be doing sanitization clean_data = sanitization().sanitize(data_set, self.outputs_location) # if so remove line below
            clean_data = data_set

            # get ngrams
            uni_grams, bi_grams, tri_grams = n_gram_aggregator().get_ngrams(clean_data)
            ngrams = uni_grams + bi_grams + tri_grams

            # todo The TF-IDF most important ngrams aren't being used. Should these be used instead of the other ngrams?
            tf_idf_scores = tf_idf_aggregator().get_tf_idf_scores(ngrams, data_set)
            number_of_most_important_ngrams = int(len(ngrams) / 2)  # number is half all ngrams
            list_of_most_important_ngrams = sorted(tf_idf_scores, key=tf_idf_scores.get, reverse=True)[
                                            :number_of_most_important_ngrams]

            # create a word 2 vec model
            model = word_2_vec_aggregator().get_model(list_of_sentences=list_of_most_important_ngrams)
            self.saved_tf_idf_model = model
        else:
            model = self.saved_tf_idf_model

        return model

    def open_wrapper(self, location, access_type, list_of_encodings=["utf-8", 'latin-1', 'cp1252']):
        """
        A wrapper around the open built in function that has fallbacks for different encodings.
        :return:
        """

        for encoding in list_of_encodings:
            try:
                file = open(location, access_type, encoding=encoding)
                # Attempt to read the whole file; if decoding fails, try the next encoding
                file.readlines()
                file.close()
                # Re-open so the caller gets a fresh handle at the start of the file
                file = open(location, access_type, encoding=encoding)
                return file
            except (LookupError, UnicodeDecodeError):
                # Unknown encoding, or the file couldn't be decoded; try the next encoding
                continue

        raise Exception(
            "No valid encoding provided for file: '{}'. Encodings provided: '{}'".format(location, list_of_encodings))

    def _add_user_post_db_cache(self, user_id, dict_to_add):
        """
        Used to add data to the post message db cache used to free up memory.
        """

        if not os.path.isdir(self.MESSAGE_TMP_CACHE_LOCATION):
            os.mkdir(self.MESSAGE_TMP_CACHE_LOCATION)

        # Save the entry as a pickle file named "<user id>-<unix time>.pickle"
        file_name = "{}-{}.pickle".format(user_id, int(time.time()))
        file_name = os.path.join(self.MESSAGE_TMP_CACHE_LOCATION, file_name)
        with open(file_name, 'wb') as pickle_handle:
            pickle.dump({"description": "a temporary file used for saving memory",
                         "data": dict_to_add}, pickle_handle, protocol=pickle.HIGHEST_PROTOCOL)

    def _get_user_post_db_cache(self, file_name):
        """
        Retrieves data from the cache database used to free up memory.
        """
        if not os.path.isdir(self.MESSAGE_TMP_CACHE_LOCATION):
            raise Exception("Attempted to access temporery cache files before files are created")

        if not os.path.isfile(file_name):
            raise Exception("Attempted to access cache file {}, however, it does not exist".format(file_name))

        with open(file_name, "rb") as openfile:
            cache_data = pickle.load(openfile)

        return cache_data["data"]

    def _delete_user_post_db_cache(self):
        """
        Deletes the on-disk message cache directory, if it exists.
        """
        try:
            if os.path.isdir(self.MESSAGE_TMP_CACHE_LOCATION):
                shutil.rmtree(self.MESSAGE_TMP_CACHE_LOCATION)
        except OSError:
            pass

    def _get_type_of_message_data(self, data_set_location, has_header=True, is_extremist=None):
        """
        Reads a LIWC-scored CSV dataset one row at a time, extracts the features for each message, caches
        them to disk, and then adds graph centrality once all users have been seen.
        :param data_set_location: path to the CSV dataset
        :param has_header: True if the first row of the CSV is a header
        :param is_extremist: the class label to attach to each entry, or None if unlabelled
        """
        # Ensure all temp files are deleted
        self._delete_user_post_db_cache()

        # Counts the total rows in the CSV. Used for progress reporting.
        print("Starting entity count. Will count up to '{}' rows".format(self.MAX_RECORD_SIZE))

        # Read one row at a time to keep memory usage low
        row_count = 0

        for chunk in pd.read_csv(data_set_location, chunksize=1, header=None, encoding='latin-1'):

            row_count = row_count + 1

            if row_count >= self.MAX_RECORD_SIZE:
                break

        print("Finished entity count. Count is: '{}'".format(row_count))
        print("")
        # Loops through all rows in the dataset CSV file.
        current_processed_rows = 0
        is_header = has_header

        for chunk in pd.read_csv(data_set_location, chunksize=1, header=None, encoding='latin-1'):
            # Each chunk is a single-row DataFrame; flatten it to a list of field values
            row = chunk.iloc[0].tolist()
            # Makes sure the same number of rows is read for each dataset
            if current_processed_rows > row_count:
                break

            # Skips the first entry, as it's the CSV header
            if has_header and is_header:
                is_header = False
                continue

            # Retrieve username
            try:
                username = row[self.DEFAULT_USERNAME_COLUMN_ID]
                date = row[self.DEFAULT_DATE_COLUMN_ID]
                user_unique_id = self._get_unique_id_from_username(username)
            except IndexError:
                # Skip empty/ malformed entries
                continue
            # Attempt to get LIWC scores from csv, if not present return 0's
            try:
                # Summary variables
                clout = float(row[self.DEFAULT_CLOUT_COLUMN_ID])
                analytic = float(row[self.DEFAULT_ANALYTIC_COLUMN_ID])
                tone = float(row[self.DEFAULT_TONE_COLUMN_ID])
                authentic = float(row[self.DEFAULT_AUTHENTIC_COLUMN_ID])
                # Emotional Analysis
                anger = float(row[self.DEFAULT_ANGER_COLUMN_ID])
                sadness = float(row[self.DEFAULT_SADNESS_COLUMN_ID])
                anxiety = float(row[self.DEFAULT_ANXIETY_COLUMN_ID])
                # Personal Drives:
                power = float(row[self.DEFAULT_POWER_COLUMN_ID])
                reward = float(row[self.DEFAULT_REWARD_COLUMN_ID])
                risk = float(row[self.DEFAULT_RISK_COLUMN_ID])
                achievement = float(row[self.DEFAULT_ACHIEVEMENT_COLUMN_ID])
                affiliation = float(row[self.DEFAULT_AFFILIATION_COLUMN_ID])
                # Personal pronouns
                i_pronoun = float(row[self.DEFAULT_I_PRONOUN_COLUMN_ID])
                p_pronoun = float(row[self.DEFAULT_P_PRONOUN_COLUMN_ID])

            except (IndexError, ValueError):
                # Summary variables
                clout = 0
                analytic = 0
                tone = 0
                authentic = 0
                # Emotional Analysis
                anger = 0
                sadness = 0
                anxiety = 0
                # Personal Drives:
                power = 0
                reward = 0
                risk = 0
                achievement = 0
                affiliation = 0
                # Personal pronouns
                i_pronoun = 0
                p_pronoun = 0

            liwc_dict = {
                "clout": clout,
                "analytic": analytic,
                "tone": tone,
                "authentic": authentic,
                "anger": anger,
                "sadness": sadness,
                "anxiety": anxiety,
                "power": power,
                "reward": reward,
                "risk": risk,
                "achievement": achievement,
                "affiliation": affiliation,
                "i_pronoun": i_pronoun,
                "p_pronoun": p_pronoun,
            }

            # Calculate minkowski distance
            average_row = self._get_average_liwc_scores_for_baseline_data()

            actual_row = [clout, analytic, tone, authentic,
                          anger, sadness, anxiety,
                          power, reward, risk, achievement, affiliation,
                          p_pronoun, i_pronoun
                          ]

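            # With p=1 the Minkowski distance reduces to the Manhattan (city-block)
            # distance between this message's LIWC scores and the baseline averages.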
            try:
                liwc_dict["minkowski"] = distance.minkowski(actual_row, average_row, 1)
            except ValueError:
                continue

            # Retrieve Tweet for message
            tweet = str(row[self.DEFAULT_MESSAGE_COLUMN_ID])

            # clean/ remove markup in dataset
            sanitised_message = sanitization().sanitize(tweet, self.outputs_location,
                                                        force_new_data_and_dont_persisit=True)

            # If there is no message content, skip the entry
            if len(tweet) == 0 or len(sanitised_message.strip()) == 0:
                continue

            # Process Tweet and save as dict
            tweet_dict = self._process_tweet(user_unique_id, tweet, row)

            # If the message vector is not 200 skip (meaning that a blank message was processed)
            if not len(tweet_dict["message_vector"]) == 200:
                continue

            if is_extremist is not None:
                tweet_dict["is_extremist"] = is_extremist

            tweet_dict["date"] = date

            # Merge liwc dict with tweet dict
            tweet_dict = {**tweet_dict, **liwc_dict}

            #tweet_dict["user_unique_id"]= user_unique_id

            self._add_user_post_db_cache(user_unique_id, {user_unique_id: tweet_dict})
            #self.tweet_user_features.append()
            # TODO here save to cache json instead of list and graph

            logger().print_message("Added message from user: '{}', from dataset: '{}'. {} rows of {} completed."
                                   .format(user_unique_id, data_set_location, current_processed_rows, row_count), 1)
            current_processed_rows = current_processed_rows + 1
            print("Finished reading row")

        # Add the centrality (has to be done after all users are added to the graph)
        completed_tweet_user_features = []

        # Loop through all cached message files, each representing one message/ tweet
        for cached_message_file in os.listdir(self.MESSAGE_TMP_CACHE_LOCATION):
            cached_message_file = os.fsdecode(cached_message_file)
            cached_message_file = os.path.join(self.MESSAGE_TMP_CACHE_LOCATION, cached_message_file)

            # Only process pickle files
            if not cached_message_file.endswith(".pickle"):
                continue

            print("Reading cache file: '{}'".format(cached_message_file))
            cached_message_data = self._get_user_post_db_cache(cached_message_file)
            # Loops through the data in that tweet (Should only be one entry per tweet).
            for user_id in cached_message_data.keys():
                updated_entry = {}
                updated_entry[user_id] = cached_message_data[user_id]
                # Adds centrality
                updated_entry[user_id]["centrality"] = self.graph.get_degree_centrality_for_user(user_id)
                logger().print_message(
                    "Added '{}' Centrality for user '{}'".format(updated_entry[user_id]["centrality"], user_id), 1)
                completed_tweet_user_features.append(updated_entry)
                gc.collect()
                break  # Only one entry per list

        self._delete_user_post_db_cache()
        self.completed_tweet_user_features = self.completed_tweet_user_features + completed_tweet_user_features
        self.tweet_user_features = []
        #self.archived_graphs.append(self.graph)
        self.graph = grapher()
        print("Finished messages")

    def _get_extremist_data(self, dataset_location):
        """
        This function is responsible for aggregating tweets from the extremist dataset, extracting the features, and
        saving them to a file for a model to be created.
        """

        self._get_type_of_message_data(data_set_location=dataset_location, is_extremist=True)

    def _get_counterpoise_data(self, dataset_location):
        """
        This function is responsible for aggregating tweets from the counterpoise (related to the topic but from
        legitimate sources, e.g. news outlets) dataset, extracting the features, and saving them to a file for a
        model to be created.
        """

        self._get_type_of_message_data(data_set_location=dataset_location, is_extremist=False)

    def _get_standard_tweets(self, dataset_location):
        """
        This function is responsible for aggregating tweets from the baseline (random sample of twitter posts)
        dataset, extracting the features, and saving them to a file for a model to be created.
        """

        self._get_type_of_message_data(data_set_location=dataset_location, is_extremist=False)

    def dump_features_for_list_of_datasets(self, feature_file_path_to_save_to, list_of_dataset_locations,
                                           force_new_dataset=True):
        """
        Saves features representing a provided dataset to a json file. Designed to be used for testing after a
        model has been created.
        :param feature_file_path_to_save_to:
        :param dataset_location:
        :return:
        """

        self._reset_stored_feature_data()

        if force_new_dataset or not os.path.isfile(feature_file_path_to_save_to):
            for dataset in list_of_dataset_locations:
                self._get_type_of_message_data(data_set_location=dataset, is_extremist=None)

            with open(feature_file_path_to_save_to, 'w') as outfile:
                json.dump(self.completed_tweet_user_features, outfile, indent=4)

        else:
            with open(feature_file_path_to_save_to, 'r') as file:
                data = file.read()

            # parse file
            self.completed_tweet_user_features = json.loads(data)

    def dump_training_data_features(self, feature_file_path_to_save_to, extremist_data_location,
                                    baseline_data_location, force_new_dataset=True):
        """
        The entrypoint function, used to dump all features, for all users in the extreamist, counterpoise, and baseline
        datsets to a json file.
        :param feature_file_path_to_save_to: The filepath to save the datasets to
        """

        self._reset_stored_feature_data()

        if force_new_dataset or not os.path.isfile(feature_file_path_to_save_to):
            print("Starting baseline messages")
            self._get_standard_tweets(baseline_data_location)
            print("Starting extremist messages")
            self._get_extremist_data(extremist_data_location)

            with open(feature_file_path_to_save_to, 'w') as outfile:
                json.dump(self.completed_tweet_user_features, outfile, indent=4)