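"""Prediction wrapper for the Pinpoint far-right radical-language classifier.

Loads a pre-trained random-forest model and a Pinpoint feature extractor, then
exposes predictor.predict(), which extracts features for a single message via
temporary CSV/JSON files and returns True when the message is classified as
extremist.
"""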
import csv
import json
import os

import Pinpoint_Internal.FeatureExtraction
from Pinpoint_Internal.RandomForest import *

class predictor():
    """Wraps the pre-trained far-right radical-language model and its feature extractor."""

    def __init__(self):
        self.model = random_forest()
        self.model.PSYCHOLOGICAL_SIGNALS_ENABLED = False  # Needs LIWC markup
        self.model.BEHAVIOURAL_FEATURES_ENABLED = False
        # Reuse the saved model rather than rebuilding the dataset (force_new_dataset=False).
        self.model.train_model(features_file=None, force_new_dataset=False,
                               model_location=r"far-right-radical-language.model")
        self.dict_of_users_all = {}
        self.feature_extractor = Pinpoint_Internal.FeatureExtraction.feature_extraction(
            violent_words_dataset_location="swears",
            baseline_training_dataset_location="LIWC2015 Results (Storm_Front_Posts).csv")

    def predict(self, string_to_predict):
        """Classifies a single message, returning True if it is predicted to be extremist."""
        # Re-initialise so every prediction starts from a clean model and extractor state.
        self.__init__()

        # Remove intermediate files left over from any previous prediction.
        for stale_file in ["messages.json", "all-messages.csv"]:
            try:
                os.remove(stale_file)
            except OSError:
                pass

        # Wrap the message in the username/timestamp/message shape the feature extractor expects.
        users_posts = [{"username": "tmp", "timestamp": "tmp", "message": "{}".format(string_to_predict)}]

        # Write the post to a single-row CSV (no header row) for the feature extractor.
        with open('all-messages.csv', 'w', encoding='utf8', newline='') as output_file:
            writer = csv.DictWriter(output_file, fieldnames=["username", "timestamp", "message"])
            for users_post in users_posts:
                writer.writerow(users_post)

        # Extract Pinpoint features for the message; results are stored on the extractor itself.
        self.feature_extractor._get_standard_tweets("all-messages.csv")

        # Dump the extracted per-user features to JSON so they can be reloaded as a dataframe.
        with open("messages.json", 'w') as outfile:
            features = self.feature_extractor.completed_tweet_user_features
            json.dump(features, outfile, indent=4)

        # Build the feature dataframe and drop the label column before predicting.
        rows = self.model.get_features_as_df("messages.json", True)
        rows.pop("is_extremist")

        # Collect the 200-element message vector for each post (one dataframe row per post).
        message_vector_list = []

        for row_index in range(len(users_posts)):
            rows_as_json = json.loads(rows.iloc[row_index].to_json())

            message_vector = []
            for i in range(1, 201):
                message_vector.append(rows_as_json["message_vector_{}".format(i)])
            message_vector_list.append(message_vector)

        # Match each post back to its extracted features via the user's unique ID,
        # then predict on the corresponding dataframe row.
        for row in users_posts:
            user = row["username"]
            user_unique_id = str(self.feature_extractor._get_unique_id_from_username(user))

            # completed_tweet_user_features is a list of per-user dicts; find this user's entry.
            user_features = None
            for user_entry in self.feature_extractor.completed_tweet_user_features:
                if user_unique_id in user_entry:
                    user_features = user_entry[user_unique_id]
                    break

            if user_features is None:
                continue

            # Round to ten decimal places so the vector compares equal to the dataframe values.
            formatted_vector = [float('%.10f' % elem) for elem in user_features["message_vector"]]

            for row_index, message_vector in enumerate(message_vector_list):
                if message_vector == formatted_vector:
                    is_extremist = self.model.model.predict([rows.iloc[row_index]])

                    if is_extremist == 1:
                        return True
                    else:
                        return False
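
# Minimal usage sketch, assuming the model file "far-right-radical-language.model",
# the "swears" word list, and the LIWC baseline CSV referenced in predictor.__init__
# are all available in the working directory.
if __name__ == "__main__":
    classifier = predictor()
    print(classifier.predict("example message to classify"))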