File size: 3,470 Bytes
45086c8
 
32a03a4
 
 
 
 
 
 
 
 
 
 
45086c8
 
 
32a03a4
 
 
 
 
 
 
 
 
 
 
45086c8
 
 
 
 
 
32a03a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# This file wraps around Pinpoint to provide simple prediction functionality.

import csv
import json
import os
import time
import uuid
from pprint import pprint

import Pinpoint.FeatureExtraction
from Pinpoint.RandomForest import *

class predictor():
    '''
    Convenience wrapper that classifies a single piece of text with a Pinpoint
    random forest model.

    Each call to predict() writes a handful of uniquely named scratch files into
    the current working directory and removes them again before returning.
    '''

    def __init__(self):
        '''
        Constructor. Builds the random forest wrapper and the feature extractor.

        NOTE(review): train_model() is called with features_file=None and
        force_new_dataset=False, so it presumably loads the saved model found at
        model_location rather than training from scratch - confirm against
        Pinpoint.RandomForest.
        '''
        self.model = random_forest()
        self.model.PSYCHOLOGICAL_SIGNALS_ENABLED = False  # Needs LIWC markup
        self.model.BEHAVIOURAL_FEATURES_ENABLED = False
        self.model.train_model(features_file=None, force_new_dataset=False,
                               model_location=r"far-right-radical-language.model")
        self.dict_of_users_all = {}
        self.feature_extractor = Pinpoint.FeatureExtraction.feature_extraction(
            violent_words_dataset_location="swears",
            baseline_training_dataset_location="LIWC2015 Results (Storm_Front_Posts).csv")

    @staticmethod
    def _remove_quietly(path):
        '''
        Best-effort delete of a scratch file; a missing or locked file is ignored.
        :param path: location of the file to delete
        '''
        try:
            os.remove(path)
        except OSError:
            # Cleanup must never mask the prediction result or the original error.
            pass

    def predict(self, string_to_predict=None, username="unknown"):
        '''
        A wrapper function used to call pinpoint and predict if a given piece of text is extremist.
        :param string_to_predict: the text to classify, must not be None
        :param username: name stored alongside the message in the scratch files
        :return: boolean true/ false. False is also returned when the feature
                 extraction or the model raises FileNotFoundError.
        :raises ValueError: if string_to_predict is None
        '''

        if string_to_predict is None:
            raise ValueError("No prediction material given...")

        # Two uuid1 values are joined so that every call gets its own scratch file names.
        extended_prediction_uuid = str(uuid.uuid1()) + "-" + str(uuid.uuid1())
        self.model.model_folder = "{}-output".format(extended_prediction_uuid)
        self.feature_extractor.MESSAGE_TMP_CACHE_LOCATION = "{}-message-cache".format(extended_prediction_uuid)
        print("Starting prediction for {}".format(extended_prediction_uuid))

        messages_csv = "{}-all-messages.csv".format(extended_prediction_uuid)
        features_json = "./{}-messages.json".format(extended_prediction_uuid)
        # NOTE(review): nothing in this class writes this file; it is presumably
        # produced by get_features_as_df() - confirm against Pinpoint.RandomForest.
        features_json_csv = "{}-messages.json.csv".format(extended_prediction_uuid)

        users_posts = [{"username": "{}".format(username), "timestamp": "tmp",
                        "message": "{}".format(string_to_predict)}]

        # Make sure no stale feature file with the same name is lying around.
        self._remove_quietly(features_json)

        try:
            # No header row is written; only the data rows end up in the file.
            with open(messages_csv, 'w', encoding='utf8', newline='') as output_file:
                writer = csv.DictWriter(output_file, fieldnames=["username", "timestamp", "message"])
                for users_post in users_posts:
                    writer.writerow(users_post)

            try:
                self.feature_extractor._get_standard_tweets(messages_csv)
            except FileNotFoundError:
                # The finally clause below still removes the CSV written above.
                return False

            with open(features_json, 'w') as outfile:
                features = self.feature_extractor.completed_tweet_user_features
                json.dump(features, outfile, indent=4)

            rows = self.model.get_features_as_df(features_json, True)
            # Drop the label column so that only the features are fed to the model.
            rows.pop("is_extremist")

            try:
                features = rows.loc[0]
                is_extremist = self.model.model.predict([features])
            except FileNotFoundError as e:
                is_extremist = False
                print("Message cache error, next - {}".format(e))

            print("Ending prediction for {}".format(extended_prediction_uuid))
        finally:
            # Runs on success, on the early return and on errors, so no scratch
            # files are left behind in the working directory.
            for scratch_file in (messages_csv, features_json_csv, features_json):
                self._remove_quietly(scratch_file)

        # "== True" is kept on purpose: the model result is presumably a
        # one-element array-like (TODO confirm), for which "is True" would never match.
        return bool(is_extremist == True)