import csv
import json
import os

import Pinpoint_Internal.FeatureExtraction
from Pinpoint_Internal.RandomForest import *
class predictor():
    """Classifies a single piece of text as extremist or not, using a pre-trained
    random forest model and the Pinpoint feature extractor."""

    def __init__(self):
        self.model = random_forest()
        self.model.PSYCHOLOGICAL_SIGNALS_ENABLED = False  # Needs LIWC markup
        self.model.BEHAVIOURAL_FEATURES_ENABLED = False
        self.model.train_model(features_file=None, force_new_dataset=False,
                               model_location=r"far-right-radical-language.model")

        self.dict_of_users_all = {}
        self.feature_extractor = Pinpoint_Internal.FeatureExtraction.feature_extraction(
            violent_words_dataset_location="swears",
            baseline_training_dataset_location="LIWC2015 Results (Storm_Front_Posts).csv")
    def predict(self, string_to_predict):
        # Re-initialise so each prediction starts from a clean model and feature extractor.
        self.__init__()

        # Remove intermediate files left over from a previous prediction, if any.
        for stale_file in ("./messages.json", "messages.json", "./all-messages.csv"):
            try:
                os.remove(stale_file)
            except OSError:
                pass
        users_posts = [{"username": "tmp", "timestamp": "tmp", "message": "{}".format(string_to_predict)}]

        with open('all-messages.csv', 'w', encoding='utf8', newline='') as output_file:
            writer = csv.DictWriter(output_file, fieldnames=["username", "timestamp", "message"])
            for users_post in users_posts:
                writer.writerow(users_post)

        self.feature_extractor._get_standard_tweets("all-messages.csv")

        with open("./messages.json", 'w') as outfile:
            features = self.feature_extractor.completed_tweet_user_features
            json.dump(features, outfile, indent=4)
        rows = self.model.get_features_as_df("./messages.json", True)
        rows.pop("is_extremist")

        # Collect the 200-element message vector for each post from the feature dataframe.
        message_vector_list = []
        for user_iter in range(len(users_posts)):
            rows_as_json = json.loads(rows.iloc[user_iter].to_json())
            tmp = []
            for i in range(1, 201):
                vect_str = "message_vector_{}".format(i)
                tmp.append(rows_as_json[vect_str])
            message_vector_list.append(tmp)
        for row in users_posts:
            user = row["username"]
            timestamp = row["timestamp"]
            message = row["message"]
            user_unique_id = str(self.feature_extractor._get_unique_id_from_username(user))

            # Walk the per-user feature entries until the one keyed by this user's ID is found.
            index = 0
            user_found = False
            while not user_found:
                try:
                    user_features = self.feature_extractor.completed_tweet_user_features[index][user_unique_id]
                    user_found = True
                    break
                except KeyError:
                    index = index + 1
            formated_vectors = [float('%.10f' % elem) for elem in user_features["message_vector"]]

            # Find the row whose vector matches this user's message vector and classify it.
            for index, vector_list in enumerate(message_vector_list):
                if vector_list == formated_vectors:
                    is_extremist = self.model.model.predict([rows.iloc[index]])
                    if is_extremist[0] == 1:
                        return True
                    else:
                        return False
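

# Example usage: a minimal sketch, assuming the Pinpoint_Internal package, the
# pre-trained "far-right-radical-language.model" file, the "swears" word list and
# the LIWC baseline CSV are all present in the working directory, as the
# constructor above expects. The input sentence below is illustrative only.
if __name__ == "__main__":
    classifier = predictor()
    print(classifier.predict("example message to classify"))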