Spaces:
Runtime error
Runtime error
File size: 6,872 Bytes
a814b1b 6421f36 a814b1b 6421f36 a814b1b 6421f36 a814b1b 6421f36 a814b1b 6421f36 a814b1b 6421f36 a814b1b 6421f36 a814b1b 6421f36 a814b1b 6421f36 a814b1b 6421f36 a814b1b 6421f36 a814b1b 6421f36 a814b1b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
import gc
import json
import os
from datetime import date
from pathlib import Path
import time
import tweepy
from googletrans import Translator
from predictor import predictor
import unicodedata
# Twitter API keys — read from the environment so no secrets live in the repo.
# NOTE: os.getenv returns None when a variable is unset; tweepy will then fail
# at authentication time rather than here.
consumer_token = os.getenv('CONSUMER_TOKEN')
consumer_secret = os.getenv('CONSUMER_SECRET')
my_access_token = os.getenv('ACCESS_TOKEN')
my_access_secret = os.getenv('ACCESS_SECRET')
bearer = os.getenv('BEARER')
# TODO: is this needed for mapping the object type after reading the pickle files? If not remove.
class grapher():
    """
    A wrapper class used for generating a graph for interactions between users.

    Vertices are labelled with the capitalized user name; look-ups are
    case-insensitive in the sense that names are capitalized before comparison.
    """
    # NOTE(review): ``Graph`` is not imported anywhere in this file (igraph's
    # Graph is the likely intent) — confirm the import exists wherever this
    # class is actually instantiated / unpickled.
    graph = None

    def __init__(self):
        """
        Constructor. Creates an empty graph.
        """
        self.graph = Graph()

    def _get_or_create_vertex(self, node_name):
        """
        Return the vertex labelled ``node_name.capitalize()``, creating it
        (with a sequential ``id`` attribute) when it does not exist yet.

        Extracted from the three public methods, which previously duplicated
        this find-or-create scan.

        :param node_name: user name to look up
        :return: the matching (or newly created) vertex
        """
        label = node_name.capitalize()
        for vertex in self.graph.vs:
            if vertex["label"] == label:
                # Break on first hit instead of scanning the whole vertex set.
                return vertex
        self.graph.add_vertices(1)
        index = self.graph.vcount() - 1
        self.graph.vs[index]["id"] = index
        self.graph.vs[index]["label"] = label
        return self.graph.vs[index]

    def add_edge_wrapper(self, node_1_name, node_2_name, weight=1, relationship=None):
        """
        A wrapper function used to add an edge connection or node.

        :param node_1_name: from
        :param node_2_name: to
        :param weight: currently unused
        :param relationship: currently unused
        :return: None
        """
        node_1 = self._get_or_create_vertex(node_1_name)
        node_2 = self._get_or_create_vertex(node_2_name)
        self.graph.add_edges([(node_1["id"], node_2["id"])])

    def add_node(self, node_name):
        """
        A wrapper function that adds a node with no edges to the graph.
        No-op when a vertex with that label already exists.

        :param node_name: user name for the vertex
        """
        self._get_or_create_vertex(node_name)
# Setup Tweepy API and client objects.
# OAuth1 user-context handler (v1.1 API) built from the env-provided keys.
auth = tweepy.OAuth1UserHandler(
consumer_token, consumer_secret,
my_access_token, my_access_secret
)
# v1.1 REST API object (currently unused below, kept for compatibility).
api = tweepy.API(auth)
# v2 client, used by IDPrinter to look up users and their recent tweets.
client = tweepy.Client(
bearer_token= bearer,
consumer_key=consumer_token,
consumer_secret=consumer_secret,
access_token=my_access_token,
access_token_secret=my_access_secret
)
# This class is used for streaming Tweets via Tweepy
class IDPrinter(tweepy.StreamingClient):
    """
    Streaming client that classifies sampled Tweets with Pinpoint and records
    users whose recent history crosses an extremist-post threshold.
    """

    def on_tweet(self, tweet):
        """
        Handle one streamed Tweet.

        Only Tweets longer than 100 characters, with an author id, detected as
        English, are classified. When a Tweet is flagged as extremist, up to 10
        of the author's recent posts are classified and the user is written to
        a JSON file under ``users/`` if the extremist percentage exceeds 1%.

        :param tweet: the streamed tweepy Tweet object
        """
        self.translator = Translator()
        gc.collect()
        # Guard clauses replace the original deeply nested if-pyramid.
        if not (tweet and tweet.data):
            return
        if len(tweet.data["text"]) <= 100:
            return
        if not tweet.data["author_id"]:
            return
        # Strip mentions markers and newlines before detection/classification.
        tweet_data = tweet.data["text"].strip().replace("@", "").replace("\n", "")
        # BUG FIX: original condition was `tweet_data is not None or tweet != ""`,
        # which is always true (tweet_data is a str, never None). Reject empty
        # text explicitly instead.
        if not tweet_data:
            return
        username = client.get_user(id=tweet.author_id).data
        # Ensure that Tweet is in English
        lang = self.translator.detect(tweet_data).lang
        if lang != "en":
            return
        # Normalize to plain ASCII for the classifier.
        tweet_data = unicodedata.normalize('NFKD', tweet_data).encode('ascii', 'ignore').decode()
        # Use Pinpoint to identify if a Tweet is extremist or not
        is_extremist = predictor().predict(tweet_data)
        print("user {} post extremist {} - message: {}".format(username, is_extremist, str(tweet_data)))
        # If a tweet is extremist go through 10 of that users posts and identify
        # the percentage of posts that are extremist
        if is_extremist is not None and is_extremist == 1:
            self._record_if_radical(tweet.author_id, username)
        print("Got user {}".format(username))
        gc.collect()

    def _record_if_radical(self, author_id, username):
        """
        Classify up to 10 recent posts of ``author_id``; when more than 1% are
        extremist, write the user and percentage to a dated JSON file.

        :param author_id: Twitter user id of the Tweet's author
        :param username: tweepy User object for that author
        """
        # Response is tuple-like; index 0 is the list of Tweet objects.
        tweets = client.get_users_tweets(id=author_id, max_results=10)[0]
        number_extreme = 0
        for users_tweet in tweets:
            if users_tweet.text is not None:
                if predictor().predict(users_tweet.text) == True:
                    number_extreme = number_extreme + 1
        # BUG FIX: original divided by len(tweets[0]) — the length of the FIRST
        # Tweet — after tweets had already been unwrapped; use the number of
        # retrieved Tweets instead.
        threshold = number_extreme / len(tweets) * 100
        if threshold > 1:
            file_name = os.path.join("users", "{}-{}-radical_users.txt".format(username, date.today().strftime("%b-%d-%Y")))
            print("User {} was found to be extremist".format(username))
            file_path = Path(file_name)
            file_path.touch(exist_ok=True)
            # Write user to a file in the user folder with the percentage of
            # extremist posts.
            with open(file_name, 'w') as outfile:
                json_to_dump = [{"username": username.id, "threshold": threshold,
                                 "date": date.today().strftime("%b-%d-%Y")}]
                json.dump(json_to_dump, outfile, indent=4)
# Continue indefinitely, collecting Twitter posts from the sampled stream.
# Each iteration builds a fresh streaming client; on any failure we back off
# for 15 minutes (the Twitter rate-limit window) and retry.
while True:
    try:
        printer = IDPrinter(bearer_token=bearer, wait_on_rate_limit=True, chunk_size=10000)
        printer.add_rules(tweepy.StreamRule(value="en", tag="lang", id="lang-rule"))
        printer.sample(expansions=["author_id", "geo.place_id"], threaded=False)
        print("-" * 20)
        gc.collect()
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallows KeyboardInterrupt
        # and SystemExit, making the process impossible to stop cleanly.
        time.sleep(900)