# politweet/textclassifier/TextClassifier.py
# (Removed Hugging Face file-viewer chrome that was pasted into the source:
# author note "added __init__.py files so that we can import as modules",
# commit 03f299a, raw/history/blame links, 6.92 kB size marker.)
import os

import openai
import regex as re

from twitterscraper import TwitterScraper as tf

# SECURITY FIX: the API key was previously hard-coded in source control
# (a leaked credential that must be revoked). Read it from the environment
# instead; callers export OPENAI_API_KEY before running.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
class TextClassifier:
    """Classify tweet topics and sentiments with the OpenAI completion API
    and report aggregate statistics over the classified tweets.

    All methods are stateless, so they are exposed as static methods and the
    class serves purely as a namespace (``TextClassifier.classify_topics(...)``
    keeps working exactly as before; instance calls now work too).
    """

    @staticmethod
    def classify_topics(tweet_dict):
        """Request one general topic and two sub-topics for every tweet.

        :param tweet_dict: mapping of tweet text -> per-tweet metadata dict.
        :return: tweet_dict with a cleaned 'topic' list attached to each
            metadata dict (via cleanup_topic_results).
        """
        prediction_dict = {}
        for tweet in tweet_dict:
            prompt_string = "Classify this tweet with a general topic and two sub-topics:\n\""
            prompt_string += tweet
            prompt_string += "\".\nGeneral topic: \nSub topic 1: \nSub topic 2:\n. The classifications should not be " \
                             "more than 5 words. Numerate each topic in the output. END "
            response = openai.Completion.create(
                model="text-davinci-002",
                prompt=prompt_string,
                temperature=0,  # deterministic output for reproducible topics
                max_tokens=892,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0
            )
            # Raw numbered completion text; parsed into a clean list below.
            prediction_dict[tweet] = response.choices[0]['text']
        return TextClassifier.cleanup_topic_results(prediction_dict, tweet_dict)

    @staticmethod
    def classify_sentiments(tweet_dict):
        """Attach one model-chosen sentiment label to every tweet.

        :param tweet_dict: mapping of tweet text -> per-tweet metadata dict;
            each metadata dict gains a 'sentiment' key (raw model text).
        :return: the same tweet_dict, mutated in place.
        """
        for tweet in tweet_dict:
            prompt_string = "Classify one sentiment for this tweet:\n \""
            prompt_string += tweet
            prompt_string += "\" \nFor example:\nSupport,\nOpposition,\nCriticism,\nPraise,\nDisagreement," \
                             "\nAgreement,\nSkepticism,\nAdmiration,\nAnecdotes,\nJokes,\nMemes,\nSarcasm,\nSatire," \
                             "\nQuestions,\nStatements,\nOpinions,\nPredictions.\nSENTIMENT= "
            response = openai.Completion.create(
                model="text-davinci-002",
                prompt=prompt_string,
                temperature=0,
                max_tokens=256,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0
            )
            tweet_dict[tweet]['sentiment'] = response.choices[0]['text']
        return tweet_dict

    @staticmethod
    def cleanup_topic_results(prediction_dict, tweet_dict):
        """Parse raw topic completions into clean per-tweet topic lists.

        The model replies in numbered form ("1. X. 2. Y. 3. Z."); this strips
        newlines, the leading numbering prefix and all digits, then splits on
        '.'. BUG FIX: empty fragments produced by the split are now discarded —
        previously they leaked into the topic list and skewed the frequency
        statistics in print_stats.

        :param prediction_dict: tweet text -> raw completion text.
        :param tweet_dict: tweet text -> metadata dict; gains a 'topic' list.
        :return: the mutated tweet_dict.
        """
        for tweet, raw in prediction_dict.items():
            cleaned = raw.replace("\n", " ").replace("  ", " ")
            cleaned = cleaned[4:]                 # drop the leading " 1. " numbering
            cleaned = re.sub(r'\d', '', cleaned)  # raw string: bare '\d' is an invalid-escape warning
            parts = [part.strip() for part in cleaned.split(".")]
            tweet_dict[tweet]['topic'] = [part for part in parts if part]
        return tweet_dict

    @staticmethod
    def print_results(results_dict):
        """Pretty-print every tweet together with its predictions."""
        print('\033[1m' + "RESULTS" + '\033[0m', "\n")
        for tweet, predictions in results_dict.items():
            print("\"" + tweet + "\"" + "\n" + str(predictions), "\n" + "---------------------------------")

    @staticmethod
    def print_stats(result_dict):
        """Print topic/sentiment frequency tables with average engagement.

        Expects each value of result_dict to carry 'nlikes', 'nreplies',
        'nretweets', 'topic' (list of str) and 'sentiment' keys, as produced
        by the scraper plus classify_topics/classify_sentiments.
        """
        user = ""  # NOTE(review): never populated anywhere in this file — prints empty
        freq_dict = {}
        total_likes = {}
        total_retweets = {}
        total_replies = {}
        sentiment_dict = {}
        nbr_sentiment = 0  # count of distinct sentiment labels seen
        nbr_topics = 0     # count of distinct topics seen
        for value in result_dict.values():
            nlikes = value['nlikes']
            nreplies = value['nreplies']
            nretweets = value['nretweets']
            sentiment = value['sentiment']
            # Sentiment frequency.
            if sentiment in sentiment_dict:
                sentiment_dict[sentiment] += 1
            else:
                sentiment_dict[sentiment] = 1
                nbr_sentiment += 1
            # Topic frequency and engagement totals.
            for topic in value['topic']:
                if topic in freq_dict:
                    freq_dict[topic] += 1
                else:
                    freq_dict[topic] = 1
                    nbr_topics += 1
                total_likes[topic] = total_likes.get(topic, 0) + nlikes
                total_retweets[topic] = total_retweets.get(topic, 0) + nretweets
                total_replies[topic] = total_replies.get(topic, 0) + nreplies
        # Convert totals to per-topic means. BUG FIX: replies were previously
        # printed as raw totals because they were never divided by frequency.
        mean_likes = {t: total_likes[t] / freq_dict[t] for t in total_likes}
        mean_retweets = {t: total_retweets[t] / freq_dict[t] for t in total_retweets}
        mean_replies = {t: total_replies[t] / freq_dict[t] for t in total_replies}
        # Header.
        print('\033[1m' + "USER: " + '\033[0m', user)
        print('\033[1m' + "NBR OF TWEETS SCRAPED: " + '\033[0m', len(result_dict))
        print('\033[1m' + "NBR OF DIFFERENT TOPICS: " + '\033[0m', nbr_topics, "\n")
        print("{:<60} {:<20} {:<30} {:<30} {:<30} {:<30}".format('\033[1m' + 'TOPIC', 'TOPIC FREQUENCY',
                                                                 'AVERAGE NBR OF LIKES', 'AVERAGE NBR OF RETWEETS',
                                                                 'AVERAGE NBR OF REPLIES', 'REACH AVERAGE' + '\033[0m'))
        # One row per topic. (The original loop rebound the mean_likes dict
        # name to a float here; a separate loop variable avoids that.)
        for topic, avg_likes in mean_likes.items():
            reach_avg = (avg_likes + mean_retweets[topic] + mean_replies[topic]) / 3
            print("{:<60} {:<20} {:<30} {:<30} {:<30} {:<30}".format(
                topic, freq_dict[topic], "{:.2f}".format(avg_likes),
                "{:.2f}".format(mean_retweets[topic]),
                "{:.2f}".format(mean_replies[topic]),
                "{:.2f}".format(reach_avg)))
        print("\n")
        print('\033[1m' + "NBR OF DIFFERENT SENTIMENTS: " + '\033[0m', nbr_sentiment, "\n")
        print("{:<60} {:<20}".format('\033[1m' + 'SENTIMENT', 'SENTIMENT FREQUENCY' + '\033[0m'))
        for sentiment, count in sentiment_dict.items():
            # Original passed a redundant third argument to a 2-slot format.
            print("{:<60} {:<20}".format(sentiment, count))
# Manual smoke test: scrape a user's recent tweets and inspect the dataframe.
if __name__ == '__main__':
    scraper = tf.TwitterScraper(num_tweets=40)
    tweets_df = scraper.scrape_by_user("jimmieakesson")
    print(tweets_df.head())
    print(tweets_df.shape)