Spaces:
Runtime error
Runtime error
added skeleton to TextClassifier
Browse files — textclassifier/TextClassifier.py (+27 −171)
textclassifier/TextClassifier.py
CHANGED
@@ -1,176 +1,32 @@
|
|
1 |
import openai
|
2 |
import regex as re
|
3 |
-
from twitterscraper import TwitterScraper
|
4 |
-
|
5 |
-
import os

# SECURITY: a live OpenAI secret key was hard-coded on this line and is now
# exposed in version-control history — it must be revoked. Read the key from
# the environment instead of embedding it in source.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
|
6 |
|
7 |
|
8 |
class TextClassifier:
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
def classify_sentiments(tweet_dict):
    """Label every tweet in *tweet_dict* with one sentiment via the OpenAI API.

    Mutates tweet_dict in place — the raw completion text is stored under the
    'sentiment' key of each tweet's entry — and returns the same dict.
    """
    # Tail of the prompt: the candidate sentiment labels shown to the model.
    label_examples = (
        "\" \nFor example:\nSupport,\nOpposition,\nCriticism,\nPraise,\nDisagreement,"
        "\nAgreement,\nSkepticism,\nAdmiration,\nAnecdotes,\nJokes,\nMemes,\nSarcasm,\nSatire,"
        "\nQuestions,\nStatements,\nOpinions,\nPredictions.\nSENTIMENT= "
    )

    for tweet in list(tweet_dict):
        prompt = "Classify one sentiment for this tweet:\n \"" + tweet + label_examples

        completion = openai.Completion.create(
            model="text-davinci-002",
            prompt=prompt,
            temperature=0,
            max_tokens=256,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        # Keep the uncleaned text of the first choice as-is.
        tweet_dict[tweet]['sentiment'] = completion.choices[0]['text']

    return tweet_dict
|
57 |
-
|
58 |
-
def cleanup_topic_results(prediction_dict, tweet_dict):
    """Parse raw topic-prediction strings into per-tweet topic lists.

    :param prediction_dict: maps tweet text -> raw model output string.
    :param tweet_dict: maps tweet text -> per-tweet info dict; updated in
        place with a 'topic' key holding the parsed list, then returned.
    """
    for tweet, raw in prediction_dict.items():
        topics = []  # fresh list per tweet (old code also dead-initialized one outside the loop)
        cleaned = raw.replace("\n", " ")
        cleaned = cleaned.replace("  ", " ")
        # Drop the first 4 characters — looks like a fixed prefix in the
        # model output; TODO confirm against an actual completion.
        cleaned = cleaned[4:]
        # BUG FIX: '\d' is an invalid escape in a non-raw string literal;
        # use a raw pattern.
        cleaned = re.sub(r'\d', '', cleaned)

        for part in cleaned.split("."):
            # Trim at most ONE leading and ONE trailing space — preserves
            # the original behaviour (not a full strip).
            if part.startswith(' '):
                part = part[1:]
            if part.endswith(' '):
                part = part[:-1]
            topics.append(part)

        tweet_dict[tweet]['topic'] = topics

    return tweet_dict
|
78 |
-
|
79 |
-
def print_results(results_dict):
    """Print each tweet followed by its predictions, one ruled section per tweet."""
    print('\033[1m' + "RESULTS" + '\033[0m', "\n")
    for tweet, predictions in results_dict.items():
        print("\"" + tweet + "\"" + "\n" + str(predictions), "\n" + "---------------------------------")
|
84 |
-
|
85 |
-
def print_stats(result_dict):
    """Aggregate and print per-topic and per-sentiment statistics.

    For every topic across the scraped tweets, prints its frequency and the
    average number of likes, retweets and replies, plus a "reach average"
    (mean of those three averages); then prints each sentiment's frequency.

    :param result_dict: maps tweet text -> dict with keys 'nlikes',
        'nreplies', 'nretweets', 'topic' (list of str) and 'sentiment' (str).
    """
    user = ""  # NOTE(review): never populated — the USER line always prints empty; confirm intent.
    freq_dict = {}
    mean_likes = {}
    mean_retweets = {}
    mean_replies = {}
    sentiment_dict = {}
    nbr_sentiment = 0
    nbr_topics = 0

    for value in result_dict.values():
        nlikes = value['nlikes']
        nreplies = value['nreplies']
        nretweets = value['nretweets']
        topic_list = value['topic']
        sentiment = value['sentiment']

        # Count sentiment frequency; nbr_sentiment counts DISTINCT labels
        # (matches the "NBR OF DIFFERENT SENTIMENTS" header below).
        if sentiment in sentiment_dict:
            sentiment_dict[sentiment] += 1
        else:
            sentiment_dict[sentiment] = 1
            nbr_sentiment += 1

        for topic in topic_list:
            # Count topic frequency; nbr_topics counts DISTINCT topics.
            if topic in freq_dict:
                freq_dict[topic] += 1
            else:
                freq_dict[topic] = 1
                nbr_topics += 1

            # Accumulate per-topic totals; averaged below.
            mean_likes[topic] = mean_likes.get(topic, 0) + nlikes
            mean_retweets[topic] = mean_retweets.get(topic, 0) + nretweets
            mean_replies[topic] = mean_replies.get(topic, 0) + nreplies

    # Turn totals into means.
    for topic in freq_dict:
        mean_likes[topic] = mean_likes[topic] / freq_dict[topic]
        mean_retweets[topic] = mean_retweets[topic] / freq_dict[topic]
        # BUG FIX: replies were printed as raw totals under the label
        # "AVERAGE NBR OF REPLIES"; average them like likes/retweets.
        mean_replies[topic] = mean_replies[topic] / freq_dict[topic]

    # Header.
    print('\033[1m' + "USER: " + '\033[0m', user)
    print('\033[1m' + "NBR OF TWEETS SCRAPED: " + '\033[0m', len(result_dict))
    print('\033[1m' + "NBR OF DIFFERENT TOPICS: " + '\033[0m', nbr_topics, "\n")
    print("{:<60} {:<20} {:<30} {:<30} {:<30} {:<30}".format('\033[1m' + 'TOPIC', 'TOPIC FREQUENCY',
                                                             'AVERAGE NBR OF LIKES', 'AVERAGE NBR OF RETWEETS',
                                                             'AVERAGE NBR OF REPLIES', 'REACH AVERAGE' + '\033[0m'))

    # One row per topic. BUG FIX: the old loop rebound the name `mean_likes`
    # to a float while iterating that very dict; use distinct local names.
    for topic, avg_likes in mean_likes.items():
        reach_avg = (avg_likes + mean_retweets[topic] + mean_replies[topic]) / 3
        print(
            "{:<60} {:<20} {:<30} {:<30} {:<30} {:<30}".format(topic, freq_dict[topic], "{:.2f}".format(avg_likes),
                                                               "{:.2f}".format(mean_retweets[topic]),
                                                               "{:.2f}".format(mean_replies[topic]),
                                                               "{:.2f}".format(reach_avg)))

    print("\n")
    print('\033[1m' + "NBR OF DIFFERENT SENTIMENTS: " + '\033[0m', nbr_sentiment, "\n")
    print("{:<60} {:<20}".format('\033[1m' + 'SENTIMENT', 'SENTIMENT FREQUENCY' + '\033[0m'))
    # BUG FIX: the old row passed three arguments to a two-slot format string
    # (the extra "mean" was silently ignored); print sentiment and count only.
    for sentiment, count in sentiment_dict.items():
        print("{:<60} {:<20}".format(sentiment, count))
|
170 |
-
|
171 |
-
|
172 |
-
if __name__ == '__main__':
    # Smoke test: scrape a small sample of tweets and inspect the frame.
    # BUG FIX: the old code called `tf.TwitterScraper(...)` but `tf` was never
    # defined (NameError); use the imported `TwitterScraper` module, matching
    # the `TwitterScraper.TwitterScraper(...)` usage elsewhere in this commit.
    sc = TwitterScraper.TwitterScraper(num_tweets=40)
    dc = sc.scrape_by_user("jimmieakesson")
    print(dc.head())
    print(dc.shape)
|
|
|
1 |
import os
from datetime import date

import openai
import regex as re

from twitterscraper import TwitterScraper
|
|
|
5 |
|
6 |
|
7 |
class TextClassifier:
    """Skeleton classifier that labels scraped tweets via the OpenAI API."""

    def __init__(self, model_name="text-davinci-002", from_date='2022-01-01', to_date=None, num_tweets=100):
        """
        Initializes the TextClassifier.

        :param model_name: name of the model from openai.
        :param from_date: string of the format 'YYYY-MM-DD'.
        :param to_date: string of the format 'YYYY-MM-DD'; defaults to today.
        :param num_tweets: integer value of the maximum number of tweets to be scraped.
        """
        # BUG FIX: the old default `to_date=str(date.today())` was evaluated
        # once at import time, so a long-running process kept a stale date.
        # Resolve "today" at call time instead (backward-compatible).
        if to_date is None:
            to_date = str(date.today())

        self.model_name = model_name
        self.df = TwitterScraper.TwitterScraper(from_date, to_date, num_tweets)
        # SECURITY: a live OpenAI secret key was hard-coded here (and is now
        # burned into git history — revoke it). Load from the environment.
        self.api_key = os.environ.get("OPENAI_API_KEY", "")

    def classify_sentiment(self, text: str):
        """
        Classifies the sentiment of a text.
        """
        # TODO: skeleton — not yet implemented.

    def classify_topics(self, text: str):
        """
        Classifies the topics of a text.
        """
        # TODO: skeleton — not yet implemented.

    def __repr__(self):
        # BUG FIX: the old repr interpolated `self.col`, an attribute that is
        # never assigned anywhere in the class, so repr() raised AttributeError.
        return f"TextClassifier(df={self.df}, model_name={self.model_name})"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|