Spaces:
Runtime error
Runtime error
prettier
Browse files- text-classifier/TextClassifier.py +86 -84
- text-classifier/main.py +48 -38
text-classifier/TextClassifier.py
CHANGED
@@ -1,87 +1,87 @@
|
|
1 |
-
|
2 |
-
import openai
|
3 |
import regex as re
|
4 |
-
|
|
|
|
|
5 |
|
6 |
class TextClassifier:
|
7 |
-
|
8 |
-
def classify_topics(tweet_dict):
|
9 |
-
tweet_list = list(tweet_dict.keys())
|
10 |
-
prediction_dict = {}
|
11 |
|
12 |
-
|
13 |
-
|
|
|
|
|
|
|
|
|
14 |
prompt_string += tweet
|
15 |
-
prompt_string += "\".\nGeneral topic: \nSub topic 1: \nSub topic 2:\n. The classifications should not be
|
|
|
16 |
response = openai.Completion.create(
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
)
|
25 |
|
26 |
-
classifications_unclean = response.choices[0]['text']
|
27 |
-
prediction_dict[tweet] = classifications_unclean
|
28 |
|
29 |
return TextClassifier.cleanup_topic_results(prediction_dict, tweet_dict)
|
30 |
-
|
31 |
-
|
32 |
def classify_sentiments(tweet_dict):
|
33 |
-
tweet_list = list(tweet_dict.keys())
|
34 |
|
35 |
for tweet in tweet_list:
|
36 |
prompt_string = "Classify one sentiment for this tweet:\n \""
|
37 |
prompt_string += tweet
|
38 |
-
prompt_string += "\" \nFor example:\nSupport,\nOpposition,\nCriticism,\nPraise,\nDisagreement
|
39 |
-
|
|
|
|
|
40 |
response = openai.Completion.create(
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
)
|
49 |
-
classifications_unclean = response.choices[0]['text']
|
50 |
tweet_dict[tweet]['sentiment'] = classifications_unclean
|
51 |
-
|
52 |
-
return tweet_dict
|
53 |
|
|
|
54 |
|
55 |
-
def cleanup_topic_results(prediction_dict, tweet_dict):
|
56 |
-
temp_list = []
|
57 |
|
58 |
for tweet, item in prediction_dict.items():
|
59 |
-
temp_list = []
|
60 |
new_item = item.replace("\n", " ")
|
61 |
new_item = new_item.replace(" ", " ")
|
62 |
new_item = new_item[4:]
|
63 |
-
new_item = re.sub('\d', '', new_item)
|
64 |
sub_list = new_item.split(".")
|
65 |
-
|
66 |
-
for item in sub_list:
|
67 |
if item.startswith(' '):
|
68 |
item = item[1:]
|
69 |
if item.endswith(' '):
|
70 |
-
item = item[:-1]
|
71 |
-
temp_list.append(item)
|
72 |
-
tweet_dict[tweet]['topic'] = temp_list
|
73 |
-
|
74 |
-
return tweet_dict
|
75 |
|
|
|
76 |
|
77 |
def print_results(results_dict):
|
78 |
-
print('\033[1m' + "RESULTS" + '\033[0m', "\n")
|
79 |
for key in results_dict.keys():
|
80 |
predictions = results_dict[key]
|
81 |
-
print("\"" + key + "\"" + "\n"+ str(predictions),"\n" + "---------------------------------")
|
82 |
|
83 |
-
|
84 |
-
def print_stats(result_dict):
|
85 |
user = ""
|
86 |
freq_dict = {}
|
87 |
mean_likes = {}
|
@@ -93,74 +93,76 @@ class TextClassifier:
|
|
93 |
|
94 |
for key, value in result_dict.items():
|
95 |
|
96 |
-
nlikes = value['nlikes']
|
97 |
nreplies = value['nreplies']
|
98 |
nretweets = value['nretweets']
|
99 |
-
topic_list = value['topic']
|
100 |
sentiment = value['sentiment']
|
101 |
|
102 |
-
|
103 |
if sentiment in sentiment_dict.keys():
|
104 |
sentiment_dict[sentiment] += 1
|
105 |
else:
|
106 |
-
sentiment_dict[sentiment] = 1
|
107 |
-
nbr_sentiment += 1
|
108 |
-
|
109 |
-
|
110 |
for topic in topic_list:
|
111 |
if topic in freq_dict.keys():
|
112 |
freq_dict[topic] += 1
|
113 |
-
|
114 |
else:
|
115 |
-
freq_dict[topic] = 1
|
116 |
nbr_topics += 1
|
117 |
-
|
118 |
# Count total likes per topic
|
119 |
if topic in mean_likes.keys():
|
120 |
-
mean_likes[topic] += nlikes
|
121 |
else:
|
122 |
-
mean_likes[topic] = nlikes
|
123 |
|
124 |
-
|
125 |
if topic in mean_retweets.keys():
|
126 |
-
mean_retweets[topic] += nretweets
|
127 |
else:
|
128 |
-
mean_retweets[topic] = nretweets
|
129 |
|
130 |
-
|
131 |
if topic in mean_replies.keys():
|
132 |
-
mean_replies[topic] += nreplies
|
133 |
else:
|
134 |
-
mean_replies[topic] = nreplies
|
135 |
|
136 |
-
|
137 |
-
for key in mean_likes.keys():
|
138 |
mean_likes[key] = mean_likes[key] / freq_dict[key]
|
139 |
-
|
140 |
# Count mean of retweets
|
141 |
-
for key in mean_retweets.keys():
|
142 |
mean_retweets[key] = mean_retweets[key] / freq_dict[key]
|
143 |
|
144 |
-
|
145 |
# Print the names of the columns.
|
146 |
-
print('\033[1m' + "USER: " + '\033[0m', user)
|
147 |
-
print('\033[1m' + "NBR OF TWEETS SCRAPED: "+ '\033[0m', len(list(result_dict.keys())))
|
148 |
-
print('\033[1m' + "NBR OF DIFFERENT TOPICS: "+ '\033[0m', nbr_topics, "\n")
|
149 |
-
print("{:<60} {:<20} {:<30} {:<30} {:<30} {:<30}".format('\033[1m' + 'TOPIC', 'TOPIC FREQUENCY',
|
150 |
-
|
|
|
|
|
151 |
# print each data item.
|
152 |
for key, value in mean_likes.items():
|
153 |
topic = key
|
154 |
mean_likes = value
|
155 |
-
reach_avg = (mean_likes + mean_retweets[topic] + mean_replies[topic]
|
156 |
-
print
|
|
|
|
|
|
|
157 |
|
158 |
print("\n")
|
159 |
-
print('\033[1m' + "NBR OF DIFFERENT SENTIMENTS: "+ '\033[0m', nbr_sentiment, "\n")
|
160 |
print("{:<60} {:<20}".format('\033[1m' + 'SENTIMENT', 'SENTIMENT FREQUENCY' + '\033[0m'))
|
161 |
for key, value in sentiment_dict.items():
|
162 |
sentiment = key
|
163 |
-
mean_sentiment = value
|
164 |
-
print
|
165 |
-
|
166 |
-
|
|
|
1 |
+
import openai
|
|
|
2 |
import regex as re
|
3 |
+
|
4 |
+
import os

# Credentials must not live in source control: take the key from the
# OPENAI_API_KEY environment variable instead. Any key that has ever been
# committed to a repository should be treated as compromised and rotated.
openai.api_key = os.environ.get("OPENAI_API_KEY")
|
5 |
+
|
6 |
|
7 |
class TextClassifier:
|
|
|
|
|
|
|
|
|
8 |
|
9 |
+
def classify_topics(tweet_dict):
    """Request a general topic and two sub-topics for every tweet.

    Each key of ``tweet_dict`` is embedded in its own prompt and sent to the
    completion endpoint. The raw completion texts are collected per tweet and
    passed, together with ``tweet_dict``, to
    ``TextClassifier.cleanup_topic_results``; that call's result is returned.
    """
    prompt_head = "Classify this tweet with a general topic and two sub-topics:\n\""
    prompt_tail = ("\".\nGeneral topic: \nSub topic 1: \nSub topic 2:\n. The classifications should not be "
                   "more than 5 words. Numerate each topic in the output. END ")
    raw_predictions = {}

    # Iterate over a snapshot of the keys rather than the live dict view.
    for tweet in list(tweet_dict.keys()):
        completion = openai.Completion.create(
            model="text-davinci-002",
            prompt=prompt_head + tweet + prompt_tail,
            temperature=0,
            max_tokens=892,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        raw_predictions[tweet] = completion.choices[0]['text']

    return TextClassifier.cleanup_topic_results(raw_predictions, tweet_dict)
|
32 |
+
|
|
|
33 |
def classify_sentiments(tweet_dict):
    """Request one sentiment label for every tweet and store it in place.

    Each key of ``tweet_dict`` is embedded in its own prompt; the raw text of
    the first completion choice is written to ``tweet_dict[tweet]['sentiment']``.
    The same (mutated) ``tweet_dict`` is returned.
    """
    prompt_head = "Classify one sentiment for this tweet:\n \""
    prompt_tail = ("\" \nFor example:\nSupport,\nOpposition,\nCriticism,\nPraise,\nDisagreement,"
                   "\nAgreement,\nSkepticism,\nAdmiration,\nAnecdotes,\nJokes,\nMemes,\nSarcasm,\nSatire,"
                   "\nQuestions,\nStatements,\nOpinions,\nPredictions.\nSENTIMENT= ")

    # Iterate over a snapshot of the keys rather than the live dict view.
    for tweet in list(tweet_dict.keys()):
        completion = openai.Completion.create(
            model="text-davinci-002",
            prompt=prompt_head + tweet + prompt_tail,
            temperature=0,
            max_tokens=256,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        tweet_dict[tweet]['sentiment'] = completion.choices[0]['text']

    return tweet_dict
|
56 |
|
57 |
+
def cleanup_topic_results(prediction_dict, tweet_dict):
    """Turn raw topic completions into clean topic lists stored on each tweet.

    For every ``tweet -> raw text`` pair in ``prediction_dict`` the text is
    flattened to one line, stripped of digits (the numbering of the topics),
    split on ``"."`` and each fragment is whitespace-normalised. Empty
    fragments are discarded. The resulting list is written to
    ``tweet_dict[tweet]['topic']``.

    Args:
        prediction_dict: mapping of tweet text to the raw completion text.
        tweet_dict: mapping of tweet text to its data dict; mutated in place.

    Returns:
        The same ``tweet_dict`` object, with ``'topic'`` set for every tweet
        present in ``prediction_dict``.
    """
    for tweet, raw_text in prediction_dict.items():
        one_line = raw_text.replace("\n", " ")
        # Digits only carry the "1.", "2.", ... numbering. Removing them first
        # means the leading numbering yields an empty fragment that is filtered
        # out below, so no fixed-width prefix has to be sliced off.
        without_digits = re.sub(r'\d', '', one_line)

        topics = []
        for fragment in without_digits.split("."):
            # split()/join trims both ends and collapses internal runs of
            # whitespace in a single step.
            topic = " ".join(fragment.split())
            if topic:
                topics.append(topic)

        tweet_dict[tweet]['topic'] = topics

    return tweet_dict
|
77 |
|
78 |
def print_results(results_dict):
    """Print every entry of ``results_dict`` under a bold "RESULTS" heading.

    Each key is shown in double quotes, followed on the next line by the
    ``str()`` of its value and then a dashed separator line.
    """
    heading = '\033[1m' + "RESULTS" + '\033[0m'
    print(heading, "\n")

    for tweet, predictions in results_dict.items():
        quoted_tweet = "\"" + tweet + "\""
        separator = "\n" + "---------------------------------"
        print(quoted_tweet + "\n" + str(predictions), separator)
|
83 |
|
84 |
+
def print_stats(result_dict):
    """Print per-topic engagement averages and sentiment frequencies.

    Args:
        result_dict: mapping of tweet text to a dict providing ``'nlikes'``,
            ``'nreplies'``, ``'nretweets'``, ``'topic'`` (an iterable of topic
            labels) and ``'sentiment'``.

    Returns:
        None. Everything is written to stdout: a summary header, one row per
        distinct topic (frequency, mean likes, mean retweets, mean replies and
        the mean of those three means), then one row per distinct sentiment
        with its frequency.
    """
    # No user name is ever assigned in this function, so the header shows it blank.
    user = ""

    # NOTE(review): the statements that first initialise these accumulators are
    # not visible in the source this was recovered from; empty dicts are
    # assumed from how the names are used - confirm against the full file.
    freq_dict = {}
    total_likes = {}
    total_retweets = {}
    total_replies = {}
    sentiment_dict = {}

    for value in result_dict.values():
        nlikes = value['nlikes']
        nreplies = value['nreplies']
        nretweets = value['nretweets']
        sentiment = value['sentiment']

        # Count sentiment frequency
        sentiment_dict[sentiment] = sentiment_dict.get(sentiment, 0) + 1

        # A tweet contributes its engagement numbers to each of its topics.
        for topic in value['topic']:
            freq_dict[topic] = freq_dict.get(topic, 0) + 1
            total_likes[topic] = total_likes.get(topic, 0) + nlikes
            total_retweets[topic] = total_retweets.get(topic, 0) + nretweets
            total_replies[topic] = total_replies.get(topic, 0) + nreplies

    bold = '\033[1m'
    reset = '\033[0m'
    topic_row = "{:<60} {:<20} {:<30} {:<30} {:<30} {:<30}"
    sentiment_row = "{:<60} {:<20}"

    # Print the names of the columns.
    print(bold + "USER: " + reset, user)
    print(bold + "NBR OF TWEETS SCRAPED: " + reset, len(result_dict))
    print(bold + "NBR OF DIFFERENT TOPICS: " + reset, len(freq_dict), "\n")
    print(topic_row.format(bold + 'TOPIC', 'TOPIC FREQUENCY',
                           'AVERAGE NBR OF LIKES', 'AVERAGE NBR OF RETWEETS',
                           'AVERAGE NBR OF REPLIES', 'REACH AVERAGE' + reset))

    # Print each data item.
    for topic, count in freq_dict.items():
        # All three totals are averaged over the topic frequency, so the
        # replies column and the reach average are true per-tweet means.
        likes_avg = total_likes[topic] / count
        retweets_avg = total_retweets[topic] / count
        replies_avg = total_replies[topic] / count
        reach_avg = (likes_avg + retweets_avg + replies_avg) / 3
        print(topic_row.format(topic, count,
                               "{:.2f}".format(likes_avg),
                               "{:.2f}".format(retweets_avg),
                               "{:.2f}".format(replies_avg),
                               "{:.2f}".format(reach_avg)))

    print("\n")
    print(bold + "NBR OF DIFFERENT SENTIMENTS: " + reset, len(sentiment_dict), "\n")
    print(sentiment_row.format(bold + 'SENTIMENT', 'SENTIMENT FREQUENCY' + reset))
    for sentiment, count in sentiment_dict.items():
        print(sentiment_row.format(sentiment, count))
|
|
|
|
text-classifier/main.py
CHANGED
@@ -1,43 +1,54 @@
|
|
1 |
from TextClassifier import TextClassifier
|
2 |
|
3 |
# Some examples of tweets:
|
4 |
-
data_dict = {
|
5 |
-
'
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
'
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
# Classify the TOPICS and insert the results into the data dictionary found above
|
43 |
topic_results = TextClassifier.classify_topics(data_dict)
|
@@ -45,4 +56,3 @@ topic_results = TextClassifier.classify_topics(data_dict)
|
|
45 |
sentiment_results = TextClassifier.classify_sentiments(data_dict)
|
46 |
# Print simple statistics related to TOPICS and SENTIMENTS
|
47 |
TextClassifier.print_stats(sentiment_results)
|
48 |
-
|
|
|
1 |
from TextClassifier import TextClassifier
|
2 |
|
3 |
# Some examples of tweets:
|
4 |
+
# Sample input: each key is the tweet text, each value holds the posting hour,
# the engagement counters and empty 'topic' / 'sentiment' placeholders.
data_dict = {
    '25 years ago we made a promise to the people of Hong Kong. We intend to keep it. https://t.co/nIN96ZydgV': {
        'hour': '17',
        'nlikes': 7878,
        'nreplies': 2999,
        'nretweets': 1993,
        'topic': '',
        'sentiment': '',
    },
    'A huge delight to meet @SwedishPM Magdalena Andersson and President @niinisto again. The accession of Finland '
    'and Sweden to @NATO will permanently strengthen our defensive Alliance, helping to keep us all safe. #WeAreNATO '
    ' https://t.co/pArvdWHr2F': {
        'hour': '16',
        'nlikes': 3468,
        'nreplies': 686,
        'nretweets': 435,
        'topic': '',
        'sentiment': '',
    },
    'At this @NATO Leaders’ Summit, I’ll be urging fellow nations to continue to do everything they can to support '
    'Ukraine. The UK has always played a historic role in the @NATO alliance, working to address the biggest global '
    'threats and build a more secure world.': {
        'hour': '07',
        'nlikes': 7742,
        'nreplies': 1838,
        'nretweets': 1112,
        'topic': '',
        'sentiment': '',
    },
    'Morgan Johansson måste avgå som minister. Otryggheten biter sig fast och gängkriminaliteten är allt annat än knäckt. Antalet skjutningar ökar och sätter skräck i varje del av vårt land. Sverige har förvandlats till ett gangsterland.': {
        'hour': '16',
        'nlikes': 3468,
        'nreplies': 686,
        'nretweets': 435,
        'topic': '',
        'sentiment': '',
    },
    'Döms man för brott, särskilt våldsbrott, ska man vara inlåst från det att domen faller tills straffet är avtjänat. Allt annat är vansinne.': {
        'hour': '16',
        'nlikes': 3468,
        'nreplies': 686,
        'nretweets': 435,
        'topic': '',
        'sentiment': '',
    },
    # Multi-line tweet; the backslash in the shrug emoticon is escaped
    # explicitly so the literal contains no invalid escape sequence.
    'Motionerna: \n'
    'K339 avslogs av enig riksdag (inkl KD).\n'
    'K220 avslogs av enig riksdag (inkl KD).\n'
    '1601 avslogs av enig riksdag (inkl KD).\n'
    'K281 avslogs av enig riksdag (inkl KD).\n'
    '\n'
    '¯\\_(ツ)_/¯': {
        'hour': '16',
        'nlikes': 3468,
        'nreplies': 686,
        'nretweets': 435,
        'topic': '',
        'sentiment': '',
    },
}
|
52 |
|
53 |
# Only run the classification when executed as a script, so importing this
# module does not trigger any API requests.
if __name__ == "__main__":
    # Classify the TOPICS and insert the results into the data dictionary found above
    topic_results = TextClassifier.classify_topics(data_dict)
    # Classify the SENTIMENTS; the labels are written into the same dictionary,
    # which is also what the call returns
    sentiment_results = TextClassifier.classify_sentiments(data_dict)
    # Print simple statistics related to TOPICS and SENTIMENTS
    TextClassifier.print_stats(sentiment_results)
|
|