13macattack37 committed on
Commit
dc67c78
1 Parent(s): 205426f

Added the text classifier class to the repo

Browse files
Files changed (1) hide show
  1. text-classifier/text_classifier.py +168 -0
text-classifier/text_classifier.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import openai
3
+ import regex as re
4
import os

# Read the OpenAI API key from the environment instead of hard-coding it.
# SECURITY: the key that was previously committed on this line is exposed in
# the repository history and must be revoked and rotated.
# If OPENAI_API_KEY is unset this assigns None and API calls will fail to
# authenticate, which is preferable to shipping a secret in source control.
openai.api_key = os.environ.get("OPENAI_API_KEY")
5
+
6
class text_classifier:
    """Classify tweets by topic and sentiment using OpenAI text completions.

    Every method is a static helper, normally called on the class itself,
    e.g. ``text_classifier.classify_topics(tweets)``. The tweet dictionaries
    handled here are keyed by tweet text; ``insert_predictions`` and
    ``print_stats`` additionally expect each value to be a dict (see their
    docstrings for the keys they use).
    """

    # Completion model shared by both classifiers.
    # NOTE(review): this relies on the legacy ``openai.Completion`` interface;
    # confirm that the installed ``openai`` package still provides it.
    _MODEL = "text-davinci-002"

    # Matches a list enumerator such as "1." or "2)" that starts the text or
    # follows whitespace. Digits inside a topic ("COVID-19", "5G") do not
    # match, so they are preserved.
    _ENUMERATOR = r"(?<!\S)\d+[.)]\s*"

    @staticmethod
    def _complete(prompt, max_tokens):
        """Send ``prompt`` to the completion model and return the raw text."""
        response = openai.Completion.create(
            model=text_classifier._MODEL,
            prompt=prompt,
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
        )
        return response.choices[0]['text']

    @staticmethod
    def classify_topics(tweet_dict):
        """Ask the model for a general topic and two sub-topics per tweet.

        Args:
            tweet_dict: Mapping whose keys are the tweet texts to classify.

        Returns:
            Dict mapping each tweet text to its list of cleaned topic strings
            (see ``cleanup_results``).
        """
        prediction_list = []
        for tweet in tweet_dict:
            prompt_string = (
                "Classify this tweet with a general topic and two sub-topics:\n\""
                + tweet
                + "\".\nGeneral topic: \nSub topic 1: \nSub topic 2:\n. The classifications should not be more than 5 words. Numerate each topic in the output. END"
            )
            prediction_list.append(text_classifier._complete(prompt_string, 892))

        return text_classifier.cleanup_results(prediction_list, tweet_dict)

    @staticmethod
    def classify_sentiments(tweet_dict):
        """Ask the model for one sentiment label per tweet.

        Args:
            tweet_dict: Mapping whose keys are the tweet texts to classify.

        Returns:
            List of raw (uncleaned) completion texts, one per tweet, in the
            iteration order of ``tweet_dict``.
        """
        prediction_list = []
        for tweet in tweet_dict:
            prompt_string = (
                "Classify one sentiment for this tweet:\n \""
                + tweet
                + "\" \nFor example:\nSupport,\nOpposition,\nCriticism,\nPraise,\nDisagreement,\nAgreement,\nSkepticism,\nAdmiration,\nAnecdotes,\nJokes,\nMemes,\nSarcasm,\nSatire,\nQuestions,\nStatements,\nOpinions,\nPredictions.\nSENTIMENT="
            )
            prediction_list.append(text_classifier._complete(prompt_string, 256))

        return prediction_list

    @staticmethod
    def cleanup_results(prediction_list, tweet_dict):
        """Turn raw enumerated completions into per-tweet topic lists.

        Each raw prediction is expected to look like
        ``"\\n\\n1. Topic\\n2. Sub topic\\n3. Sub topic"``. Whitespace is
        collapsed, the text is split at the enumerators, and every piece is
        stripped of surrounding spaces and periods. Empty pieces are dropped.
        Digits and periods inside a topic are kept.

        Args:
            prediction_list: Raw completion texts, in the same order as the
                keys of ``tweet_dict``.
            tweet_dict: Mapping whose keys are the tweet texts.

        Returns:
            Dict mapping each tweet text to its list of topic strings.

        Raises:
            IndexError: If there are fewer predictions than tweets.
        """
        if len(prediction_list) < len(tweet_dict):
            raise IndexError("fewer predictions than tweets")

        predictions_cleaned = []
        for raw in prediction_list:
            # Collapse newlines and repeated spaces into single spaces.
            normalized = " ".join(raw.split())
            pieces = re.split(text_classifier._ENUMERATOR, normalized)
            topics = [piece.strip(" .") for piece in pieces]
            predictions_cleaned.append([topic for topic in topics if topic])

        # Surplus predictions (beyond the number of tweets) are ignored.
        return dict(zip(tweet_dict, predictions_cleaned))

    @staticmethod
    def insert_predictions(tweet_dict, results):
        """Store each tweet's topics under its ``'topic'`` key, in place.

        Args:
            tweet_dict: Mapping of tweet text to a per-tweet dict; modified
                in place.
            results: Mapping of tweet text to topic list; every key must
                already exist in ``tweet_dict``.

        Returns:
            The same ``tweet_dict`` object that was passed in.
        """
        for tweet, topics in results.items():
            tweet_dict[tweet]['topic'] = topics
        return tweet_dict

    @staticmethod
    def print_results(results_dict):
        """Print every tweet followed by its predictions and a separator."""
        print('\033[1m' + "RESULTS" + '\033[0m', "\n")
        for tweet, predictions in results_dict.items():
            print("\"" + tweet + "\"" + "\n" + str(predictions), "\n" + "---------------------------------")

    @staticmethod
    def print_stats(result_dict):
        """Print per-topic frequency and average engagement figures.

        Args:
            result_dict: Mapping of tweet text to a dict holding the numeric
                keys ``'nlikes'``, ``'nreplies'``, ``'nretweets'`` and the
                list ``'topic'``.

        For every distinct topic the table shows how many times it occurs and
        the mean likes, retweets and replies of the tweets carrying it. The
        reach average is the mean of those three means.
        """
        user = ""  # no user name is available in this method; printed blank

        # topic -> [occurrences, total likes, total retweets, total replies]
        totals = {}
        for value in result_dict.values():
            nlikes = value['nlikes']
            nreplies = value['nreplies']
            nretweets = value['nretweets']
            for topic in value['topic']:
                entry = totals.setdefault(topic, [0, 0, 0, 0])
                entry[0] += 1
                entry[1] += nlikes
                entry[2] += nretweets
                entry[3] += nreplies

        bold, reset = '\033[1m', '\033[0m'
        row = "{:<60} {:<20} {:<30} {:<30} {:<30} {:<30}"

        print(bold + "USER: " + reset, user)
        print(bold + "NBR OF TWEETS SCRAPED: " + reset, len(result_dict))
        print(bold + "NBR OF DIFFERENT TOPICS: " + reset, len(totals), "\n", "\n")
        print(row.format(bold + 'TOPIC', 'TOPIC FREQUENCY', 'AVERAGE NBR OF LIKES', 'AVERAGE NBR OF RETWEETS', 'AVERAGE NBR OF REPLIES', 'REACH AVERAGE' + reset))

        for topic, (count, likes, retweets, replies) in totals.items():
            avg_likes = likes / count
            avg_retweets = retweets / count
            # Replies are averaged like the other columns so that the column
            # header and the reach average are consistent.
            avg_replies = replies / count
            reach_avg = (avg_likes + avg_retweets + avg_replies) / 3
            print(row.format(
                topic,
                count,
                "{:.2f}".format(avg_likes),
                "{:.2f}".format(avg_retweets),
                "{:.2f}".format(avg_replies),
                "{:.2f}".format(reach_avg),
            ))
167
+
168
+