Demea9000 commited on
Commit
130dfd8
1 Parent(s): 510f63c

added skeleton to TextClassifier

Browse files
Files changed (1) hide show
  1. textclassifier/TextClassifier.py +27 -171
textclassifier/TextClassifier.py CHANGED
@@ -1,176 +1,32 @@
1
  import openai
2
  import regex as re
3
- from twitterscraper import TwitterScraper as tf
4
-
5
- openai.api_key = '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'
6
 
7
 
8
  class TextClassifier:
9
-
10
- def classify_topics(tweet_dict):
11
- tweet_list = list(tweet_dict.keys())
12
- prediction_dict = {}
13
-
14
- for tweet in tweet_list:
15
- prompt_string = "Classify this tweet with a general topic and two sub-topics:\n\""
16
- prompt_string += tweet
17
- prompt_string += "\".\nGeneral topic: \nSub topic 1: \nSub topic 2:\n. The classifications should not be " \
18
- "more than 5 words. Numerate each topic in the output. END "
19
- response = openai.Completion.create(
20
- model="text-davinci-002",
21
- prompt=prompt_string,
22
- temperature=0,
23
- max_tokens=892,
24
- top_p=1,
25
- frequency_penalty=0,
26
- presence_penalty=0
27
- )
28
-
29
- classifications_unclean = response.choices[0]['text']
30
- prediction_dict[tweet] = classifications_unclean
31
-
32
- return TextClassifier.cleanup_topic_results(prediction_dict, tweet_dict)
33
-
34
- def classify_sentiments(tweet_dict):
35
- tweet_list = list(tweet_dict.keys())
36
-
37
- for tweet in tweet_list:
38
- prompt_string = "Classify one sentiment for this tweet:\n \""
39
- prompt_string += tweet
40
- prompt_string += "\" \nFor example:\nSupport,\nOpposition,\nCriticism,\nPraise,\nDisagreement," \
41
- "\nAgreement,\nSkepticism,\nAdmiration,\nAnecdotes,\nJokes,\nMemes,\nSarcasm,\nSatire," \
42
- "\nQuestions,\nStatements,\nOpinions,\nPredictions.\nSENTIMENT= "
43
-
44
- response = openai.Completion.create(
45
- model="text-davinci-002",
46
- prompt=prompt_string,
47
- temperature=0,
48
- max_tokens=256,
49
- top_p=1,
50
- frequency_penalty=0,
51
- presence_penalty=0
52
- )
53
- classifications_unclean = response.choices[0]['text']
54
- tweet_dict[tweet]['sentiment'] = classifications_unclean
55
-
56
- return tweet_dict
57
-
58
- def cleanup_topic_results(prediction_dict, tweet_dict):
59
- temp_list = []
60
-
61
- for tweet, item in prediction_dict.items():
62
- temp_list = []
63
- new_item = item.replace("\n", " ")
64
- new_item = new_item.replace(" ", " ")
65
- new_item = new_item[4:]
66
- new_item = re.sub('\d', '', new_item)
67
- sub_list = new_item.split(".")
68
-
69
- for item in sub_list:
70
- if item.startswith(' '):
71
- item = item[1:]
72
- if item.endswith(' '):
73
- item = item[:-1]
74
- temp_list.append(item)
75
- tweet_dict[tweet]['topic'] = temp_list
76
-
77
- return tweet_dict
78
-
79
- def print_results(results_dict):
80
- print('\033[1m' + "RESULTS" + '\033[0m', "\n")
81
- for key in results_dict.keys():
82
- predictions = results_dict[key]
83
- print("\"" + key + "\"" + "\n" + str(predictions), "\n" + "---------------------------------")
84
-
85
    def print_stats(result_dict):
        """Aggregate and print engagement statistics per topic and sentiment.

        :param result_dict: dict mapping tweet text -> metadata dict containing
            'nlikes', 'nreplies', 'nretweets', 'topic' (list) and 'sentiment'.

        Prints (no return value): per-topic frequency, average likes/retweets,
        reply totals, a "reach average", and per-sentiment frequencies.
        """
        # NOTE(review): user is never populated — the USER line always prints empty.
        user = ""
        freq_dict = {}        # topic -> occurrence count
        mean_likes = {}       # topic -> total likes, later divided into a mean
        mean_retweets = {}    # topic -> total retweets, later divided into a mean
        mean_replies = {}     # topic -> total replies
        sentiment_dict = {}   # sentiment label -> occurrence count
        nbr_sentiment = 0     # number of DISTINCT sentiments seen
        nbr_topics = 0        # number of DISTINCT topics seen

        for key, value in result_dict.items():

            nlikes = value['nlikes']
            nreplies = value['nreplies']
            nretweets = value['nretweets']
            topic_list = value['topic']
            sentiment = value['sentiment']

            # Count sentiment frequency
            if sentiment in sentiment_dict.keys():
                sentiment_dict[sentiment] += 1
            else:
                sentiment_dict[sentiment] = 1
                nbr_sentiment += 1

            # Count topic frequency
            for topic in topic_list:
                if topic in freq_dict.keys():
                    freq_dict[topic] += 1

                else:
                    freq_dict[topic] = 1
                    nbr_topics += 1

                # Count total likes per topic
                if topic in mean_likes.keys():
                    mean_likes[topic] += nlikes
                else:
                    mean_likes[topic] = nlikes

                # Count total retweets per topic
                if topic in mean_retweets.keys():
                    mean_retweets[topic] += nretweets
                else:
                    mean_retweets[topic] = nretweets

                # Count total replies per topic
                if topic in mean_replies.keys():
                    mean_replies[topic] += nreplies
                else:
                    mean_replies[topic] = nreplies

        # Count mean of likes
        for key in mean_likes.keys():
            mean_likes[key] = mean_likes[key] / freq_dict[key]

        # Count mean of retweets
        for key in mean_retweets.keys():
            mean_retweets[key] = mean_retweets[key] / freq_dict[key]

        # NOTE(review): mean_replies is never divided by freq_dict, yet it is
        # printed under "AVERAGE NBR OF REPLIES" below — it is actually a TOTAL.

        # Print the names of the columns.
        print('\033[1m' + "USER: " + '\033[0m', user)
        print('\033[1m' + "NBR OF TWEETS SCRAPED: " + '\033[0m', len(list(result_dict.keys())))
        print('\033[1m' + "NBR OF DIFFERENT TOPICS: " + '\033[0m', nbr_topics, "\n")
        print("{:<60} {:<20} {:<30} {:<30} {:<30} {:<30}".format('\033[1m' + 'TOPIC', 'TOPIC FREQUENCY',
                                                                 'AVERAGE NBR OF LIKES', 'AVERAGE NBR OF RETWEETS',
                                                                 'AVERAGE NBR OF REPLIES', 'REACH AVERAGE' + '\033[0m'))

        # print each data item.
        # NOTE(review): this loop rebinds the name mean_likes from the dict to a
        # float on the first iteration; iteration continues only because the
        # items() iterator was created before the rebinding. Fragile — verify.
        for key, value in mean_likes.items():
            topic = key
            mean_likes = value
            reach_avg = (mean_likes + mean_retweets[topic] + mean_replies[topic]) / 3
            print(
                "{:<60} {:<20} {:<30} {:<30} {:<30} {:<30}".format(topic, freq_dict[topic], "{:.2f}".format(mean_likes),
                                                                   "{:.2f}".format(mean_retweets[topic]),
                                                                   mean_replies[topic], "{:.2f}".format(reach_avg)))

        print("\n")
        print('\033[1m' + "NBR OF DIFFERENT SENTIMENTS: " + '\033[0m', nbr_sentiment, "\n")
        print("{:<60} {:<20}".format('\033[1m' + 'SENTIMENT', 'SENTIMENT FREQUENCY' + '\033[0m'))
        for key, value in sentiment_dict.items():
            sentiment = key
            mean_sentiment = value
            # NOTE(review): the format string has two slots but three arguments
            # are passed — the formatted mean_sentiment is silently ignored.
            print("{:<60} {:<20}".format(sentiment, sentiment_dict[sentiment], "{:.2f}".format(mean_sentiment)))
170
-
171
-
172
if __name__ == '__main__':
    # Smoke test: scrape up to 40 tweets from one user and show the resulting
    # frame. Requires network access and the project's TwitterScraper module.
    sc = tf.TwitterScraper(num_tweets=40)
    dc = sc.scrape_by_user("jimmieakesson")
    # dc is presumably a pandas DataFrame (head()/shape) — confirm against
    # TwitterScraper.scrape_by_user.
    print(dc.head())
    print(dc.shape)
 
1
  import openai
2
  import regex as re
3
+ from twitterscraper import TwitterScraper
4
+ from datetime import date
 
5
 
6
 
7
class TextClassifier:
    def __init__(self, model_name="text-davinci-002", from_date='2022-01-01', to_date=None, num_tweets=100):
        """
        Initializes the TextClassifier.

        :param model_name: name of the completion model from openai.
        :param from_date: string of the format 'YYYY-MM-DD'.
        :param to_date: string of the format 'YYYY-MM-DD'; defaults to today's
            date, computed at call time. (A ``str(date.today())`` default in the
            signature would be frozen once at import time — a known pitfall.)
        :param num_tweets: integer value of the maximum number of tweets to be scraped.
        """
        import os  # local import: avoids touching the module's import block

        if to_date is None:
            to_date = str(date.today())

        self.model_name = model_name
        self.df = TwitterScraper.TwitterScraper(from_date, to_date, num_tweets)
        # SECURITY FIX: never hard-code API keys in source — the previously
        # committed key is exposed in history and must be revoked. Read the key
        # from the environment instead.
        self.api_key = os.environ.get("OPENAI_API_KEY", "")
        openai.api_key = self.api_key

    def classify_sentiment(self, text: str):
        """
        Classifies the sentiment of a text.

        :param text: the text to classify.
        TODO: not implemented yet — currently returns None.
        """

    def classify_topics(self, text: str):
        """
        Classifies the topics of a text.

        :param text: the text to classify.
        TODO: not implemented yet — currently returns None.
        """

    def __repr__(self):
        # FIX: the previous version referenced self.col, which is never set and
        # raised AttributeError; report only attributes that exist.
        return f"TextClassifier(df={self.df}, model_name={self.model_name})"