Demea9000 committed on
Commit eceff29
1 Parent(s): 13599ad
text-classifier/TextClassifier.py CHANGED
@@ -1,87 +1,87 @@
- 
- import openai
  import regex as re
- openai.api_key = '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'

  class TextClassifier:
- 
-     def classify_topics(tweet_dict):
-         tweet_list = list(tweet_dict.keys())
-         prediction_dict = {}

-         for tweet in tweet_list:
-             prompt_string = "Classify this tweet with a general topic and two sub-topics:\n\""
              prompt_string += tweet
-             prompt_string += "\".\nGeneral topic: \nSub topic 1: \nSub topic 2:\n. The classifications should not be more than 5 words. Numerate each topic in the output. END"
              response = openai.Completion.create(
-                 model="text-davinci-002",
-                 prompt= prompt_string,
-                 temperature=0,
-                 max_tokens=892,
-                 top_p=1,
-                 frequency_penalty=0,
-                 presence_penalty=0
              )

-             classifications_unclean = response.choices[0]['text']
-             prediction_dict[tweet] = classifications_unclean

          return TextClassifier.cleanup_topic_results(prediction_dict, tweet_dict)
- 
- 
      def classify_sentiments(tweet_dict):
-         tweet_list = list(tweet_dict.keys())

          for tweet in tweet_list:
              prompt_string = "Classify one sentiment for this tweet:\n \""
              prompt_string += tweet
-             prompt_string += "\" \nFor example:\nSupport,\nOpposition,\nCriticism,\nPraise,\nDisagreement,\nAgreement,\nSkepticism,\nAdmiration,\nAnecdotes,\nJokes,\nMemes,\nSarcasm,\nSatire,\nQuestions,\nStatements,\nOpinions,\nPredictions.\nSENTIMENT="
- 
              response = openai.Completion.create(
-                 model="text-davinci-002",
-                 prompt=prompt_string,
-                 temperature=0,
-                 max_tokens=256,
-                 top_p=1,
-                 frequency_penalty=0,
-                 presence_penalty=0
              )
-             classifications_unclean = response.choices[0]['text']
              tweet_dict[tweet]['sentiment'] = classifications_unclean
- 
-         return tweet_dict


-     def cleanup_topic_results(prediction_dict, tweet_dict):
-         temp_list = []

          for tweet, item in prediction_dict.items():
-             temp_list = []
              new_item = item.replace("\n", " ")
              new_item = new_item.replace(" ", " ")
              new_item = new_item[4:]
-             new_item = re.sub('\d', '', new_item)
              sub_list = new_item.split(".")
- 
-             for item in sub_list:
                  if item.startswith(' '):
                      item = item[1:]
                  if item.endswith(' '):
-                     item = item[:-1]
-                 temp_list.append(item)
-             tweet_dict[tweet]['topic'] = temp_list
- 
-         return tweet_dict


      def print_results(results_dict):
-         print('\033[1m' + "RESULTS" + '\033[0m', "\n")
          for key in results_dict.keys():
              predictions = results_dict[key]
-             print("\"" + key + "\"" + "\n"+ str(predictions),"\n" + "---------------------------------")

- 
-     def print_stats(result_dict):
          user = ""
          freq_dict = {}
          mean_likes = {}
@@ -93,74 +93,76 @@ class TextClassifier:

          for key, value in result_dict.items():

-             nlikes = value['nlikes']
              nreplies = value['nreplies']
              nretweets = value['nretweets']
-             topic_list = value['topic']
              sentiment = value['sentiment']

-             # Count sentiment frequency
              if sentiment in sentiment_dict.keys():
                  sentiment_dict[sentiment] += 1
              else:
-                 sentiment_dict[sentiment] = 1
-                 nbr_sentiment += 1
- 
-             # Count topic frequency
              for topic in topic_list:
                  if topic in freq_dict.keys():
                      freq_dict[topic] += 1
- 
                  else:
-                     freq_dict[topic] = 1
                      nbr_topics += 1
- 
                  # Count total likes per topic
                  if topic in mean_likes.keys():
-                     mean_likes[topic] += nlikes
                  else:
-                     mean_likes[topic] = nlikes

-                 # Count total retweets per topic
                  if topic in mean_retweets.keys():
-                     mean_retweets[topic] += nretweets
                  else:
-                     mean_retweets[topic] = nretweets

-                 # Count total replies per topic
                  if topic in mean_replies.keys():
-                     mean_replies[topic] += nreplies
                  else:
-                     mean_replies[topic] = nreplies

-         # Count mean of likes
-         for key in mean_likes.keys():
              mean_likes[key] = mean_likes[key] / freq_dict[key]
- 
          # Count mean of retweets
-         for key in mean_retweets.keys():
              mean_retweets[key] = mean_retweets[key] / freq_dict[key]

- 
          # Print the names of the columns.
-         print('\033[1m' + "USER: " + '\033[0m', user)
-         print('\033[1m' + "NBR OF TWEETS SCRAPED: "+ '\033[0m', len(list(result_dict.keys())))
-         print('\033[1m' + "NBR OF DIFFERENT TOPICS: "+ '\033[0m', nbr_topics, "\n")
-         print("{:<60} {:<20} {:<30} {:<30} {:<30} {:<30}".format('\033[1m' + 'TOPIC', 'TOPIC FREQUENCY', 'AVERAGE NBR OF LIKES', 'AVERAGE NBR OF RETWEETS', 'AVERAGE NBR OF REPLIES', 'REACH AVERAGE' + '\033[0m'))
- 
          # print each data item.
          for key, value in mean_likes.items():
              topic = key
              mean_likes = value
-             reach_avg = (mean_likes + mean_retweets[topic] + mean_replies[topic] ) / 3
-             print ("{:<60} {:<20} {:<30} {:<30} {:<30} {:<30}".format(topic, freq_dict[topic], "{:.2f}".format(mean_likes), "{:.2f}".format(mean_retweets[topic]), mean_replies[topic], "{:.2f}".format(reach_avg)))

          print("\n")
-         print('\033[1m' + "NBR OF DIFFERENT SENTIMENTS: "+ '\033[0m', nbr_sentiment, "\n")
          print("{:<60} {:<20}".format('\033[1m' + 'SENTIMENT', 'SENTIMENT FREQUENCY' + '\033[0m'))
          for key, value in sentiment_dict.items():
              sentiment = key
-             mean_sentiment = value
-             print ("{:<60} {:<20}".format(sentiment, sentiment_dict[sentiment], "{:.2f}".format(mean_sentiment)))
- 
- 
 
+ import openai
  import regex as re
+ 
+ openai.api_key = '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'
+ 

  class TextClassifier:

+     def classify_topics(tweet_dict):
+         tweet_list = list(tweet_dict.keys())
+         prediction_dict = {}
+ 
+         for tweet in tweet_list:
+             prompt_string = "Classify this tweet with a general topic and two sub-topics:\n\""
              prompt_string += tweet
+             prompt_string += "\".\nGeneral topic: \nSub topic 1: \nSub topic 2:\n. The classifications should not be " \
+                              "more than 5 words. Numerate each topic in the output. END "
              response = openai.Completion.create(
+                 model="text-davinci-002",
+                 prompt=prompt_string,
+                 temperature=0,
+                 max_tokens=892,
+                 top_p=1,
+                 frequency_penalty=0,
+                 presence_penalty=0
              )

+             classifications_unclean = response.choices[0]['text']
+             prediction_dict[tweet] = classifications_unclean

          return TextClassifier.cleanup_topic_results(prediction_dict, tweet_dict)
+ 
      def classify_sentiments(tweet_dict):
+         tweet_list = list(tweet_dict.keys())

          for tweet in tweet_list:
              prompt_string = "Classify one sentiment for this tweet:\n \""
              prompt_string += tweet
+             prompt_string += "\" \nFor example:\nSupport,\nOpposition,\nCriticism,\nPraise,\nDisagreement," \
+                              "\nAgreement,\nSkepticism,\nAdmiration,\nAnecdotes,\nJokes,\nMemes,\nSarcasm,\nSatire," \
+                              "\nQuestions,\nStatements,\nOpinions,\nPredictions.\nSENTIMENT= "
+ 
              response = openai.Completion.create(
+                 model="text-davinci-002",
+                 prompt=prompt_string,
+                 temperature=0,
+                 max_tokens=256,
+                 top_p=1,
+                 frequency_penalty=0,
+                 presence_penalty=0
              )
+             classifications_unclean = response.choices[0]['text']
              tweet_dict[tweet]['sentiment'] = classifications_unclean

+         return tweet_dict

+     def cleanup_topic_results(prediction_dict, tweet_dict):
+         temp_list = []

          for tweet, item in prediction_dict.items():
+             temp_list = []
              new_item = item.replace("\n", " ")
              new_item = new_item.replace(" ", " ")
              new_item = new_item[4:]
+             new_item = re.sub('\d', '', new_item)
              sub_list = new_item.split(".")
+ 
+             for item in sub_list:
                  if item.startswith(' '):
                      item = item[1:]
                  if item.endswith(' '):
+                     item = item[:-1]
+                 temp_list.append(item)
+             tweet_dict[tweet]['topic'] = temp_list

+         return tweet_dict

      def print_results(results_dict):
+         print('\033[1m' + "RESULTS" + '\033[0m', "\n")
          for key in results_dict.keys():
              predictions = results_dict[key]
+             print("\"" + key + "\"" + "\n" + str(predictions), "\n" + "---------------------------------")

+     def print_stats(result_dict):
          user = ""
          freq_dict = {}
          mean_likes = {}
 

          for key, value in result_dict.items():

+             nlikes = value['nlikes']
              nreplies = value['nreplies']
              nretweets = value['nretweets']
+             topic_list = value['topic']
              sentiment = value['sentiment']

+             # Count sentiment frequency
              if sentiment in sentiment_dict.keys():
                  sentiment_dict[sentiment] += 1
              else:
+                 sentiment_dict[sentiment] = 1
+                 nbr_sentiment += 1
+ 
+             # Count topic frequency
              for topic in topic_list:
                  if topic in freq_dict.keys():
                      freq_dict[topic] += 1
+ 
                  else:
+                     freq_dict[topic] = 1
                      nbr_topics += 1
+ 
                  # Count total likes per topic
                  if topic in mean_likes.keys():
+                     mean_likes[topic] += nlikes
                  else:
+                     mean_likes[topic] = nlikes

+                 # Count total retweets per topic
                  if topic in mean_retweets.keys():
+                     mean_retweets[topic] += nretweets
                  else:
+                     mean_retweets[topic] = nretweets

+                 # Count total replies per topic
                  if topic in mean_replies.keys():
+                     mean_replies[topic] += nreplies
                  else:
+                     mean_replies[topic] = nreplies

+         # Count mean of likes
+         for key in mean_likes.keys():
              mean_likes[key] = mean_likes[key] / freq_dict[key]
+ 
          # Count mean of retweets
+         for key in mean_retweets.keys():
              mean_retweets[key] = mean_retweets[key] / freq_dict[key]

          # Print the names of the columns.
+         print('\033[1m' + "USER: " + '\033[0m', user)
+         print('\033[1m' + "NBR OF TWEETS SCRAPED: " + '\033[0m', len(list(result_dict.keys())))
+         print('\033[1m' + "NBR OF DIFFERENT TOPICS: " + '\033[0m', nbr_topics, "\n")
+         print("{:<60} {:<20} {:<30} {:<30} {:<30} {:<30}".format('\033[1m' + 'TOPIC', 'TOPIC FREQUENCY',
+                                                                  'AVERAGE NBR OF LIKES', 'AVERAGE NBR OF RETWEETS',
+                                                                  'AVERAGE NBR OF REPLIES', 'REACH AVERAGE' + '\033[0m'))
+ 
          # print each data item.
          for key, value in mean_likes.items():
              topic = key
              mean_likes = value
+             reach_avg = (mean_likes + mean_retweets[topic] + mean_replies[topic]) / 3
+             print(
+                 "{:<60} {:<20} {:<30} {:<30} {:<30} {:<30}".format(topic, freq_dict[topic], "{:.2f}".format(mean_likes),
+                                                                    "{:.2f}".format(mean_retweets[topic]),
+                                                                    mean_replies[topic], "{:.2f}".format(reach_avg)))

          print("\n")
+         print('\033[1m' + "NBR OF DIFFERENT SENTIMENTS: " + '\033[0m', nbr_sentiment, "\n")
          print("{:<60} {:<20}".format('\033[1m' + 'SENTIMENT', 'SENTIMENT FREQUENCY' + '\033[0m'))
          for key, value in sentiment_dict.items():
              sentiment = key
+             mean_sentiment = value
+             print("{:<60} {:<20}".format(sentiment, sentiment_dict[sentiment], "{:.2f}".format(mean_sentiment)))
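
Both versions of TextClassifier.py above set the OpenAI key as a hardcoded string at import time. A minimal sketch of the usual alternative, not part of this commit and assuming an OPENAI_API_KEY environment variable is set before the script runs:

import os

import openai

# Sketch only: read the key from the environment instead of committing it with the code.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running.")

The rest of the module would work unchanged, since every request already goes through the module-level openai.api_key.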
 
 
text-classifier/main.py CHANGED
@@ -1,43 +1,54 @@
  from TextClassifier import TextClassifier

  # Some examples of tweets:
- data_dict = {'25 years ago we made a promise to the people of Hong Kong. We intend to keep it. https://t.co/nIN96ZydgV': {'hour': '17',
-                  'nlikes': 7878,
-                  'nreplies': 2999,
-                  'nretweets': 1993,
-                  'topic': '',
-                  'sentiment': ''},
-              'A huge delight to meet @SwedishPM Magdalena Andersson and President @niinisto again. The accession of Finland and Sweden to @NATO will permanently strengthen our defensive Alliance, helping to keep us all safe. #WeAreNATO https://t.co/pArvdWHr2F': {'hour': '16',
-                  'nlikes': 3468,
-                  'nreplies': 686,
-                  'nretweets': 435,
-                  'topic': '',
-                  'sentiment': ''},
-              'At this @NATO Leaders’ Summit, I’ll be urging fellow nations to continue to do everything they can to support Ukraine. The UK has always played a historic role in the @NATO alliance, working to address the biggest global threats and build a more secure world.': {'hour': '07',
-                  'nlikes': 7742,
-                  'nreplies': 1838,
-                  'nretweets': 1112,
-                  'topic': '',
-                  'sentiment': ''},
-              'Morgan Johansson måste avgå som minister. Otryggheten biter sig fast och gängkriminaliteten är allt annat än knäckt. Antalet skjutningar ökar och sätter skräck i varje del av vårt land. Sverige har förvandlats till ett gangsterland.': {'hour': '16',
-                  'nlikes': 3468,
-                  'nreplies': 686,
-                  'nretweets': 435,
-                  'topic': '',
-                  'sentiment': ''},
-              'Döms man för brott, särskilt våldsbrott, ska man vara inlåst från det att domen faller tills straffet är avtjänat. Allt annat är vansinne.': {'hour': '16',
-                  'nlikes': 3468,
-                  'nreplies': 686,
-                  'nretweets': 435,
-                  'topic': '',
-                  'sentiment': ''},
-              'Motionerna: ' + '\n' + 'K339 avslogs av enig riksdag (inkl KD).' + '\n' + 'K220 avslogs av enig riksdag (inkl KD).' + '\n' + '1601 avslogs av enig riksdag (inkl KD).' + '\n' + 'K281 avslogs av enig riksdag (inkl KD).' + '\n' + '\n' + '¯\_(ツ)_/¯': {'hour': '16',
-                  'nlikes': 3468,
-                  'nreplies': 686,
-                  'nretweets': 435,
-                  'topic': '',
-                  'sentiment': ''}
-              }

  # Classify the TOPICS and insert the results into the data dictionary found above
  topic_results = TextClassifier.classify_topics(data_dict)
@@ -45,4 +56,3 @@ topic_results = TextClassifier.classify_topics(data_dict)
  sentiment_results = TextClassifier.classify_sentiments(data_dict)
  # Print simple statistics related to TOPICS and SENTIMENTS
  TextClassifier.print_stats(sentiment_results)
- 
 
  from TextClassifier import TextClassifier

  # Some examples of tweets:
+ data_dict = {
+     '25 years ago we made a promise to the people of Hong Kong. We intend to keep it. https://t.co/nIN96ZydgV': {
+         'hour': '17',
+         'nlikes': 7878,
+         'nreplies': 2999,
+         'nretweets': 1993,
+         'topic': '',
+         'sentiment': ''},
+     'A huge delight to meet @SwedishPM Magdalena Andersson and President @niinisto again. The accession of Finland '
+     'and Sweden to @NATO will permanently strengthen our defensive Alliance, helping to keep us all safe. #WeAreNATO '
+     ' https://t.co/pArvdWHr2F': {
+         'hour': '16',
+         'nlikes': 3468,
+         'nreplies': 686,
+         'nretweets': 435,
+         'topic': '',
+         'sentiment': ''},
+     'At this @NATO Leaders’ Summit, I’ll be urging fellow nations to continue to do everything they can to support '
+     'Ukraine. The UK has always played a historic role in the @NATO alliance, working to address the biggest global '
+     'threats and build a more secure world.': {
+         'hour': '07',
+         'nlikes': 7742,
+         'nreplies': 1838,
+         'nretweets': 1112,
+         'topic': '',
+         'sentiment': ''},
+     'Morgan Johansson måste avgå som minister. Otryggheten biter sig fast och gängkriminaliteten är allt annat än knäckt. Antalet skjutningar ökar och sätter skräck i varje del av vårt land. Sverige har förvandlats till ett gangsterland.': {
+         'hour': '16',
+         'nlikes': 3468,
+         'nreplies': 686,
+         'nretweets': 435,
+         'topic': '',
+         'sentiment': ''},
+     'Döms man för brott, särskilt våldsbrott, ska man vara inlåst från det att domen faller tills straffet är avtjänat. Allt annat är vansinne.': {
+         'hour': '16',
+         'nlikes': 3468,
+         'nreplies': 686,
+         'nretweets': 435,
+         'topic': '',
+         'sentiment': ''},
+     'Motionerna: ' + '\n' + 'K339 avslogs av enig riksdag (inkl KD).' + '\n' + 'K220 avslogs av enig riksdag (inkl KD).' + '\n' + '1601 avslogs av enig riksdag (inkl KD).' + '\n' + 'K281 avslogs av enig riksdag (inkl KD).' + '\n' + '\n' + '¯\_(ツ)_/¯': {
+         'hour': '16',
+         'nlikes': 3468,
+         'nreplies': 686,
+         'nretweets': 435,
+         'topic': '',
+         'sentiment': ''}
+ }

  # Classify the TOPICS and insert the results into the data dictionary found above
  topic_results = TextClassifier.classify_topics(data_dict)

  sentiment_results = TextClassifier.classify_sentiments(data_dict)
  # Print simple statistics related to TOPICS and SENTIMENTS
  TextClassifier.print_stats(sentiment_results)
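
For reference, a sketch of how a single data_dict entry ends up looking once classify_topics() and classify_sentiments() have filled in the empty fields. The key names follow the code above; the 'topic' and 'sentiment' values are illustrative only, since the real values depend on the model's completion:

# Hypothetical classified entry; only the key names come from the code above.
example_entry = {
    'hour': '17',
    'nlikes': 7878,
    'nreplies': 2999,
    'nretweets': 1993,
    'topic': ['Politics', 'Hong Kong', 'Foreign policy'],  # list built by cleanup_topic_results()
    'sentiment': ' Statements'  # raw completion text stored by classify_sentiments()
}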