MarcusAscard committed on
Commit
5aaf93b
·
unverified ·
1 Parent(s): 4841d65

Final push

Browse files
Files changed (1) hide show
  1. textclassifier/TextClassifier.py +606 -217
textclassifier/TextClassifier.py CHANGED
@@ -1,217 +1,606 @@
1
- import os
2
- import time
3
- import warnings
4
- from datetime import date
5
-
6
- import openai
7
- import pandas as pd
8
- import regex as re
9
- from dotenv import find_dotenv, load_dotenv
10
- from pandas.core.common import SettingWithCopyWarning
11
-
12
- from twitterscraper import TwitterScraper
13
- from functions import functions as f
14
-
15
- warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
16
-
17
- # Set one directory up into ROOT_PATH
18
- ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
19
-
20
- dotenv_path = find_dotenv()
21
- load_dotenv(dotenv_path)
22
- OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
23
-
24
-
25
- class TextClassifier:
26
- def __init__(self, model_name="text-davinci-002", from_date='2022-01-01', to_date=str(date.today()),
27
-
28
- user_list='jimmieakesson',
29
- num_tweets=20, ):
30
- """
31
- Initializes the TextClassifier.
32
- :param model_name: name of the model from openai.
33
- :param from_date: string of the format 'YYYY-MM-DD'.
34
- :param to_date: string of the format 'YYYY-MM-DD'.
35
- :param num_tweets: integer value of the maximum number of tweets to be scraped.
36
- """
37
- # Make sure user_name is not empty
38
- assert user_list is not None, "user_name cannot be empty"
39
-
40
- self.ts = TwitterScraper.TwitterScraper(from_date, to_date, num_tweets)
41
- self.model_name = model_name
42
- self.from_date = from_date
43
- self.to_date = to_date
44
- self.num_tweets = num_tweets
45
- self.user_name = user_list
46
- # Assure that scrape_by_user actually gets num_tweets
47
- # add timer in time-loop and stop after 10 seconds
48
- start_time = time.time()
49
- while True:
50
- self.df = self.ts.scrape_by_user(user_list)
51
- if num_tweets-5 < len(self.df) <= num_tweets:
52
- break
53
- else:
54
- if time.time() - start_time > 15:
55
- raise Exception("Could not get enough tweets. Please try again. Perhaps try different time range.")
56
- continue
57
- # Make id as type int64
58
- self.df.loc[:, 'id'] = self.df.id.copy().apply(lambda x: int(x))
59
- # self.api_key = '<REDACTED>'  (a hard-coded OpenAI API key was committed here; revoke it and rely on the OPENAI_API_KEY environment variable instead)
60
- openai.api_key = OPENAI_API_KEY
61
-
62
- def classify_all(self, tweet: str):
63
- """
64
- Classifies the topic, subtopic, sentiment and target of a user's tweets.
65
- """
66
- import os
67
- import openai
68
-
69
- openai.api_key = os.getenv("OPENAI_API_KEY")
70
- promptstring = "Decide a Tweet's political TOPIC and SUBTOPIC, without classifying it as 'politics'. Also " \
71
- "decide whether a political Tweet's " \
72
- "SENTIMENT is " \
73
- "positive, " \
74
- "negative or neutral. Also give the TARGET of the sentiment. \nGive the answer in the form ' (" \
75
- "TOPIC, SUBTOPIC, SENTIMENT, TARGET)'\n\nTweet: {} \nAnswer: ".format(tweet)
76
- response = openai.Completion.create(
77
- model="text-davinci-002",
78
- prompt=promptstring,
79
- temperature=0,
80
- max_tokens=30,
81
- top_p=1,
82
- frequency_penalty=0.5,
83
- presence_penalty=0
84
- )
85
- classification_unclean = response.choices[0]['text']
86
- classification_clean = self.cleanup_topic_results(classification_unclean)
87
-
88
- return classification_clean.lower()
89
-
90
- def classify_all_list(self):
91
- """
92
- Classifies the topics of a user's tweets.
93
- """
94
- df_topic = self.df.copy()
95
- df_topic['class_tuple'] = df_topic['tweet'].apply(self.classify_all)
96
- self.df = df_topic
97
- self.split_tuple_into_columns()
98
- return self.df
99
-
100
- @staticmethod
101
- def cleanup_topic_results(text):
102
- new_item = text.strip()
103
- new_item = new_item.replace("\n", "")
104
- new_item = new_item.replace(" ", "")
105
- return new_item
106
-
107
- def df_to_csv(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
108
- """
109
- Writes pandas df to csv file. If it already exists, it appends. If not, it creates. It also removes duplicates.
110
- :param filename:
111
- :return:
112
- """
113
- if not os.path.exists(filename):
114
- self.df.to_csv(filename, index=False)
115
- else:
116
- self.df.to_csv(filename, mode='a', header=False, index=False)
117
-
118
- self.remove_duplicates_from_csv(filename)
119
-
120
- @staticmethod
121
- def remove_duplicates_from_csv(filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
122
- """
123
- Removes duplicates from csv file.
124
- :param filename: filename of csv file
125
- :return: None
126
- """
127
- with open(filename, 'r') as f:
128
- lines = f.readlines()
129
- with open(filename, 'w') as f:
130
- for line in lines:
131
- if line not in lines[lines.index(line) + 1:]:
132
- f.write(line)
133
-
134
- def remove_already_classified_tweets(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
135
- """
136
- Removes tweets that have already been classified.
137
- :param filename: filename of csv file
138
- :return: None
139
- """
140
- df = self.df
141
- df = df[df['sentiment'].isnull()]
142
- self.df = df
143
- self.df_to_csv(filename)
144
-
145
- def split_tuple_into_columns(self):
146
- """
147
- Splits the topics (topic, subtopic, sentiment, target) into columns.
148
- :return: None
149
- """
150
- df_topic = self.df.copy()
151
- df_topic['topics_temp'] = df_topic['class_tuple'].apply(f.convert_to_tuple)
152
- df_topic_split = pd.DataFrame(df_topic['topics_temp'].tolist(),
153
- columns=['main_topic', 'sub_topic', 'sentiment', 'target'])
154
-
155
- # Manually add columns to self.df
156
- self.df['main_topic'] = df_topic_split['main_topic'].astype(str)
157
- self.df['sub_topic'] = df_topic_split['sub_topic'].astype(str)
158
- self.df['sentiment'] = df_topic_split['sentiment'].astype(str)
159
- self.df['target'] = df_topic_split['target'].astype(str)
160
-
161
- def run_main_pipeline(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
162
- """
163
- Classifies the topics/sentiments of a user's tweets.
164
- #We presume that all tweets inside the twitterdata.csv file are already classified.
165
- :return: None
166
- """
167
- # Check if file exists, if not, create it
168
- if os.path.exists(filename):
169
- # Fetch tweets from csv file
170
- already_classified_df = pd.read_csv(filename, on_bad_lines='skip')
171
- print("Already classified tweets: {}".format(already_classified_df.shape[0]))
172
- # Create a temporary df where values from already_classified_df that are not it self.df are stored
173
- temp_df = already_classified_df[already_classified_df['id'].isin(self.df['id'])]
174
- # Remove rows from self.df that are not in already_classified_df
175
- self.df = self.df[~self.df['id'].isin(already_classified_df['id'])]
176
- # Only classify non-empty rows
177
- if self.df.shape[0] > 0:
178
- print("Classifying topic, subtopic, sentiment and target of {} tweets...".format(self.df.shape[0]))
179
- self.df = self.classify_all_list()
180
- print("Writing to csv...")
181
- self.df_to_csv(filename)
182
- # Concatenate temp_df and self.df
183
- self.df = pd.concat([temp_df, self.df], ignore_index=True)
184
- print("Appended {}.".format(filename))
185
- return None
186
- else:
187
- self.df = pd.concat([temp_df, self.df], ignore_index=True)
188
- print("No new tweets to classify.")
189
- return None
190
- else:
191
- print("No csv file found. Continuing without removing already classified tweets.")
192
- print("Classifying topic, subtopic, sentiment and target of {} tweets...".format(self.df.shape[0]))
193
- self.df = self.classify_all_list()
194
- print("Writing to csv file...")
195
- self.df_to_csv(filename)
196
- print("Created {}.".format(filename))
197
- return None
198
-
199
- def get_dataframe(self):
200
- """
201
- Returns the dataframe.
202
- :return: dataframe
203
- """
204
- return self.df
205
-
206
- def __repr__(self):
207
- """
208
- Gives a string that describes which user is classified
209
- :return:
210
- """
211
- return "Classifier for user: " + self.user_name + " with model: " + self.model_name + "."
212
-
213
-
214
- if __name__ == "__main__":
215
- text_classifier = TextClassifier(from_date='2020-01-01', to_date="2022-07-15", user_list=['jimmieakesson'],
216
- num_tweets=60)
217
- text_classifier.run_main_pipeline()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import warnings
4
+ import openai
5
+ import pandas as pd
6
+ from dotenv import find_dotenv, load_dotenv
7
+ from pandas.core.common import SettingWithCopyWarning
8
+ from twitterscraper import TwitterScraper
9
+ from sentence_transformers import SentenceTransformer
10
+ from scipy import spatial
11
+ from datetime import date, timedelta
12
+
13
# Silence pandas' SettingWithCopyWarning for the whole process.
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# ROOT_PATH is the directory one level above the folder that contains this file.
ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Load the .env file located by find_dotenv() and read the OpenAI key from the environment.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
21
+
22
+
23
+ class TextClassifier:
24
def __init__(self, model_name="text-davinci-002", from_date='2022-01-01', to_date=None,
             user_list=None,
             num_tweets=20):
    """
    Initializes the TextClassifier and scrapes the tweets to classify.
    :param model_name: name of the model from openai.
    :param from_date: string of the format 'YYYY-MM-DD'.
    :param to_date: string of the format 'YYYY-MM-DD'. None (the default) means today's date,
        evaluated when the classifier is created rather than when the module is imported.
    :param user_list: user names passed to the scraper. None (the default) means ['jimmieakesson'].
    :param num_tweets: integer value of the maximum number of tweets to be scraped.
    :raises AssertionError: if user_list is empty.
    """
    # Resolve the defaults per call: a date default in the signature is frozen at import time
    # (stale in a long-running process) and a list default is shared between all instances.
    if to_date is None:
        to_date = str(date.today())
    if user_list is None:
        user_list = ['jimmieakesson']
    # Explicit raise instead of `assert`, which is stripped when Python runs with -O.
    if not user_list:
        raise AssertionError("user_name cannot be empty")

    self.ts = TwitterScraper.TwitterScraper(from_date, to_date, num_tweets)
    self.model_name = model_name
    self.from_date = from_date
    self.to_date = to_date
    self.num_tweets = num_tweets
    self.user_name = user_list
    self.df = self.ts.scrape_by_several_users(user_list)
    # Make id as type int64 (astype guarantees the dtype; element-wise int() does not).
    self.df['id'] = self.df['id'].astype('int64')
    openai.api_key = OPENAI_API_KEY
50
+
51
def classify_all(self, tweet: str):
    """
    Classifies the topic, subtopic, sentiment and target of a single tweet.
    :param tweet: the tweet text. Non-string values and tweets of four words or fewer are
        not sent to the API.
    :return: lower-case string on the form "(topic, subtopic, sentiment, target)";
        "(none, none, none, none)" when the tweet is skipped or the model merely echoes
        the answer template.
    """
    no_classification = "(none, none, none, none)"
    # Guard first: missing cells (e.g. NaN) have no .split(), and short tweets are skipped
    # without spending an API call.
    if not isinstance(tweet, str) or len(tweet.split()) <= 4:
        return no_classification

    openai.api_key = os.getenv("OPENAI_API_KEY")
    promptstring = (
        "Decide a Tweet's political TOPIC and SUBTOPIC, without classifying it as 'politics'. Also "
        "decide whether a political Tweet's "
        "SENTIMENT is "
        "positive, "
        "negative or neutral. Also give the TARGET of the sentiment. \nGive the answer in the form ' ("
        "TOPIC, SUBTOPIC, SENTIMENT, TARGET)'\n\nTweet: {} \nAnswer: "
    ).format(tweet)
    response = openai.Completion.create(
        # Use the model chosen at construction time instead of a hard-coded name.
        model=self.model_name,
        prompt=promptstring,
        temperature=0,
        max_tokens=30,
        top_p=1,
        frequency_penalty=0.5,
        presence_penalty=0
    )
    classification_unclean = response.choices[0]['text']
    classification_clean = self.cleanup_topic_results(classification_unclean).lower()
    # The model sometimes returns the template itself instead of an answer.
    if classification_clean == "(topic, subtopic, sentiment, target)":
        return no_classification
    return classification_clean
83
+
84
def classify_all_list(self):
    """
    Classifies every tweet in self.df.
    Stores the raw answer of classify_all in a 'class_tuple' column and then lets
    split_tuple_into_columns expand it.
    :return: the updated dataframe (self.df)
    """
    labels = self.df['tweet'].apply(self.classify_all)
    # assign() returns a new frame, so the previous self.df object is left untouched.
    self.df = self.df.assign(class_tuple=labels)
    self.split_tuple_into_columns()
    return self.df
93
+
94
+ @staticmethod
95
+ def cleanup_topic_results(text):
96
+ """
97
+ Cleanup response from GPT-3 to a string matching the format: "(main_topic, sub_topic, sentiment, target)"
98
+ :param text: GPT-3 response
99
+ :return: A string on the format: "(main_topic, sub_topic, sentiment, target)"
100
+ """
101
+ new_item = text.strip()
102
+ new_item = new_item.replace("\n", "")
103
+ new_item = new_item.replace(" ", "")
104
+ item_control = new_item.replace("(", "")
105
+ item_control = item_control.replace(")", "")
106
+ item_control = item_control.split(",")
107
+ if ' ' or '' in item_control:
108
+ item_control = [s.strip() if not (s == ' ' or s == '') else 'none' for s in
109
+ item_control] # Replace empty classifications with 'none'
110
+ diff = 4 - len(item_control)
111
+ if diff < 0: # If response gave more than four predictions
112
+ cutout = item_control[diff - 1:] # Cut out the superflous predictions
113
+ item_control = item_control[:diff - 1] # Save the rest
114
+ new_s = ""
115
+ for i in range(len(cutout)):
116
+ new_s += cutout[i]
117
+ if i < -diff:
118
+ new_s += " and " # Merge superflous predictions. E.g. target = 's', 'mp', 'v' -> target = 's and mp and v'
119
+ item_control.append(new_s)
120
+ elif diff > 0: # If response gave less than four predictions
121
+ for i in range(diff):
122
+ item_control.append("none") # Fill out tuple with nones
123
+ new_item = str(tuple(item_control))
124
+ new_item = new_item.replace("'", "")
125
+ return new_item
126
+
127
def df_to_csv(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
    """
    Stores self.df in a csv file and then removes duplicate lines from that file.
    A new file is written with a header row; an existing file is appended to without one.
    :param filename: filename of csv file
    :return: None
    """
    appending = os.path.exists(filename)
    self.df.to_csv(filename, mode='a' if appending else 'w', header=not appending, index=False)
    self.remove_duplicates_from_csv(filename)
139
+
140
@staticmethod
def remove_duplicates_from_csv(filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
    """
    Removes duplicate lines from csv file, keeping the first occurrence of each line and the
    original line order.
    :param filename: filename of csv file
    :return: None
    """
    with open(filename, 'r', encoding="utf8") as f:
        lines = f.readlines()
    # dict.fromkeys keeps insertion order, so this is an order-preserving de-duplication in
    # O(n). The previous index()-based check dropped *every* copy of a duplicated line.
    # NOTE(review): comparison is per physical line, so a quoted csv field spanning several
    # lines is compared piecewise - confirm tweets cannot contain raw newlines.
    unique_lines = list(dict.fromkeys(lines))
    with open(filename, 'w', encoding="utf8") as f:
        f.writelines(unique_lines)
153
+
154
def remove_already_classified_tweets(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
    """
    Keeps only the rows of self.df whose 'sentiment' is null and writes them to the csv file.
    :param filename: filename of csv file
    :return: None
    """
    not_yet_classified = self.df['sentiment'].isnull()
    self.df = self.df[not_yet_classified]
    self.df_to_csv(filename)
164
+
165
def split_tuple_into_columns(self):
    """
    Splits the 'class_tuple' strings "(topic, subtopic, sentiment, target)" of self.df into the
    columns 'main_topic', 'sub_topic', 'sentiment' and 'target'.
    Each value is stripped, and placeholder answers are replaced with "none".
    Finally every remaining missing value in self.df is filled with 'none'.
    :return: None
    """
    columns = ['main_topic', 'sub_topic', 'sentiment', 'target']
    # Whole-field answers that mean "no usable classification", per column.
    # In a few of the outputs from GPT-3 the sub_topic = "sentiment".
    placeholders = {
        'main_topic': {"n/a", "", "-"},
        'sub_topic': {"n/a", "sentiment", "", "-"},
        'sentiment': {"n/a", "sentiment", "", "-"},
        'target': {"n/a", "", "-"},
    }

    def normalise(value, invalid):
        # Strip first and compare whole values. The previous regex replace substituted
        # substrings, so the leading space left by split(",") became "none" (" taxes" ->
        # "nonetaxes") and spaces inside multi-word answers were rewritten too.
        if not isinstance(value, str):
            return "none"
        value = value.strip()
        return "none" if value in invalid else value

    pieces = self.df['class_tuple'].apply(lambda x: tuple(x[1:-1].split(",")))
    split_df = pd.DataFrame(pieces.tolist(), columns=columns)
    for column in columns:
        # Assign plain lists so the values are placed by position, not by index label.
        self.df[column] = [normalise(value, placeholders[column]) for value in split_df[column].tolist()]

    self.df.fillna('none', inplace=True)
193
+
194
def get_dataframe(self):
    """
    Gives access to the dataframe held by the classifier.
    :return: self.df (the same object, not a copy)
    """
    frame = self.df
    return frame
200
+
201
def __repr__(self):
    """
    Gives a string that describes which user(s) are classified and with which model.
    :return: description string
    """
    users = self.user_name
    # user_name holds the user_list given to __init__, which is a list by default;
    # concatenating it directly to a str raised TypeError.
    if not isinstance(users, str):
        users = ", ".join(str(user) for user in users)
    return "Classifier for user: " + users + " with model: " + self.model_name + "."
207
+
208
def get_database(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
    """
    Reads the csv database file into a dataframe.
    :param filename: filename of csv file
    :return: dataframe with the contents of the csv file
    """
    return pd.read_csv(filename)
216
+
217
def cleanup_list(self, uncleaned_list):
    """
    Cleans up faulty predictions.
    Non-string entries (e.g. NaN) and entries of more than five words become "none".
    In the remaining entries "swedish" is removed, "politics" is replaced by "none", and
    whitespace is collapsed and stripped.
    :param uncleaned_list: the list to be cleaned
    :return: cleaned list (a new list; the input is not modified)
    """
    cleaned_list = []
    for item in uncleaned_list:
        if not isinstance(item, str) or len(item.split()) > 5:
            cleaned_list.append("none")
            continue
        item = item.replace("swedish", " ").replace("politics", "none")
        # Removing "swedish" leaves runs of spaces behind; split/join collapses them and
        # strips both ends (the former replace(" ", " ") did nothing).
        cleaned_list.append(" ".join(item.split()))
    return cleaned_list
230
+
231
def merge_lists(self, main_topic_list, sub_topic_list):
    """
    Combines main topics and sub topics position by position, after both lists have been passed
    through clean_party_names. A topic equal to "none" (case-insensitive) counts as faulty:
    if only one of the pair is faulty the other one is used alone, if both are faulty the
    result is "ERROR_496".
    :param main_topic_list: A list containing main topics
    :param sub_topic_list: A list containing sub topics
    :return: A list containing string items on the form "main_topic and sub_topic"
    """
    main_topic_list = self.clean_party_names(main_topic_list)
    sub_topic_list = self.clean_party_names(sub_topic_list)
    merged = []
    for position, main_topic in enumerate(main_topic_list):
        sub_topic = sub_topic_list[position]
        main_faulty = main_topic.lower() == "none"
        sub_faulty = sub_topic.lower() == "none"
        if main_faulty and sub_faulty:
            merged.append("ERROR_496")  # faulty prediction
        elif main_faulty:
            merged.append(sub_topic)
        elif sub_faulty:
            merged.append(main_topic)
        else:
            merged.append(main_topic + " and " + sub_topic)
    return merged
252
+
253
def file_to_mat(self, classification_type):
    """
    Reads "<ROOT_PATH>/data/<classification_type>_synonyms.txt" and groups its lines.
    A line ending in "####" closes the current group (the marker itself is cut off); lines
    after the last marker are not returned.
    :param classification_type: The type of classification: topic or target
    :return: a list of groups (lists of lines); the first element of each group is used by
        mat_to_list as the general topic/target the other lines map to
    """
    path = "{}/data/".format(ROOT_PATH) + classification_type + "_synonyms.txt"
    with open(path, encoding='utf-8') as handle:
        rows = handle.read().split("\n")

    groups = []
    current = []
    for row in rows:
        if row.endswith("####"):
            current.append(row[:-4])  # drop the "####" marker
            groups.append(current)
            current = []
        else:
            current.append(row)

    return groups
278
+
279
def mat_to_list(self, mat):
    """
    Flattens a matrix from file_to_mat().
    :param mat: a matrix from file_to_mat()
    :return: tuple (full_list, mapped_synonyms): every word of every row in order, and for each
        of those words the first element of the row it came from
    """
    pairs = [(word, row[0]) for row in mat for word in row]
    full_list = [word for word, _ in pairs]
    mapped_synonyms = [head for _, head in pairs]
    return full_list, mapped_synonyms
293
+
294
def clean_party_names(self, old_topic_list):
    """
    Encodes all party names to sentences that will yield a high cosine similarity value when merged with another
    topic, without taking the actual party name into account. These sentences have deliberately been composed such
    that they pose a low risk of being close (in the sentence embedding-space) to any possible merged topic or
    target that may be encountered.
    Matching is case-insensitive and ignores surrounding or repeated whitespace. Entries that
    are not party names (or not strings) are left untouched.
    :param old_topic_list: list of topics; modified in place
    :return: the same list object, with party names replaced by their code sentence
    """
    # Problem 1: When a party name is encountered, we want to bias the merging towards that party since the
    # occurrence of a very general main topic (as in the example below) plus a party name as subtopic is frequent.
    # Example: main_topic = "politics", sub_topic = "sweden democrats" ->
    # combined_topics = "politics and sweden democrats"
    # Problem 2: The party names themselves are biased towards certain topics/targets and lead to faulty merges.
    # Example: Variations of words such as "Sweden" as the target/topic will be biased towards getting merged with
    # "Sweden Democrats".
    # Solution: Encode party names with sentences that are HIGHLY unlikely to be close to anything in the embedding
    # space and thus enforcing a strong bias in the cosine similarity computation towards the party if encountered.

    # One code sentence per party, followed by every alias that maps to it.
    aliases_by_code = {
        "parrot computer is swimming as screen time": (
            "m", "moderaterna", "moderates", "the moderates", "moderate party",
            "the moderate party", "the moderaterna party",
        ),
        "keyboard can hire the yellow elephant in cosmos": (
            "sd", "sverigedemokraterna", "sweden democrats", "the sweden democrats",
            "the swedish democrats", "swedish democrats", "@jimmieakesson",
        ),
        "red weather jokes with music and the mathematician": (
            "l", "liberalerna", "liberals", "the liberals", "the liberal party",
            "liberal people's party", "@johanpehrson",
        ),
        "ice piano flies with pencil as direction": (
            "mp", "miljöpartiet", "de gröna", "green party", "the green party",
            "miljopartiet", "@bolund", "@martastenevi",
        ),
        "lamp of fire walks bird gladly tomorrow": (
            "s", "socialdemokraterna", "social democratic party", "the social democratic party",
            "social democrats", "the social democrats", "sosse", "sossen", "sossar", "sossarna",
            "sossarnas", "swedish social democrats", "@swedishpm",
        ),
        "rooftop cats play physics with cardboard fire": (
            "v", "vänsterpartiet", "left party", "the left party", "@dadgostarnooshi",
        ),
        "differential donuts program sunny waters": (
            "c", "centerpartiet", "center party", "centre party", "the center party",
            "@annieloof",
        ),
        "cauchy-riemann met sunglasses after rolling yellow": (
            "kd", "kristdemokraterna", "christian democrats", "the christian democrats",
            "@buschebba",
        ),
    }
    party_names = {alias: code for code, aliases in aliases_by_code.items() for alias in aliases}

    for i, topic in enumerate(old_topic_list):
        if not isinstance(topic, str):
            continue
        # Lower-case and collapse whitespace so e.g. " The  Green Party " still matches
        # (the former replace(" ", " ") did nothing).
        key = " ".join(topic.lower().split())
        if key in party_names:
            old_topic_list[i] = party_names[key]

    return old_topic_list
388
+
389
def reset_party_names(self, old_topic_list):
    """
    Decodes the code sentences produced by clean_party_names back to party abbreviations.
    Entries that are not a code sentence are left as they are.
    :param old_topic_list: list of topics; modified in place
    :return: the same list object with code sentences replaced by party abbreviations
    """
    abbreviation_by_code = {
        "parrot computer is swimming as screen time": "m",
        "keyboard can hire the yellow elephant in cosmos": "sd",
        "red weather jokes with music and the mathematician": "l",
        "ice piano flies with pencil as direction": "mp",
        "lamp of fire walks bird gladly tomorrow": "s",
        "rooftop cats play physics with cardboard fire": "v",
        "differential donuts program sunny waters": "c",
        "cauchy-riemann met sunglasses after rolling yellow": "kd",
    }
    for position, topic in enumerate(old_topic_list):
        abbreviation = abbreviation_by_code.get(topic)
        if abbreviation is not None:
            old_topic_list[position] = abbreviation

    return old_topic_list
415
+
416
def merge_classifications(self, old_list, classification_type):
    """
    Merges topics/targets from GPT-3 according to a list of predefined topics/targets.
    Every entry of old_list is embedded with two sentence-transformer models and compared
    (cosine similarity) with every synonym from the "<classification_type>_synonyms.txt" file.
    The best match over both models decides the general topic/target.
    :param old_list: list of the topics/targets to be merged (party names in it are encoded in
        place by clean_party_names)
    :param classification_type: type of classifications: topic or target
    :return: list with one tuple per entry of old_list:
        (general topic/target or "ERROR_9000", similarity formatted with two decimals,
        matched synonym or "none")
    """
    # Get the tuple of lists containing all synonyms and general topics/targets
    tup_list = self.mat_to_list(self.file_to_mat(classification_type))
    # Save list of synonyms
    synonym_list = tup_list[0]
    # Save list of mappings between synonym and general topic/target
    synonym_mappings = tup_list[1]
    # Load embedding model-names
    model_list = ['sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', 'all-MiniLM-L6-v2']
    # Maps each old classification to a list of (general topic/target, similarity, synonym)
    # candidates; one candidate is added per model and per occurrence in old_list.
    result_dict = {}
    # Encode party names
    old_list = self.clean_party_names(old_list)
    for model_name in model_list:
        # NOTE(review): the model is instantiated on every call - consider caching if this is slow.
        model = SentenceTransformer(model_name)
        # Encode the topics/targets with the sentence transformer model
        old_list_embeddings = model.encode(old_list, batch_size=64, show_progress_bar=True,
                                           convert_to_tensor=True)
        # Encode the synonyms with the sentence transformer model
        synonym_list_embeddings = model.encode(synonym_list, batch_size=64, show_progress_bar=True,
                                               convert_to_tensor=True)
        for i, embedded_classification in enumerate(old_list_embeddings):
            result_list = []
            for embedded_synonyms in synonym_list_embeddings:
                # Compute the cosine similarity between every classification and synonym
                # NOTE(review): scipy is handed the tensors returned by encode(); presumably
                # this relies on them being convertible to numpy (CPU) - confirm.
                result = 1 - spatial.distance.cosine(embedded_classification, embedded_synonyms)
                result_list.append(result)
            # Best-matching synonym for this classification under the current model
            max_value = max(result_list)
            max_index = result_list.index(max_value)
            old_classification = old_list[i]
            # Extract the general topic/target
            new_classification = synonym_mappings[max_index]
            # Save the topic/target that yielded the highest cosine similarity value
            if old_classification not in result_dict:
                result_dict[old_classification] = [(new_classification, max_value, synonym_list[max_index])]
            # When we have found the best topics/targets after using the first transformer model
            else:
                # Append the results from the next model
                result_dict[old_classification].append((new_classification, max_value, synonym_list[max_index]))

    new_dict = {}
    # Time to replace the old values with the new ones
    for old_values in result_dict:
        tup_list = result_dict[old_values]
        # Pick the candidate with the highest similarity over all models
        max_tup = max(tup_list, key=lambda item: item[1])
        # Minimum similarity required to accept a match: 0.4 for topics, 0.75 otherwise
        if classification_type == "topic":
            limit = 0.4
        else:
            limit = 0.75
        # Discard classification if the old topic/target is not similar to anything in our synonym lists
        if max_tup[1] < limit:
            max_tup = ("ERROR_9000", "{:.2f}".format(round(max_tup[1], 2)), "none")
        else:
            max_tup = (max_tup[0], "{:.2f}".format(round(max_tup[1], 2)), max_tup[2])
        new_classification = max_tup
        if old_values not in new_dict:
            new_dict[old_values] = new_classification
    # Look up the result for every entry, preserving the order (and duplicates) of old_list
    new_list = []
    for old_value in old_list:
        new_list.append(new_dict[old_value])
    return new_list
482
+
483
+ def merge_all(self):
484
+ """
485
+ Merges main+subtopics, targets, and updates the dataframe.
486
+ :param df:
487
+ :return:
488
+ """
489
+ df_topics = self.df.copy()
490
+
491
+ sub_topics = df_topics['sub_topic']
492
+ sub_topics = sub_topics.tolist()
493
+ sub_topics = self.cleanup_list(sub_topics)
494
+
495
+ main_topics = df_topics['main_topic']
496
+ main_topics = main_topics.tolist()
497
+ main_topics = self.cleanup_list(main_topics)
498
+
499
+ merged_topic_list = self.merge_lists(main_topics, sub_topics)
500
+
501
+ targets = df_topics['target']
502
+ targets = targets.tolist()
503
+ targets = self.cleanup_list(targets)
504
+
505
+ merged_topics = self.merge_classifications(merged_topic_list, "topic")
506
+ merged_targets = self.merge_classifications(targets, "target")
507
+
508
+ print("The following merges were made: ")
509
+ for i, top in enumerate(merged_topic_list):
510
+ print("TOPICS: ", top, " -> ", merged_topics[i])
511
+
512
+ t_list = []
513
+ for i in range(len(merged_topics)):
514
+ t_list.append(tuple(merged_topics[i]) + tuple(merged_targets[i]))
515
+ merged_tuples = t_list
516
+ df_topics['merged_tuple'] = merged_tuples
517
+
518
+ df = self.split_merged_tuple_into_columns(df_topics)
519
+ print("Merging finished...")
520
+ self.df = df
521
+
522
+ def split_merged_tuple_into_columns(self, df):
523
+ """
524
+ Splits the merged tuple (merged topic, merged target) into columns.
525
+ :return: None
526
+ """
527
+ df_topic = df.copy()
528
+ df_topic_split = pd.DataFrame(df_topic['merged_tuple'].tolist(),
529
+ columns=['merged_topic', 'cos_sim_topic', 'synonym_topic', 'merged_target', 'cos_sim_target', 'synonym_target'])
530
+ self.df['merged_tuple'] = df_topic['merged_tuple'].tolist()
531
+ # Manually add columns to self.df
532
+ self.df['merged_topic'] = df_topic_split['merged_topic'].tolist()
533
+ self.df['cos_sim_topic'] = df_topic_split['cos_sim_topic'].tolist()
534
+ self.df['synonym_topic'] = self.reset_party_names(df_topic_split['synonym_topic'].tolist())
535
+ self.df['merged_target'] = df_topic_split['merged_target'].tolist()
536
+ self.df['cos_sim_target'] = df_topic_split['cos_sim_target'].tolist()
537
+ self.df['synonym_target'] = self.reset_party_names(df_topic_split['synonym_target'].tolist())
538
+
539
+ return self.df
540
+
541
+ def run_main_pipeline(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
542
+ """
543
+ Classifies the topics/sentiments of a user's tweets.
544
+ #We presume that all tweets inside the twitterdata.csv file are already classified.
545
+ :return: None
546
+ """
547
+ # Check if file exists, if not, create it
548
+ if os.path.exists(filename):
549
+ # Fetch tweets from csv file
550
+ already_classified_df = pd.read_csv(filename, on_bad_lines='skip')
551
+ print("Already classified tweets: {}".format(already_classified_df.shape[0]))
552
+ # Create a temporary df where values from already_classified_df that are not it self.df are stored
553
+ temp_df = already_classified_df[already_classified_df['id'].isin(self.df['id'])]
554
+ # Remove rows from self.df that are not in already_classified_df
555
+ self.df = self.df[~self.df['id'].isin(already_classified_df['id'])]
556
+ # Only classify non-empty rows
557
+ if self.df.shape[0] > 0:
558
+ time.sleep(10)
559
+ print("Classifying topic, subtopic, sentiment and target of {} tweets...".format(self.df.shape[0]))
560
+ self.df = self.classify_all_list()
561
+ self.df = self.df.replace({'': 'none'}, regex=True)
562
+ self.df = self.df.replace({' ': 'none'}, regex=True)
563
+ print("Merging topics...")
564
+ self.merge_all()
565
+ print("Writing to csv...")
566
+ self.df_to_csv(filename)
567
+ # Concatenate temp_df and self.df
568
+ self.df = pd.concat([temp_df, self.df], ignore_index=True)
569
+ print("Appended {}.".format(filename))
570
+ return None
571
+ else:
572
+ self.df = pd.concat([temp_df, self.df], ignore_index=True)
573
+ print("No new tweets to classify.")
574
+ return None
575
+ else:
576
+ print("No csv file found. Continuing without removing already classified tweets.")
577
+ print("Classifying topic, subtopic, sentiment and target of {} tweets...".format(self.df.shape[0]))
578
+ self.df = self.classify_all_list()
579
+ self.df = self.df.replace({'': 'none'}, regex=True)
580
+ self.df = self.df.replace({' ': 'none'}, regex=True)
581
+ print("Merging topics...")
582
+ self.merge_all()
583
+ print("Writing to csv file...")
584
+ self.df_to_csv(filename)
585
+ print("Created {}.".format(filename))
586
+ return None
587
+
588
+
589
if __name__ == "__main__":
    # $6.39 @ 3431 tweets
    # $18.00 @ 4608 tweets
    # $11.61 to classify 1177 tweets ~ $0.01 / tweet

    # Scrape and classify one day at a time for every date from first_day to last_day (inclusive).
    USER_LIST = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi', 'SwedishPM',
                 'dadgostarnooshi']
    first_day = date(2022, 8, 4)
    last_day = date(2022, 8, 4)
    one_day = timedelta(days=1)

    current_day = first_day
    while current_day <= last_day:
        next_day = current_day + one_day
        # Each run covers the window from the current day up to the following day
        from_date = current_day.strftime("%Y-%m-%d")
        to_date = next_day.strftime("%Y-%m-%d")
        print("curr_date: ", from_date)
        tc = TextClassifier(from_date=from_date, to_date=to_date, user_list=USER_LIST, num_tweets=6000)
        tc.run_main_pipeline()
        current_day = next_day