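"""
TextClassifier scrapes tweets from a list of Swedish politicians, asks a GPT-3 completion model to label each tweet
with (topic, subtopic, sentiment, target), and then maps the free-text labels onto a predefined set of topics/targets
using sentence-transformer embeddings and cosine similarity. Results are appended to data/twitterdata.csv.
"""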
import os
import time
import warnings
import openai
import pandas as pd
from dotenv import find_dotenv, load_dotenv
from pandas.core.common import SettingWithCopyWarning
from twitterscraper import TwitterScraper
from sentence_transformers import SentenceTransformer
from scipy import spatial
from datetime import date, timedelta

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Set one directory up into ROOT_PATH
ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

dotenv_path = find_dotenv()
load_dotenv(dotenv_path)
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")


class TextClassifier:
    def __init__(self, model_name="text-davinci-002", from_date='2022-01-01', to_date=str(date.today()),
                 user_list=['jimmieakesson'], num_tweets=20):
        """

        Initializes the TextClassifier.

        :param model_name: name of the model from openai.

        :param from_date: string of the format 'YYYY-MM-DD'.

        :param to_date: string of the format 'YYYY-MM-DD'.

        :param num_tweets: integer value of the maximum number of tweets to be scraped.

        """
        # Make sure user_list is not empty
        assert user_list is not None, "user_list cannot be empty"

        self.ts = TwitterScraper.TwitterScraper(from_date, to_date, num_tweets)
        self.model_name = model_name
        self.from_date = from_date
        self.to_date = to_date
        self.num_tweets = num_tweets
        self.user_name = user_list
        # Assure that scrape_by_user actually gets num_tweets
        # add timer in time-loop and stop after 10 seconds
        # self.df = self.ts.scrape_by_user(user_name)
        self.df = self.ts.scrape_by_several_users(user_list)

        # If present, cast the 'id' column to int64
        if 'id' in self.df.columns:
            self.df.loc[:, 'id'] = self.df['id'].copy().apply(int)
        openai.api_key = OPENAI_API_KEY

    def classify_all(self, tweet: str):
        """

        Classifies the topic, subtopic, sentiment and target of a user's tweets.

        """

        valid_tweet = len(tweet.split()) > 4
        if valid_tweet:
            openai.api_key = os.getenv("OPENAI_API_KEY")
            promptstring = "Decide a Tweet's political TOPIC and SUBTOPIC, without classifying it as 'politics'. Also " \
                           "decide whether a political Tweet's " \
                           "SENTIMENT is " \
                           "positive, " \
                           "negative or neutral. Also give the TARGET of the sentiment. \nGive the answer in the form ' (" \
                           "TOPIC, SUBTOPIC, SENTIMENT, TARGET)'\n\nTweet: {} \nAnswer:  ".format(tweet)
            response = openai.Completion.create(
                model="text-davinci-002",
                prompt=promptstring,
                temperature=0,
                max_tokens=30,
                top_p=1,
                frequency_penalty=0.5,
                presence_penalty=0
            )
            classification_unclean = response.choices[0]['text']
            classification_clean = self.cleanup_topic_results(classification_unclean)
            if classification_clean.lower() == "(topic, subtopic, sentiment, target)":
                classification_clean = "(none, none, none, none)"
        else:
            classification_clean = "(none, none, none, none)"
        return classification_clean.lower()

    def classify_all_list(self):
        """

        Classifies the topics of a user's tweets.

        """
        df_topic = self.df.copy()
        df_topic['class_tuple'] = df_topic['tweet'].apply(self.classify_all)
        self.df = df_topic
        self.split_tuple_into_columns()
        return self.df

    @staticmethod
    def cleanup_topic_results(text):
        """

        Cleanup response from GPT-3 to a string matching the format: "(main_topic, sub_topic, sentiment, target)"

        :param text: GPT-3 response

        :return: A string on the format: "(main_topic, sub_topic, sentiment, target)"

        """
        new_item = text.strip()
        new_item = new_item.replace("\n", "")
        new_item = new_item.replace("  ", "")
        item_control = new_item.replace("(", "")
        item_control = item_control.replace(")", "")
        item_control = item_control.split(",")
        # Strip whitespace and replace empty classifications with 'none'
        item_control = [s.strip() if not (s == ' ' or s == '') else 'none' for s in item_control]
        diff = 4 - len(item_control)
        if diff < 0:  # If the response gave more than four predictions
            cutout = item_control[diff - 1:]  # Cut out the superfluous predictions
            item_control = item_control[:diff - 1]  # Save the rest
            new_s = ""
            for i in range(len(cutout)):
                new_s += cutout[i]
                if i < -diff:
                    new_s += " and "  # Merge superfluous predictions, e.g. target = 's', 'mp', 'v' -> target = 's and mp and v'
            item_control.append(new_s)
        elif diff > 0:  # If the response gave fewer than four predictions
            for i in range(diff):
                item_control.append("none")  # Fill out the tuple with nones
        new_item = str(tuple(item_control))
        new_item = new_item.replace("'", "")
        return new_item

    def df_to_csv(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
        """

        Writes pandas df to csv file. If it already exists, it appends. If not, it creates. It also removes duplicates.

        :param filename:

        :return:

        """
        if not os.path.exists(filename):
            self.df.to_csv(filename, index=False)
        else:
            self.df.to_csv(filename, mode='a', header=False, index=False)

        self.remove_duplicates_from_csv(filename)

    @staticmethod
    def remove_duplicates_from_csv(filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
        """

        Removes duplicates from csv file.

        :param filename: filename of csv file

        :return: None

        """
        with open(filename, 'r', encoding="utf8") as f:
            lines = f.readlines()
        with open(filename, 'w', encoding="utf8") as f:
            seen = set()
            for line in lines:
                # Write each distinct line once, keeping the first occurrence
                if line not in seen:
                    seen.add(line)
                    f.write(line)

    def remove_already_classified_tweets(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
        """

        Removes tweets that have already been classified.

        :param filename: filename of csv file

        :return: None

        """
        df = self.df
        df = df[df['sentiment'].isnull()]
        self.df = df
        self.df_to_csv(filename)

    def split_tuple_into_columns(self):
        """

        Splits the topics (topic, subtopic, sentiment, target) into columns.

        :return: None

        """
        df_topic = self.df.copy()
        df_topic['topics_temp'] = df_topic['class_tuple'].apply(lambda x: tuple(x[1:-1].split(",")))
        df_topic_split = pd.DataFrame(df_topic['topics_temp'].tolist(),
                                      columns=['main_topic', 'sub_topic', 'sentiment', 'target'])
        # Manually add columns to self.df
        self.df['main_topic'] = df_topic_split['main_topic'].tolist()
        self.df['main_topic'] = self.df['main_topic'].replace(["n/a", "", " "], "none", regex=True)
        self.df['main_topic'] = self.df['main_topic'].apply(
            lambda x: x.strip() if not (len(x) == 1 and x == "-") else "none")

        self.df['sub_topic'] = df_topic_split['sub_topic'].tolist()
        # In a few of the outputs from GPT-3 the sub_topic = "sentiment"
        self.df['sub_topic'] = self.df['sub_topic'].replace(["n/a", "sentiment", "", " "], "none", regex=True)
        self.df['sub_topic'] = self.df['sub_topic'].apply(
            lambda x: x.strip() if not (len(x) == 1 and x == "-") else "none")

        self.df['sentiment'] = df_topic_split['sentiment'].tolist()
        self.df['sentiment'] = self.df['sentiment'].replace(["n/a", "sentiment", "", " "], "none", regex=True)
        self.df['sentiment'] = self.df['sentiment'].apply(
            lambda x: x.strip() if not (len(x) == 1 and x == "-") else "none")

        self.df['target'] = df_topic_split['target'].tolist()
        self.df['target'] = self.df['target'].replace(["n/a", "", " "], "none", regex=True)
        self.df['target'] = self.df['target'].apply(lambda x: x.strip() if not (len(x) == 1 and x == "-") else "none")

        self.df.fillna('none', inplace=True)

    def get_dataframe(self):
        """

        Returns the dataframe.

        :return: dataframe

        """
        return self.df

    def __repr__(self):
        """

        Gives a string that describes which user is classified

        :return:

        """
        return "Classifier for user: " + self.user_name + " with model: " + self.model_name + "."

    def get_database(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
        """

        Returns the database containing all dataframes.

        :param filename: filename of csv file

        :return:

        """
        db = pd.read_csv(filename)
        return db

    def cleanup_list(self, uncleaned_list):
        """

        Cleans up faulty predictions.

        :param uncleaned_list: the list to be cleaned

        :return: cleaned list

        """
        uncleaned_list = [s if not isinstance(s, float) else "none" for s in uncleaned_list]
        uncleaned_list = [s if not len(s.split()) > 5 else "none" for s in uncleaned_list]
        uncleaned_list = [s if not "swedish" in s else s.replace("swedish", " ") for s in uncleaned_list]
        uncleaned_list = [s if not "politics" in s else s.replace("politics", "none") for s in uncleaned_list]
        uncleaned_list = [s.replace("  ", " ") for s in uncleaned_list]
        cleaned_list = [s.strip() for s in uncleaned_list]
        return cleaned_list

    def merge_lists(self, main_topic_list, sub_topic_list):
        """

        Merges the topic lists. If either topic is a faulty classification, only the non-faulty topic wil be used.

        If both are faulty, the merged topic will be labeled as faulty (ERROR_496).

        :param main_topic_list: A list containing main topics

        :param sub_topic_list: A list containing sub topics

        :return: A list containing string items on the form "main_topic and sub_topic"

        """
        new_list = []
        main_topic_list = self.clean_party_names(main_topic_list)
        sub_topic_list = self.clean_party_names(sub_topic_list)
        for i in range(len(main_topic_list)):
            if main_topic_list[i].lower() == "none" and sub_topic_list[
                i].lower() == "none":  # If the predictions are faulty
                new_list.append("ERROR_496")  # Label as ERROR_496 (faulty prediction)
            elif main_topic_list[i].lower() == "none":
                new_list.append(sub_topic_list[i])
            elif sub_topic_list[i].lower() == "none":
                new_list.append(main_topic_list[i])
            else:
                new_list.append(main_topic_list[i] + " and " + sub_topic_list[i])
        return new_list

    def file_to_mat(self, classification_type):
        """

        Converts a synonym textfile to a matrix in which the rows contain a general topic/target and its related words.

        :param classification_type: The type of classification: topic or target

        :return: a matrix in which the first element of each row is a general topic/target, and the rest are words related to

        the topic

        """
        filename = "{}/data/".format(ROOT_PATH)
        filename += classification_type + "_synonyms.txt"
        with open(filename, encoding='utf-8') as f:
            lines = f.read()
        lines = lines.split("\n")

        topic_list = []
        temp_list = []

        for topic in lines:
            if not topic.endswith("####"):
                temp_list.append(topic)
            else:
                temp_list.append(topic[:-4])  # Remove the marker (####)
                topic_list.append(temp_list)
                temp_list = []

        return topic_list

    def mat_to_list(self, mat):
        """

        Converts a matrix from file_to_mat() into one list containing all topics and synonyms, and one list with

        mappings for the synonyms.

        :param mat: a matrix from file_to_mat()

        :return:

        """
        full_list = []
        mapped_synonyms = []
        for syns in mat:
            for topic in syns:
                full_list.append(topic)
                mapped_synonyms.append(syns[0])
        return full_list, mapped_synonyms

    def clean_party_names(self, old_topic_list):
        """

        Encodes all party names to sentences that will yield a high cosine similarity value when merged with another

        topic, without taking the actual party name into account. These sentences have deliberately been composed such

        that they pose a low risk of being close (in the sentence embedding-space) to any possible merged topic or

        target that may be encountered.

        :param old_topic_list: list of topics

        :return: list of encoded topics

        """
        # Problem 1: When a party name is encountered, we want to bias the merging towards that party since the
        # occurrence of a very general main topic (as in the example below) plus a party name as subtopic is frequent.
        # Example: main_topic = "politics", sub_topic = "sweden democrats" ->
        # combined_topics = "politics and sweden democrats"
        # Problem 2: The party names themselves are biased towards certain topics/targets and lead to faulty merges.
        # Example: Variations of words such as "Sweden" as the target/topic will be biased towards getting merged with
        # "Sweden Democrats".
        # Solution: Encode party names with sentences that are HIGHLY unlikely to be close to anything in the embedding
        # space and thus enforcing a strong bias in the cosine similarity computation towards the party if encountered.
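        # Example: "sweden democrats" -> "keyboard can hire the yellow elephant in cosmos";
        # reset_party_names() later maps such a sentence back to the party abbreviation ("sd").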

        party_names = {}
        party_names["m"] = "parrot computer is swimming as screen time"
        party_names["moderaterna"] = "parrot computer is swimming as screen time"
        party_names["moderates"] = "parrot computer is swimming as screen time"
        party_names["the moderates"] = "parrot computer is swimming as screen time"
        party_names["moderate party"] = "parrot computer is swimming as screen time"
        party_names["the moderate party"] = "parrot computer is swimming as screen time"
        party_names["the moderaterna party"] = "parrot computer is swimming as screen time"

        party_names["sd"] = "keyboard can hire the yellow elephant in cosmos"
        party_names["sverigedemokraterna"] = "keyboard can hire the yellow elephant in cosmos"
        party_names["sweden democrats"] = "keyboard can hire the yellow elephant in cosmos"
        party_names["the sweden democrats"] = "keyboard can hire the yellow elephant in cosmos"
        party_names["the swedish democrats"] = "keyboard can hire the yellow elephant in cosmos"
        party_names["swedish democrats"] = "keyboard can hire the yellow elephant in cosmos"
        party_names["@jimmieakesson"] = "keyboard can hire the yellow elephant in cosmos"

        party_names["l"] = "red weather jokes with music and the mathematician"
        party_names["liberalerna"] = "red weather jokes with music and the mathematician"
        party_names["liberals"] = "red weather jokes with music and the mathematician"
        party_names["the liberals"] = "red weather jokes with music and the mathematician"
        party_names["the liberal party"] = "red weather jokes with music and the mathematician"
        party_names["liberal people's party"] = "red weather jokes with music and the mathematician"
        party_names["@johanpehrson"] = "red weather jokes with music and the mathematician"

        party_names["mp"] = "ice piano flies with pencil as direction"
        party_names["miljöpartiet"] = "ice piano flies with pencil as direction"
        party_names["de gröna"] = "ice piano flies with pencil as direction"
        party_names["green party"] = "ice piano flies with pencil as direction"
        party_names["the green party"] = "ice piano flies with pencil as direction"
        party_names["miljopartiet"] = "ice piano flies with pencil as direction"
        party_names["@bolund"] = "ice piano flies with pencil as direction"
        party_names["@martastenevi"] = "ice piano flies with pencil as direction"

        party_names["s"] = "lamp of fire walks bird gladly tomorrow"
        party_names["socialdemokraterna"] = "lamp of fire walks bird gladly tomorrow"
        party_names["social democratic party"] = "lamp of fire walks bird gladly tomorrow"
        party_names["the social democratic party"] = "lamp of fire walks bird gladly tomorrow"
        party_names["social democrats"] = "lamp of fire walks bird gladly tomorrow"
        party_names["the social democrats"] = "lamp of fire walks bird gladly tomorrow"
        party_names["sosse"] = "lamp of fire walks bird gladly tomorrow"
        party_names["sossen"] = "lamp of fire walks bird gladly tomorrow"
        party_names["sossar"] = "lamp of fire walks bird gladly tomorrow"
        party_names["sossarna"] = "lamp of fire walks bird gladly tomorrow"
        party_names["sossarnas"] = "lamp of fire walks bird gladly tomorrow"
        party_names["swedish social democrats"] = "lamp of fire walks bird gladly tomorrow"
        party_names["@swedishpm"] = "lamp of fire walks bird gladly tomorrow"

        party_names["v"] = "rooftop cats play physics with cardboard fire"
        party_names["vänsterpartiet"] = "rooftop cats play physics with cardboard fire"
        party_names["left party"] = "rooftop cats play physics with cardboard fire"
        party_names["the left party"] = "rooftop cats play physics with cardboard fire"
        party_names["@dadgostarnooshi"] = "rooftop cats play physics with cardboard fire"

        party_names["c"] = "differential donuts program sunny waters"
        party_names["centerpartiet"] = "differential donuts program sunny waters"
        party_names["center party"] = "differential donuts program sunny waters"
        party_names["centre party"] = "differential donuts program sunny waters"
        party_names["the center party"] = "differential donuts program sunny waters"
        party_names["@annieloof"] = "differential donuts program sunny waters"

        party_names["kd"] = "cauchy-riemann met sunglasses after rolling yellow"
        party_names["kristdemokraterna"] = "cauchy-riemann met sunglasses after rolling yellow"
        party_names["christian democrats"] = "cauchy-riemann met sunglasses after rolling yellow"
        party_names["the christian democrats"] = "cauchy-riemann met sunglasses after rolling yellow"
        party_names["@buschebba"] = "cauchy-riemann met sunglasses after rolling yellow"

        for i, topic in enumerate(old_topic_list):
            topic = topic.lower()
            topic = topic.replace("  ", " ")
            topic = topic.strip()
            if topic in party_names:
                old_topic_list[i] = party_names.get(topic)

        return old_topic_list

    def reset_party_names(self, old_topic_list):
        """

        Decodes the encoded party names.

        :param old_topic_list: list of topics

        :return: list of encoded topics

        """
        party_names = {}
        party_names["m"] = "parrot computer is swimming as screen time"
        party_names["sd"] = "keyboard can hire the yellow elephant in cosmos"
        party_names["l"] = "red weather jokes with music and the mathematician"
        party_names["mp"] = "ice piano flies with pencil as direction"
        party_names["s"] = "lamp of fire walks bird gladly tomorrow"
        party_names["v"] = "rooftop cats play physics with cardboard fire"
        party_names["c"] = "differential donuts program sunny waters"
        party_names["kd"] = "cauchy-riemann met sunglasses after rolling yellow"
        inverted_dict = {}
        # Invert dictionary
        for k, v in party_names.items():
            if v not in inverted_dict:
                inverted_dict[v] = k
        # Update values in old_topic_list
        for i, topic in enumerate(old_topic_list):
            if topic in inverted_dict.keys():
                old_topic_list[i] = inverted_dict.get(topic)

        return old_topic_list

    def merge_classifications(self, old_list, classification_type):
        """

        Merges topics/targets from GPT-3 according to a list of predefined topics/targets.

        :param old_list: list of the topics/targets to be merged

        :param classification_type: type of classifications: topic or target

        :return: list of new topics/targets

        """
        # Get the tuple of lists containing all synonyms and general topics/targets
        tup_list = self.mat_to_list(self.file_to_mat(classification_type))
        # Save list of synonyms
        synonym_list = tup_list[0]
        # Save list of mappings between synonym and general topic/target
        synonym_mappings = tup_list[1]
        # Load embedding model-names
        model_list = ['sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', 'all-MiniLM-L6-v2']
        result_dict = {}
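        # result_dict maps each old classification to a list of candidate tuples, one per embedding model:
        # (general topic/target, cosine similarity, matched synonym)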
        # Encode party names
        old_list = self.clean_party_names(old_list)
        for model_name in model_list:
            model = SentenceTransformer(model_name)
            # Encode the topics/targets with the sentence transformer model
            old_list_embeddings = model.encode(old_list, batch_size=64, show_progress_bar=True,
                                               convert_to_tensor=True)
            # Encode the synonyms with the sentence transformer model
            synonym_list_embeddings = model.encode(synonym_list, batch_size=64, show_progress_bar=True,
                                                   convert_to_tensor=True)
            for i, embedded_classification in enumerate(old_list_embeddings):
                result_list = []
                for embedded_synonyms in synonym_list_embeddings:
                    # Compute the cosine similarity between every classification and synonym
                    result = 1 - spatial.distance.cosine(embedded_classification, embedded_synonyms)
                    result_list.append(result)
                max_value = max(result_list)
                max_index = result_list.index(max_value)
                old_classification = old_list[i]
                # Extract the general topic/target
                new_classification = synonym_mappings[max_index]
                # Save the topic/target that yielded the highest cosine similarity value
                if old_classification not in result_dict:
                    result_dict[old_classification] = [(new_classification, max_value, synonym_list[max_index])]
                # The classification already has results from the first model
                else:
                    # Append the results from the next model
                    result_dict[old_classification].append((new_classification, max_value, synonym_list[max_index]))

        new_dict = {}
        # Time to replace the old values with the new ones
        for old_values in result_dict:
            tup_list = result_dict[old_values]
            max_tup = max(tup_list, key=lambda item: item[1])
            if classification_type == "topic":
                limit = 0.4
            else:
                limit = 0.75
            # Discard classification if the old topic/target is not similar to anything in our synonym lists
            if max_tup[1] < limit:
                max_tup = ("ERROR_9000", "{:.2f}".format(round(max_tup[1], 2)), "none")
            else:
                max_tup = (max_tup[0], "{:.2f}".format(round(max_tup[1], 2)), max_tup[2])
            new_classification = max_tup
            if old_values not in new_dict:
                new_dict[old_values] = new_classification
        new_list = []
        for old_value in old_list:
            new_list.append(new_dict[old_value])
        return new_list

    def merge_all(self):
        """

        Merges main+subtopics, targets, and updates the dataframe.

        :param df:

        :return:

        """
        df_topics = self.df.copy()

        sub_topics = df_topics['sub_topic']
        sub_topics = sub_topics.tolist()
        sub_topics = self.cleanup_list(sub_topics)

        main_topics = df_topics['main_topic']
        main_topics = main_topics.tolist()
        main_topics = self.cleanup_list(main_topics)

        merged_topic_list = self.merge_lists(main_topics, sub_topics)

        targets = df_topics['target']
        targets = targets.tolist()
        targets = self.cleanup_list(targets)

        merged_topics = self.merge_classifications(merged_topic_list, "topic")
        merged_targets = self.merge_classifications(targets, "target")

        print("The following merges were made: ")
        for i, top in enumerate(merged_topic_list):
            print("TOPICS: ", top, " -> ", merged_topics[i])

        t_list = []
        for i in range(len(merged_topics)):
            t_list.append(tuple(merged_topics[i]) + tuple(merged_targets[i]))
        merged_tuples = t_list
        df_topics['merged_tuple'] = merged_tuples

        df = self.split_merged_tuple_into_columns(df_topics)
        print("Merging finished...")
        self.df = df

    def split_merged_tuple_into_columns(self, df):
        """

        Splits the merged tuple (merged topic, merged target) into columns.

        :return: None

        """
        df_topic = df.copy()
        df_topic_split = pd.DataFrame(df_topic['merged_tuple'].tolist(),
                                      columns=['merged_topic', 'cos_sim_topic', 'synonym_topic', 'merged_target',
                                               'cos_sim_target', 'synonym_target'])
        self.df['merged_tuple'] = df_topic['merged_tuple'].tolist()
        # Manually add columns to self.df
        self.df['merged_topic'] = df_topic_split['merged_topic'].tolist()
        self.df['cos_sim_topic'] = df_topic_split['cos_sim_topic'].tolist()
        self.df['synonym_topic'] = self.reset_party_names(df_topic_split['synonym_topic'].tolist())
        self.df['merged_target'] = df_topic_split['merged_target'].tolist()
        self.df['cos_sim_target'] = df_topic_split['cos_sim_target'].tolist()
        self.df['synonym_target'] = self.reset_party_names(df_topic_split['synonym_target'].tolist())

        return self.df

    def run_main_pipeline(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
        """

        Classifies the topics/sentiments of a user's tweets.

        #We presume that all tweets inside the twitterdata.csv file are already classified.

        :return: None

        """
        # Check if the csv file exists; if so, skip tweets that have already been classified
        if os.path.exists(filename):
            # Fetch already classified tweets from the csv file
            already_classified_df = pd.read_csv(filename, on_bad_lines='skip')
            print("Already classified tweets: {}".format(already_classified_df.shape[0]))
            # Keep the rows of already_classified_df whose ids also appear in self.df (re-attached after classification)
            temp_df = already_classified_df[already_classified_df['id'].isin(self.df['id'])]
            # Remove rows from self.df that are already present in already_classified_df
            self.df = self.df[~self.df['id'].isin(already_classified_df['id'])]
            # Only classify non-empty rows
            if self.df.shape[0] > 0:
                time.sleep(10)
                print("Classifying topic, subtopic, sentiment and target of {} tweets...".format(self.df.shape[0]))
                self.df = self.classify_all_list()
                # Replace empty or whitespace-only cells with 'none'
                self.df = self.df.replace(r'^\s*$', 'none', regex=True)
                print("Merging topics...")
                self.merge_all()
                print("Writing to csv...")
                self.df_to_csv(filename)
                # Concatenate temp_df and self.df
                self.df = pd.concat([temp_df, self.df], ignore_index=True)
                print("Appended {}.".format(filename))
                return None
            else:
                self.df = pd.concat([temp_df, self.df], ignore_index=True)
                print("No new tweets to classify.")
                return None
        else:
            print("No csv file found. Continuing without removing already classified tweets.")
            print("Classifying topic, subtopic, sentiment and target of {} tweets...".format(self.df.shape[0]))
            self.df = self.classify_all_list()
            # Replace empty or whitespace-only cells with 'none'
            self.df = self.df.replace(r'^\s*$', 'none', regex=True)
            print("Merging topics...")
            self.merge_all()
            print("Writing to csv file...")
            self.df_to_csv(filename)
            print("Created {}.".format(filename))
            return None


if __name__ == "__main__":
    # $6.39 @ 3431 tweets
    # $18.00 @ 4608 tweets
    # $11.61 to classify 1177 tweets ~ $0.01 / tweet

    # This code snippet allows for scraping and classifying by simply specifying a start and end date.
    USER_LIST = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi', 'SwedishPM',
                 'dadgostarnooshi']
    start_date = date(2022, 8, 4)
    end_date = date(2022, 8, 4)
    delta = timedelta(days=1)
    while start_date <= end_date:
        from_date = start_date.strftime("%Y-%m-%d")
        start_date += delta
        to_date = start_date.strftime("%Y-%m-%d")
        print("curr_date: ", from_date)
        tc = TextClassifier(from_date=from_date, to_date=to_date, user_list=USER_LIST, num_tweets=6000)
        tc.run_main_pipeline()
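
    # Illustrative single-user run (assumed example; uses the same setup as above):
    # tc = TextClassifier(from_date="2022-08-01", to_date="2022-08-02", user_list=["SwedishPM"], num_tweets=100)
    # tc.run_main_pipeline()
    # print(tc.get_dataframe()[['tweet', 'main_topic', 'sentiment', 'target']].head())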