import openai import regex as re from twitterscraper import TwitterScraper from datetime import date import os from dotenv import find_dotenv, load_dotenv dotenv_path = find_dotenv() load_dotenv(dotenv_path) OPENAI_AUTHTOKEN = os.environ.get("OPENAI_AUTHTOKEN") class TextClassifier: def __init__(self, model_name="text-davinci-002", from_date='2022-01-01', to_date=str(date.today()), user_name='jimmieakesson', num_tweets=20): """ Initializes the TextClassifier. :param model_name: name of the model from openai. :param from_date: string of the format 'YYYY-MM-DD'. :param to_date: string of the format 'YYYY-MM-DD'. :param num_tweets: integer value of the maximum number of tweets to be scraped. """ # Make sure to_date is later than from_date assert from_date < to_date, "from_date must be earlier than to_date" # Make sure the dates are in the correct format assert re.match(r'^\d{4}-\d{2}-\d{2}$', from_date) is not None, "from_date must be in the format YYYY-MM-DD" # Make sure user_name is not empty assert user_name is not None, "user_name cannot be empty" # Make sure num_tweets is a positive integer assert num_tweets > 0, "num_tweets must be a positive integer" self.model_name = model_name self.from_date = from_date self.to_date = to_date self.num_tweets = num_tweets self.user_name = user_name self.ts = TwitterScraper.TwitterScraper(from_date, to_date, num_tweets) self.df = self.ts.scrape_by_user(user_name) # self.api_key = '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' openai.api_key = OPENAI_AUTHTOKEN def scrape_tweets(self): """ Scrapes tweets from the given date range. """ self.ts.scrape_tweets() @staticmethod def cleanup_sentiment_results(classification_unclean): """ Cleans up the results of the sentiment classification. :param classification_unclean: string of the classification result. :return: cleaned up string. """ classification_clean = classification_unclean.replace('\n\n', "") classification_clean = classification_clean.replace('\n', "") if classification_clean.startswith(" "): classification_clean = classification_clean.replace(" ", "") return classification_clean def classify_sentiment(self, text: str): """ Classifies the sentiment of a text. """ assert isinstance(text, str) prompt_string = "Classify one sentiment for this tweet:\n \"" prompt_string += text prompt_string += "\" \nFor example:\nSupport,\nOpposition,\nCriticism,\nPraise,\nDisagreement," \ "\nAgreement,\nSkepticism,\nAdmiration,\nAnecdotes,\nJokes,\nMemes,\nSarcasm,\nSatire," \ "\nQuestions,\nStatements,\nOpinions,\nPredictions.\nSENTIMENT=" response = openai.Completion.create( model=self.model_name, prompt=prompt_string, temperature=0.0, max_tokens=256, top_p=1, frequency_penalty=0, presence_penalty=0, logprobs=5 ) classification_unclean = response.choices[0]['text'] classification_clean = self.cleanup_sentiment_results(classification_unclean) return classification_clean.lower() def classify_sentiment_of_tweets(self): """ Classifies the sentiment of a user's tweets. """ df_sentiment = self.df.copy() df_sentiment['sentiment'] = df_sentiment['tweet'].apply(self.classify_sentiment) self.df = df_sentiment return self.df def analyze_sentiment(self, text: str, sentiment: str): # TODO: fix prompt before running this method """ Analyzes the sentiment of a text using OpenAI. :param text: string of the tweet text. :param sentiment: :return: """ # assert 1 == 2, "Måste fixa prompt innan denna metod körs" prompt_string = "Who is the TARGET of this " prompt_string += sentiment prompt_string += " TWEET?\\nTWEET=\"" prompt_string += text prompt_string += "\"\\n.TARGET should consist of less than 5 words.\\nTARGET=" response = openai.Completion.create( model=self.model_name, prompt=prompt_string, temperature=0, max_tokens=256, top_p=1, frequency_penalty=0, presence_penalty=0 ) analyzed_sentiment = response.choices[0]['text'] # Remove spaces at the start/end of the response if analyzed_sentiment.startswith(' '): analyzed_sentiment = analyzed_sentiment[1:] if analyzed_sentiment.endswith(' '): analyzed_sentiment = analyzed_sentiment[:-1] # Sometimes GPT-3 gives faulty results, so a simple filter is introduced # If the prediction is bad # -> set target value to N/A (not applicable) if len(analyzed_sentiment) > 50: analyzed_sentiment = "N/A" # An attempt to merge target responses that should be the same analyzed_sentiment = re.sub("\(", "", analyzed_sentiment) analyzed_sentiment = re.sub("\)", "", analyzed_sentiment) s_list = ["s", "the swedish social democratic party"] m_list = ["m", "the swedish moderate party", "the moderate party"] mp_list = ["mp", "the swedish green party"] if analyzed_sentiment.lower() == "v": analyzed_sentiment = "Vänsterpartiet" elif analyzed_sentiment.lower() == "mp": analyzed_sentiment = "Miljöpartiet" elif analyzed_sentiment.lower() in s_list: analyzed_sentiment = "Socialdemokraterna" elif analyzed_sentiment.lower() == "c": analyzed_sentiment = "Centerpartiet" elif analyzed_sentiment.lower() == "l": analyzed_sentiment = "Liberalerna" elif analyzed_sentiment.lower() == "kd": analyzed_sentiment = "Kristdemokraterna" elif analyzed_sentiment.lower() in m_list: analyzed_sentiment = "Moderaterna" elif analyzed_sentiment.lower() == "sd": analyzed_sentiment = "Sverigedemokraterna" elif analyzed_sentiment.lower() == "the swedish government": analyzed_sentiment = "Regeringen" return analyzed_sentiment def analyze_sentiment_of_tweets(self): """ Analyzes the sentiment of a user's tweets. """ # check if 'sentiment' column exists, raise exception if not assert 'sentiment' in self.df.columns, \ "'sentiment' column does not exist. Please run classify_sentiment_of_tweets first." df_sentiment = self.df.copy() df_sentiment['target'] = df_sentiment.apply(lambda row: self.analyze_sentiment(row['tweet'], row['sentiment']), axis=1) self.df = df_sentiment return self.df def classify_topic(self, text: str): """ Classifies the topics of a text. :param text: string of the tweet text. """ assert isinstance(text, str) prompt_string = "Classify one topic for this tweet:\n \"" prompt_string += text prompt_string += "\" \nFor example:\nEconomy,\nEnvironment,\nHealth,\nPolitics,\nScience,\nSports,\nTechnology," \ "\nTransportation,\nWorld.\nTOPIC=" response = openai.Completion.create( model=self.model_name, prompt=prompt_string, temperature=0, max_tokens=892, top_p=1, frequency_penalty=0, presence_penalty=0, ) classification_unclean = response.choices[0]['text'] classification_clean = self.cleanup_topic_results(classification_unclean) return classification_clean.lower() def classify_topics_of_tweets(self): """ Classifies the topics of a user's tweets. """ df_topic = self.df df_topic['topic'] = df_topic['tweet'].apply(self.classify_topic) return df_topic @staticmethod def cleanup_topic_results(text): new_item = text.replace("\n", " ") new_item = new_item.replace(" ", " ") return new_item def __repr__(self): """ Gives a string that describes which user is classified :return: """ return "Classifier for user: " + self.user_name + " with model: " + self.model_name + "." # if __name__ == "__main__": # import pandas as pd # from datetime import datetime # import os # # show all columns # pd.set_option('display.max_columns', None) # # tc = TextClassifier(from_date="2019-01-01", to_date="2019-05-31", user_name='jimmieakesson', num_tweets=20) # tc.classify_sentiment_of_tweets() # # df = tc.analyze_sentiment_of_tweets() # # print(df) # df = tc.classify_topics_of_tweets() # print(df) # # save to csv in a folder under politweet with timestamp in name # df.to_csv(f"{datetime.now().strftime('%Y-%m-%d %H-%M-%S')}_tweets.csv")