import os
import time
import warnings
from datetime import date, timedelta

import openai
import pandas as pd
from dotenv import find_dotenv, load_dotenv
from pandas.core.common import SettingWithCopyWarning
from scipy import spatial
from sentence_transformers import SentenceTransformer
from twitterscraper import TwitterScraper

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Resolve ROOT_PATH to the directory one level above this file
ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

class TextClassifier:
    def __init__(self, model_name="text-davinci-002", from_date='2022-01-01', to_date=str(date.today()),
                 user_list=['jimmieakesson'],
                 num_tweets=20):
        """
        Initializes the TextClassifier.
        :param model_name: name of the model from openai.
        :param from_date: string of the format 'YYYY-MM-DD'.
        :param to_date: string of the format 'YYYY-MM-DD'.
        :param user_list: list of Twitter user names to scrape.
        :param num_tweets: integer value of the maximum number of tweets to be scraped.
        """
        # Make sure user_list is not empty
        assert user_list is not None, "user_list cannot be empty"
        self.ts = TwitterScraper.TwitterScraper(from_date, to_date, num_tweets)
        self.model_name = model_name
        self.from_date = from_date
        self.to_date = to_date
        self.num_tweets = num_tweets
        self.user_name = user_list
        # TODO: make sure scrape_by_user actually gets num_tweets;
        # add a timer in the time loop and stop after 10 seconds
        # self.df = self.ts.scrape_by_user(user_name)
        self.df = self.ts.scrape_by_several_users(user_list)
        # If 'id' is present in self.df, cast it to int64
        if 'id' in self.df.columns:
            self.df.loc[:, 'id'] = self.df['id'].copy().apply(lambda x: int(x))
        openai.api_key = OPENAI_API_KEY
    def classify_all(self, tweet: str):
        """
        Classifies the topic, subtopic, sentiment and target of a user's tweet.
        """
        # Only classify tweets longer than four words
        valid_tweet = len(tweet.split()) > 4
        if valid_tweet:
            promptstring = "Decide a Tweet's political TOPIC and SUBTOPIC, without classifying it as 'politics'. " \
                           "Also decide whether a political Tweet's SENTIMENT is positive, negative or neutral. " \
                           "Also give the TARGET of the sentiment. \nGive the answer in the form " \
                           "' (TOPIC, SUBTOPIC, SENTIMENT, TARGET)'\n\nTweet: {} \nAnswer: ".format(tweet)
            response = openai.Completion.create(
                model=self.model_name,  # defaults to "text-davinci-002"
                prompt=promptstring,
                temperature=0,
                max_tokens=30,
                top_p=1,
                frequency_penalty=0.5,
                presence_penalty=0
            )
            classification_unclean = response.choices[0]['text']
            classification_clean = self.cleanup_topic_results(classification_unclean)
            # If the model only echoed the template, treat it as an empty classification
            if classification_clean.lower() == "(topic, subtopic, sentiment, target)":
                classification_clean = "(none, none, none, none)"
        else:
            classification_clean = "(none, none, none, none)"
        return classification_clean.lower()
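    # A minimal usage sketch (hypothetical tweet text; the actual labels depend on the
    # GPT-3 completion): classify_all("The government must lower the fuel tax now") would
    # be expected to return a lowercased string such as
    # "(economy, fuel taxes, negative, the government)".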
    def classify_all_list(self):
        """
        Classifies the topic, subtopic, sentiment and target of every tweet in the dataframe.
        """
        df_topic = self.df.copy()
        df_topic['class_tuple'] = df_topic['tweet'].apply(self.classify_all)
        self.df = df_topic
        self.split_tuple_into_columns()
        return self.df
    def cleanup_topic_results(self, text):
        """
        Cleans up a response from GPT-3 into a string matching the format: "(main_topic, sub_topic, sentiment, target)"
        :param text: GPT-3 response
        :return: A string on the format: "(main_topic, sub_topic, sentiment, target)"
        """
        new_item = text.strip()
        new_item = new_item.replace("\n", "")
        new_item = new_item.replace("  ", " ")
        item_control = new_item.replace("(", "")
        item_control = item_control.replace(")", "")
        item_control = item_control.split(",")
        # Strip whitespace and replace empty classifications with 'none'
        item_control = [s.strip() if s.strip() else 'none' for s in item_control]
        diff = 4 - len(item_control)
        if diff < 0:  # If the response gave more than four predictions
            cutout = item_control[diff - 1:]  # Cut out the superfluous predictions
            item_control = item_control[:diff - 1]  # Save the rest
            new_s = ""
            for i in range(len(cutout)):
                new_s += cutout[i]
                if i < -diff:
                    # Merge superfluous predictions, e.g. target = 's', 'mp', 'v' -> target = 's and mp and v'
                    new_s += " and "
            item_control.append(new_s)
        elif diff > 0:  # If the response gave fewer than four predictions
            for i in range(diff):
                item_control.append("none")  # Fill out the tuple with nones
        new_item = str(tuple(item_control))
        new_item = new_item.replace("'", "")
        return new_item
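    # A minimal sketch of the cleanup behaviour (traced from the code above):
    #   cleanup_topic_results(" (Economy, Taxes)")  -> "(Economy, Taxes, none, none)"
    #   cleanup_topic_results("(a, b, c, d, e)")    -> "(a, b, c, d and e)"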
    def df_to_csv(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
        """
        Writes the pandas df to a csv file. If the file already exists, it appends; if not, it creates it.
        It also removes duplicates.
        :param filename: path of the csv file
        :return: None
        """
        if not os.path.exists(filename):
            self.df.to_csv(filename, index=False)
        else:
            self.df.to_csv(filename, mode='a', header=False, index=False)
        self.remove_duplicates_from_csv(filename)
    def remove_duplicates_from_csv(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
        """
        Removes duplicate lines from the csv file.
        :param filename: filename of csv file
        :return: None
        """
        with open(filename, 'r', encoding="utf8") as f:
            lines = f.readlines()
        # Keep the first occurrence of every line; a set makes this O(n) and avoids the
        # pitfall of list.index() always returning the first match, which dropped every
        # copy of a duplicated line instead of keeping one
        seen = set()
        with open(filename, 'w', encoding="utf8") as f:
            for line in lines:
                if line not in seen:
                    f.write(line)
                    seen.add(line)
    def remove_already_classified_tweets(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
        """
        Removes tweets that have already been classified.
        :param filename: filename of csv file
        :return: None
        """
        df = self.df
        df = df[df['sentiment'].isnull()]
        self.df = df
        self.df_to_csv(filename)
    def split_tuple_into_columns(self):
        """
        Splits the classification tuple (topic, subtopic, sentiment, target) into columns.
        :return: None
        """
        df_topic = self.df.copy()
        df_topic['topics_temp'] = df_topic['class_tuple'].apply(lambda x: tuple(x[1:-1].split(",")))
        df_topic_split = pd.DataFrame(df_topic['topics_temp'].tolist(),
                                      columns=['main_topic', 'sub_topic', 'sentiment', 'target'])
        # Manually add the columns to self.df; cells that are exactly "n/a", empty or a
        # single space are replaced with "none" (exact-cell match, hence no regex=True)
        self.df['main_topic'] = df_topic_split['main_topic'].tolist()
        self.df['main_topic'] = self.df['main_topic'].replace(["n/a", "", " "], "none")
        self.df['main_topic'] = self.df['main_topic'].apply(lambda x: x.strip() if x != "-" else "none")
        self.df['sub_topic'] = df_topic_split['sub_topic'].tolist()
        # In a few of the outputs from GPT-3 the sub_topic is literally "sentiment"
        self.df['sub_topic'] = self.df['sub_topic'].replace(["n/a", "sentiment", "", " "], "none")
        self.df['sub_topic'] = self.df['sub_topic'].apply(lambda x: x.strip() if x != "-" else "none")
        self.df['sentiment'] = df_topic_split['sentiment'].tolist()
        self.df['sentiment'] = self.df['sentiment'].replace(["n/a", "sentiment", "", " "], "none")
        self.df['sentiment'] = self.df['sentiment'].apply(lambda x: x.strip() if x != "-" else "none")
        self.df['target'] = df_topic_split['target'].tolist()
        self.df['target'] = self.df['target'].replace(["n/a", "", " "], "none")
        self.df['target'] = self.df['target'].apply(lambda x: x.strip() if x != "-" else "none")
        self.df.fillna('none', inplace=True)
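    # Example (traced from the code above): a class_tuple of "(economy, taxes, negative, s)"
    # yields main_topic="economy", sub_topic="taxes", sentiment="negative", target="s";
    # elements after the first keep a leading space until the strip above removes it.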
    def get_dataframe(self):
        """
        Returns the dataframe.
        :return: dataframe
        """
        return self.df

    def __repr__(self):
        """
        Gives a string that describes which users are classified.
        :return: a descriptive string
        """
        return "Classifier for users: " + ", ".join(self.user_name) + " with model: " + self.model_name + "."

    def get_database(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
        """
        Returns the database containing all dataframes.
        :param filename: filename of csv file
        :return: the full database as a dataframe
        """
        db = pd.read_csv(filename)
        return db
    def cleanup_list(self, uncleaned_list):
        """
        Cleans up faulty predictions.
        :param uncleaned_list: the list to be cleaned
        :return: cleaned list
        """
        uncleaned_list = [s if not isinstance(s, float) else "none" for s in uncleaned_list]
        uncleaned_list = [s if not len(s.split()) > 5 else "none" for s in uncleaned_list]
        uncleaned_list = [s if "swedish" not in s else s.replace("swedish", " ") for s in uncleaned_list]
        uncleaned_list = [s if "politics" not in s else s.replace("politics", "none") for s in uncleaned_list]
        uncleaned_list = [s.replace("  ", " ") for s in uncleaned_list]
        cleaned_list = [s.strip() for s in uncleaned_list]
        return cleaned_list
    def merge_lists(self, main_topic_list, sub_topic_list):
        """
        Merges the topic lists. If either topic is a faulty classification, only the non-faulty topic will be used.
        If both are faulty, the merged topic will be labeled as faulty (ERROR_496).
        :param main_topic_list: A list containing main topics
        :param sub_topic_list: A list containing sub topics
        :return: A list containing string items on the form "main_topic and sub_topic"
        """
        new_list = []
        main_topic_list = self.clean_party_names(main_topic_list)
        sub_topic_list = self.clean_party_names(sub_topic_list)
        for i in range(len(main_topic_list)):
            # If both predictions are faulty, label the merge as ERROR_496 (faulty prediction)
            if main_topic_list[i].lower() == "none" and sub_topic_list[i].lower() == "none":
                new_list.append("ERROR_496")
            elif main_topic_list[i].lower() == "none":
                new_list.append(sub_topic_list[i])
            elif sub_topic_list[i].lower() == "none":
                new_list.append(main_topic_list[i])
            else:
                new_list.append(main_topic_list[i] + " and " + sub_topic_list[i])
        return new_list
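    # Example (traced from the code above, ignoring the party-name encoding):
    #   merge_lists(["economy", "none", "none"], ["taxes", "climate", "none"])
    #   -> ["economy and taxes", "climate", "ERROR_496"]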
    def file_to_mat(self, classification_type):
        """
        Converts a synonym text file to a matrix in which each row contains a general topic/target and its related
        words.
        :param classification_type: The type of classification: topic or target
        :return: a matrix in which the first element of each row is a general topic/target, and the rest are words
        related to the topic
        """
        filename = "{}/data/".format(ROOT_PATH)
        filename += classification_type + "_synonyms.txt"
        with open(filename, encoding='utf-8') as f:
            lines = f.read()
        lines = lines.split("\n")
        topic_list = []
        temp_list = []
        for topic in lines:
            if not topic.endswith("####"):
                temp_list.append(topic)
            else:
                temp_list.append(topic[:-4])  # Remove the end-of-row marker (####)
                topic_list.append(temp_list)
                temp_list = []
        return topic_list
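    # A sketch of the assumed layout of <classification_type>_synonyms.txt (illustrative
    # contents, not taken from the real file): each row starts with a general topic,
    # followed by its synonyms, and "####" closes the row:
    #   economy
    #   taxes
    #   inflation####
    # file_to_mat would parse the above into [["economy", "taxes", "inflation"]].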
    def mat_to_list(self, mat):
        """
        Converts a matrix from file_to_mat() into one list containing all topics and synonyms, and one list with
        mappings for the synonyms.
        :param mat: a matrix from file_to_mat()
        :return: a tuple (full_list, mapped_synonyms)
        """
        full_list = []
        mapped_synonyms = []
        for syns in mat:
            for topic in syns:
                full_list.append(topic)
                mapped_synonyms.append(syns[0])
        return full_list, mapped_synonyms
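    # Example (traced from the code above): mat_to_list([["economy", "taxes"]])
    # -> (["economy", "taxes"], ["economy", "economy"]), i.e. every synonym maps back
    # to the first element of its row.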
    def clean_party_names(self, old_topic_list):
        """
        Encodes all party names as sentences that will yield a high cosine similarity value when merged with another
        topic, without taking the actual party name into account. These sentences have deliberately been composed such
        that they pose a low risk of being close (in the sentence embedding space) to any possible merged topic or
        target that may be encountered.
        :param old_topic_list: list of topics
        :return: list of encoded topics
        """
        # Problem 1: When a party name is encountered, we want to bias the merging towards that party, since a very
        # general main topic (as in the example below) combined with a party name as subtopic is frequent.
        # Example: main_topic = "politics", sub_topic = "sweden democrats" ->
        # combined_topics = "politics and sweden democrats"
        # Problem 2: The party names themselves are biased towards certain topics/targets and lead to faulty merges.
        # Example: Variations of words such as "Sweden" as the target/topic will be biased towards getting merged with
        # "Sweden Democrats".
        # Solution: Encode party names as sentences that are HIGHLY unlikely to be close to anything in the embedding
        # space, thus enforcing a strong bias in the cosine similarity computation towards the party if encountered.
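        # Example (traced from the mapping below): clean_party_names(["sd", "economy"])
        # -> ["keyboard can hire the yellow elephant in cosmos", "economy"]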
        # Nonsense encoding sentences, one per party
        m_enc = "parrot computer is swimming as screen time"
        sd_enc = "keyboard can hire the yellow elephant in cosmos"
        l_enc = "red weather jokes with music and the mathematician"
        mp_enc = "ice piano flies with pencil as direction"
        s_enc = "lamp of fire walks bird gladly tomorrow"
        v_enc = "rooftop cats play physics with cardboard fire"
        c_enc = "differential donuts program sunny waters"
        kd_enc = "cauchy-riemann met sunglasses after rolling yellow"
        party_names = {
            "m": m_enc, "moderaterna": m_enc, "moderates": m_enc, "the moderates": m_enc,
            "moderate party": m_enc, "the moderate party": m_enc, "the moderaterna party": m_enc,
            "sd": sd_enc, "sverigedemokraterna": sd_enc, "sweden democrats": sd_enc,
            "the sweden democrats": sd_enc, "the swedish democrats": sd_enc,
            "swedish democrats": sd_enc, "@jimmieakesson": sd_enc,
            "l": l_enc, "liberalerna": l_enc, "liberals": l_enc, "the liberals": l_enc,
            "the liberal party": l_enc, "liberal people's party": l_enc, "@johanpehrson": l_enc,
            "mp": mp_enc, "miljöpartiet": mp_enc, "de gröna": mp_enc, "green party": mp_enc,
            "the green party": mp_enc, "miljopartiet": mp_enc, "@bolund": mp_enc, "@martastenevi": mp_enc,
            "s": s_enc, "socialdemokraterna": s_enc, "social democratic party": s_enc,
            "the social democratic party": s_enc, "social democrats": s_enc, "the social democrats": s_enc,
            "sosse": s_enc, "sossen": s_enc, "sossar": s_enc, "sossarna": s_enc, "sossarnas": s_enc,
            "swedish social democrats": s_enc, "@swedishpm": s_enc,
            "v": v_enc, "vänsterpartiet": v_enc, "left party": v_enc, "the left party": v_enc,
            "@dadgostarnooshi": v_enc,
            "c": c_enc, "centerpartiet": c_enc, "center party": c_enc, "centre party": c_enc,
            "the center party": c_enc, "@annieloof": c_enc,
            "kd": kd_enc, "kristdemokraterna": kd_enc, "christian democrats": kd_enc,
            "the christian democrats": kd_enc, "@buschebba": kd_enc,
        }
        for i, topic in enumerate(old_topic_list):
            topic = topic.lower()
            topic = topic.replace("  ", " ")
            topic = topic.strip()
            if topic in party_names:
                old_topic_list[i] = party_names.get(topic)
        return old_topic_list
    def reset_party_names(self, old_topic_list):
        """
        Decodes the encoded party names.
        :param old_topic_list: list of topics
        :return: list of decoded topics
        """
        party_names = {
            "m": "parrot computer is swimming as screen time",
            "sd": "keyboard can hire the yellow elephant in cosmos",
            "l": "red weather jokes with music and the mathematician",
            "mp": "ice piano flies with pencil as direction",
            "s": "lamp of fire walks bird gladly tomorrow",
            "v": "rooftop cats play physics with cardboard fire",
            "c": "differential donuts program sunny waters",
            "kd": "cauchy-riemann met sunglasses after rolling yellow",
        }
        # Invert the dictionary so that each encoding sentence maps back to its party abbreviation
        inverted_dict = {v: k for k, v in party_names.items()}
        # Update the values in old_topic_list
        for i, topic in enumerate(old_topic_list):
            if topic in inverted_dict:
                old_topic_list[i] = inverted_dict.get(topic)
        return old_topic_list
    def merge_classifications(self, old_list, classification_type):
        """
        Merges topics/targets from GPT-3 according to a list of predefined topics/targets.
        :param old_list: list of the topics/targets to be merged
        :param classification_type: type of classification: topic or target
        :return: list of new topics/targets
        """
        # Get the tuple of lists containing all synonyms and general topics/targets
        tup_list = self.mat_to_list(self.file_to_mat(classification_type))
        # Save the list of synonyms
        synonym_list = tup_list[0]
        # Save the list of mappings between synonym and general topic/target
        synonym_mappings = tup_list[1]
        # Load the embedding model names
        model_list = ['sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', 'all-MiniLM-L6-v2']
        result_dict = {}
        # Encode party names
        old_list = self.clean_party_names(old_list)
        for model_name in model_list:
            model = SentenceTransformer(model_name)
            # Encode the topics/targets with the sentence transformer model
            old_list_embeddings = model.encode(old_list, batch_size=64, show_progress_bar=True,
                                               convert_to_tensor=True)
            # Encode the synonyms with the sentence transformer model
            synonym_list_embeddings = model.encode(synonym_list, batch_size=64, show_progress_bar=True,
                                                   convert_to_tensor=True)
            for i, embedded_classification in enumerate(old_list_embeddings):
                result_list = []
                for embedded_synonym in synonym_list_embeddings:
                    # Compute the cosine similarity between every classification and synonym
                    result = 1 - spatial.distance.cosine(embedded_classification, embedded_synonym)
                    result_list.append(result)
                max_value = max(result_list)
                max_index = result_list.index(max_value)
                old_classification = old_list[i]
                # Extract the general topic/target
                new_classification = synonym_mappings[max_index]
                # Save the topic/target that yielded the highest cosine similarity value
                if old_classification not in result_dict:
                    result_dict[old_classification] = [(new_classification, max_value, synonym_list[max_index])]
                else:
                    # Append the results from the next transformer model
                    result_dict[old_classification].append((new_classification, max_value, synonym_list[max_index]))
        new_dict = {}
        # Replace the old values with the new ones
        for old_values in result_dict:
            tup_list = result_dict[old_values]
            max_tup = max(tup_list, key=lambda item: item[1])
            if classification_type == "topic":
                limit = 0.4
            else:
                limit = 0.75
            # Discard the classification if the old topic/target is not similar to anything in our synonym lists
            if max_tup[1] < limit:
                max_tup = ("ERROR_9000", "{:.2f}".format(round(max_tup[1], 2)), "none")
            else:
                max_tup = (max_tup[0], "{:.2f}".format(round(max_tup[1], 2)), max_tup[2])
            new_classification = max_tup
            if old_values not in new_dict:
                new_dict[old_values] = new_classification
        new_list = []
        for old_value in old_list:
            new_list.append(new_dict[old_value])
        return new_list
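    # Example of the output format (illustrative values; assumes "taxes" appears as a
    # synonym under "economy" in topic_synonyms.txt): merge_classifications(["high taxes"],
    # "topic") could return [("economy", "0.83", "taxes")], while a best match below the
    # similarity limit yields ("ERROR_9000", "<cos_sim>", "none").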
    def merge_all(self):
        """
        Merges main topics and subtopics as well as targets, and updates the dataframe.
        :return: None
        """
        df_topics = self.df.copy()
        sub_topics = df_topics['sub_topic'].tolist()
        sub_topics = self.cleanup_list(sub_topics)
        main_topics = df_topics['main_topic'].tolist()
        main_topics = self.cleanup_list(main_topics)
        merged_topic_list = self.merge_lists(main_topics, sub_topics)
        targets = df_topics['target'].tolist()
        targets = self.cleanup_list(targets)
        merged_topics = self.merge_classifications(merged_topic_list, "topic")
        merged_targets = self.merge_classifications(targets, "target")
        print("The following merges were made: ")
        for i, top in enumerate(merged_topic_list):
            print("TOPICS: ", top, " -> ", merged_topics[i])
        # Concatenate the (topic, cos_sim, synonym) and (target, cos_sim, synonym) tuples
        t_list = []
        for i in range(len(merged_topics)):
            t_list.append(tuple(merged_topics[i]) + tuple(merged_targets[i]))
        merged_tuples = t_list
        df_topics['merged_tuple'] = merged_tuples
        df = self.split_merged_tuple_into_columns(df_topics)
        print("Merging finished...")
        self.df = df
    def split_merged_tuple_into_columns(self, df):
        """
        Splits the merged tuple (merged topic, merged target) into columns.
        :param df: dataframe containing a 'merged_tuple' column
        :return: the updated dataframe
        """
        df_topic = df.copy()
        df_topic_split = pd.DataFrame(df_topic['merged_tuple'].tolist(),
                                      columns=['merged_topic', 'cos_sim_topic', 'synonym_topic', 'merged_target',
                                               'cos_sim_target', 'synonym_target'])
        self.df['merged_tuple'] = df_topic['merged_tuple'].tolist()
        # Manually add the columns to self.df
        self.df['merged_topic'] = df_topic_split['merged_topic'].tolist()
        self.df['cos_sim_topic'] = df_topic_split['cos_sim_topic'].tolist()
        self.df['synonym_topic'] = self.reset_party_names(df_topic_split['synonym_topic'].tolist())
        self.df['merged_target'] = df_topic_split['merged_target'].tolist()
        self.df['cos_sim_target'] = df_topic_split['cos_sim_target'].tolist()
        self.df['synonym_target'] = self.reset_party_names(df_topic_split['synonym_target'].tolist())
        return self.df
    def run_main_pipeline(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
        """
        Classifies the topics/sentiments of a user's tweets.
        We presume that all tweets already inside the twitterdata.csv file are classified.
        :return: None
        """
        # Check if the file exists; if not, create it
        if os.path.exists(filename):
            # Fetch tweets from the csv file
            already_classified_df = pd.read_csv(filename, on_bad_lines='skip')
            print("Already classified tweets: {}".format(already_classified_df.shape[0]))
            # Create a temporary df holding the rows from already_classified_df whose ids appear in self.df
            temp_df = already_classified_df[already_classified_df['id'].isin(self.df['id'])]
            # Remove rows from self.df that have already been classified
            self.df = self.df[~self.df['id'].isin(already_classified_df['id'])]
            # Only classify if there are any unclassified rows left
            if self.df.shape[0] > 0:
                time.sleep(10)  # Short pause, presumably to avoid hammering the API
                print("Classifying topic, subtopic, sentiment and target of {} tweets...".format(self.df.shape[0]))
                self.df = self.classify_all_list()
                # Replace cells that are entirely empty or a single space with 'none'
                self.df = self.df.replace({'': 'none', ' ': 'none'})
                print("Merging topics...")
                self.merge_all()
                print("Writing to csv...")
                self.df_to_csv(filename)
                # Concatenate temp_df and self.df
                self.df = pd.concat([temp_df, self.df], ignore_index=True)
                print("Appended {}.".format(filename))
                return None
            else:
                self.df = pd.concat([temp_df, self.df], ignore_index=True)
                print("No new tweets to classify.")
                return None
        else:
            print("No csv file found. Continuing without removing already classified tweets.")
            print("Classifying topic, subtopic, sentiment and target of {} tweets...".format(self.df.shape[0]))
            self.df = self.classify_all_list()
            # Replace cells that are entirely empty or a single space with 'none'
            self.df = self.df.replace({'': 'none', ' ': 'none'})
            print("Merging topics...")
            self.merge_all()
            print("Writing to csv file...")
            self.df_to_csv(filename)
            print("Created {}.".format(filename))
            return None

if __name__ == "__main__":
    # Observed classification costs:
    #   $6.39 @ 3431 tweets
    #   $18.00 @ 4608 tweets
    #   $11.61 to classify 1177 tweets ~ $0.01 / tweet
    # This snippet scrapes and classifies day by day between a specified start and end date.
    USER_LIST = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi', 'SwedishPM',
                 'dadgostarnooshi']
    start_date = date(2022, 8, 4)
    end_date = date(2022, 8, 4)
    delta = timedelta(days=1)
    while start_date <= end_date:
        from_date = start_date.strftime("%Y-%m-%d")
        start_date += delta
        to_date = start_date.strftime("%Y-%m-%d")
        print("curr_date: ", from_date)
        tc = TextClassifier(from_date=from_date, to_date=to_date, user_list=USER_LIST, num_tweets=6000)
        tc.run_main_pipeline()