# politweet/textclassifier/TextClassifier.py
import os
import time
import warnings
import openai
import pandas as pd
from dotenv import find_dotenv, load_dotenv
from pandas.core.common import SettingWithCopyWarning
from twitterscraper import TwitterScraper
from sentence_transformers import SentenceTransformer
from scipy import spatial
from datetime import date, timedelta
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
# ROOT_PATH points one directory up from this file
ROOT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
class TextClassifier:
def __init__(self, model_name="text-davinci-002", from_date='2022-01-01', to_date=str(date.today()),
user_list=['jimmieakesson'],
num_tweets=20):
"""
Initializes the TextClassifier.
:param model_name: name of the model from openai.
:param from_date: string of the format 'YYYY-MM-DD'.
:param to_date: string of the format 'YYYY-MM-DD'.
        :param user_list: list of Twitter user names whose tweets will be scraped.
        :param num_tweets: integer value of the maximum number of tweets to be scraped.
"""
        # Make sure user_list is not empty
        assert user_list is not None, "user_list cannot be empty"
self.ts = TwitterScraper.TwitterScraper(from_date, to_date, num_tweets)
self.model_name = model_name
self.from_date = from_date
self.to_date = to_date
self.num_tweets = num_tweets
self.user_name = user_list
        # TODO: ensure that scrape_by_user actually returns num_tweets;
        #  add a timer to the scraping loop and stop after 10 seconds
# self.df = self.ts.scrape_by_user(user_name)
self.df = self.ts.scrape_by_several_users(user_list)
        # Cast the 'id' column to int if it exists
        if 'id' in self.df.columns:
            self.df.loc[:, 'id'] = self.df['id'].apply(int)
openai.api_key = OPENAI_API_KEY
def classify_all(self, tweet: str):
"""
        Classifies the topic, subtopic, sentiment and target of a single tweet.
"""
valid_tweet = len(tweet.split()) > 4
if valid_tweet:
openai.api_key = os.getenv("OPENAI_API_KEY")
promptstring = "Decide a Tweet's political TOPIC and SUBTOPIC, without classifying it as 'politics'. Also " \
"decide whether a political Tweet's " \
"SENTIMENT is " \
"positive, " \
"negative or neutral. Also give the TARGET of the sentiment. \nGive the answer in the form ' (" \
"TOPIC, SUBTOPIC, SENTIMENT, TARGET)'\n\nTweet: {} \nAnswer: ".format(tweet)
response = openai.Completion.create(
model="text-davinci-002",
prompt=promptstring,
temperature=0,
max_tokens=30,
top_p=1,
frequency_penalty=0.5,
presence_penalty=0
)
classification_unclean = response.choices[0]['text']
classification_clean = self.cleanup_topic_results(classification_unclean)
if classification_clean.lower() == "(topic, subtopic, sentiment, target)":
classification_clean = "(none, none, none, none)"
else:
classification_clean = "(none, none, none, none)"
return classification_clean.lower()
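    # Illustrative call (hypothetical tweet and output; the actual classification depends entirely on
    # the completion returned by the OpenAI model):
    #   classify_all("Vi vill sänka skatten på arbete och göra det mer lönsamt att jobba.")
    #   -> "(economy, taxes, positive, workers)"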
def classify_all_list(self):
"""
        Classifies the topic, subtopic, sentiment and target of every tweet in the dataframe.
"""
df_topic = self.df.copy()
df_topic['class_tuple'] = df_topic['tweet'].apply(self.classify_all)
self.df = df_topic
self.split_tuple_into_columns()
return self.df
@staticmethod
def cleanup_topic_results(text):
"""
Cleanup response from GPT-3 to a string matching the format: "(main_topic, sub_topic, sentiment, target)"
:param text: GPT-3 response
:return: A string on the format: "(main_topic, sub_topic, sentiment, target)"
"""
new_item = text.strip()
new_item = new_item.replace("\n", "")
        new_item = new_item.replace("  ", " ")  # collapse double spaces (keep single spaces for multi-word topics)
item_control = new_item.replace("(", "")
item_control = item_control.replace(")", "")
item_control = item_control.split(",")
        # Replace empty or whitespace-only classifications with 'none' and strip the rest
        item_control = ['none' if s.strip() == '' else s.strip() for s in item_control]
        diff = 4 - len(item_control)
        if diff < 0:  # If the response gave more than four predictions
            cutout = item_control[diff - 1:]  # Cut out the superfluous predictions
            item_control = item_control[:diff - 1]  # Keep the rest
            new_s = ""
            for i in range(len(cutout)):
                new_s += cutout[i]
                if i < -diff:
                    # Merge superfluous predictions, e.g. target = 's', 'mp', 'v' -> target = 's and mp and v'
                    new_s += " and "
            item_control.append(new_s)
        elif diff > 0:  # If the response gave fewer than four predictions
            for i in range(diff):
                item_control.append("none")  # Pad the tuple with 'none'
new_item = str(tuple(item_control))
new_item = new_item.replace("'", "")
return new_item
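    # Illustrative examples (assumed GPT-3 responses) of how cleanup_topic_results normalises output:
    #   cleanup_topic_results(" (Economy, Taxes, negative, the government)\n")
    #   -> "(Economy, Taxes, negative, the government)"
    #   cleanup_topic_results("(Economy, Taxes)")   # too few fields are padded with 'none'
    #   -> "(Economy, Taxes, none, none)"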
def df_to_csv(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
"""
Writes pandas df to csv file. If it already exists, it appends. If not, it creates. It also removes duplicates.
:param filename:
:return:
"""
if not os.path.exists(filename):
self.df.to_csv(filename, index=False)
else:
self.df.to_csv(filename, mode='a', header=False, index=False)
self.remove_duplicates_from_csv(filename)
@staticmethod
def remove_duplicates_from_csv(filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
"""
Removes duplicates from csv file.
:param filename: filename of csv file
:return: None
"""
with open(filename, 'r', encoding="utf8") as f:
lines = f.readlines()
        seen = set()
        with open(filename, 'w', encoding="utf8") as f:
            for line in lines:
                # Write each unique line once (keep the first occurrence)
                if line not in seen:
                    seen.add(line)
                    f.write(line)
def remove_already_classified_tweets(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
"""
Removes tweets that have already been classified.
:param filename: filename of csv file
:return: None
"""
df = self.df
df = df[df['sentiment'].isnull()]
self.df = df
self.df_to_csv(filename)
def split_tuple_into_columns(self):
"""
Splits the topics (topic, subtopic, sentiment, target) into columns.
:return: None
"""
df_topic = self.df.copy()
df_topic['topics_temp'] = df_topic['class_tuple'].apply(lambda x: tuple(x[1:-1].split(",")))
df_topic_split = pd.DataFrame(df_topic['topics_temp'].tolist(),
columns=['main_topic', 'sub_topic', 'sentiment', 'target'])
# Manually add columns to self.df
self.df['main_topic'] = df_topic_split['main_topic'].tolist()
self.df['main_topic'] = self.df['main_topic'].replace(["n/a", "", " "], "none", regex=True)
self.df['main_topic'] = self.df['main_topic'].apply(
lambda x: x.strip() if not (len(x) == 1 and x == "-") else "none")
self.df['sub_topic'] = df_topic_split['sub_topic'].tolist()
# In a few of the outputs from GPT-3 the sub_topic = "sentiment"
self.df['sub_topic'] = self.df['sub_topic'].replace(["n/a", "sentiment", "", " "], "none", regex=True)
self.df['sub_topic'] = self.df['sub_topic'].apply(
lambda x: x.strip() if not (len(x) == 1 and x == "-") else "none")
self.df['sentiment'] = df_topic_split['sentiment'].tolist()
self.df['sentiment'] = self.df['sentiment'].replace(["n/a", "sentiment", "", " "], "none", regex=True)
self.df['sentiment'] = self.df['sentiment'].apply(
lambda x: x.strip() if not (len(x) == 1 and x == "-") else "none")
self.df['target'] = df_topic_split['target'].tolist()
self.df['target'] = self.df['target'].replace(["n/a", "", " "], "none", regex=True)
self.df['target'] = self.df['target'].apply(lambda x: x.strip() if not (len(x) == 1 and x == "-") else "none")
self.df.fillna('none', inplace=True)
def get_dataframe(self):
"""
Returns the dataframe.
:return: dataframe
"""
return self.df
def __repr__(self):
"""
        Returns a string describing which users are classified and with which model.
        :return: description string
        """
        return "Classifier for users: " + ", ".join(self.user_name) + " with model: " + self.model_name + "."
def get_database(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
"""
        Returns the csv database of all previously classified tweets as a dataframe.
        :param filename: filename of csv file
        :return: dataframe read from the csv file
"""
db = pd.read_csv(filename)
return db
def cleanup_list(self, uncleaned_list):
"""
Cleans up faulty predictions.
:param uncleaned_list: the list to be cleaned
:return: cleaned list
"""
uncleaned_list = [s if not isinstance(s, float) else "none" for s in uncleaned_list]
uncleaned_list = [s if not len(s.split()) > 5 else "none" for s in uncleaned_list]
uncleaned_list = [s if not "swedish" in s else s.replace("swedish", " ") for s in uncleaned_list]
uncleaned_list = [s if not "politics" in s else s.replace("politics", "none") for s in uncleaned_list]
        uncleaned_list = [s.replace("  ", " ") for s in uncleaned_list]  # collapse double spaces
cleaned_list = [s.strip() for s in uncleaned_list]
return cleaned_list
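    # Illustrative example (hypothetical inputs): NaN values and predictions longer than five words become
    # "none"; "swedish" is dropped and "politics" is replaced with "none":
    #   cleanup_list([float("nan"), "swedish politics", "economy"])  ->  ["none", "none", "economy"]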
def merge_lists(self, main_topic_list, sub_topic_list):
"""
        Merges the topic lists. If either topic is a faulty classification, only the non-faulty topic will be used.
If both are faulty, the merged topic will be labeled as faulty (ERROR_496).
:param main_topic_list: A list containing main topics
:param sub_topic_list: A list containing sub topics
:return: A list containing string items on the form "main_topic and sub_topic"
"""
new_list = []
main_topic_list = self.clean_party_names(main_topic_list)
sub_topic_list = self.clean_party_names(sub_topic_list)
for i in range(len(main_topic_list)):
if main_topic_list[i].lower() == "none" and sub_topic_list[
i].lower() == "none": # If the predictions are faulty
new_list.append("ERROR_496") # Label as ERROR_496 (faulty prediction)
elif main_topic_list[i].lower() == "none":
new_list.append(sub_topic_list[i])
elif sub_topic_list[i].lower() == "none":
new_list.append(main_topic_list[i])
else:
new_list.append(main_topic_list[i] + " and " + sub_topic_list[i])
return new_list
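    # Illustrative example (hypothetical topics):
    #   merge_lists(["economy", "none"], ["taxes", "healthcare"])  ->  ["economy and taxes", "healthcare"]
    #   merge_lists(["none"], ["none"])                            ->  ["ERROR_496"]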
def file_to_mat(self, classification_type):
"""
Converts a synonym textfile to a matrix in which the rows contain a general topic/target and its related words.
:param classification_type: The type of classification: topic or target
:return: a matrix in which the first element of each row is a general topic/target, and the rest are words related to
the topic
"""
filename = "{}/data/".format(ROOT_PATH)
filename += classification_type + "_synonyms.txt"
with open(filename, encoding='utf-8') as f:
lines = f.read()
lines = lines.split("\n")
topic_list = []
temp_list = []
for topic in lines:
if not topic.endswith("####"):
temp_list.append(topic)
else:
temp_list.append(topic[:-4]) # Remove the marker (####)
topic_list.append(temp_list)
temp_list = []
return topic_list
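    # Assumed layout of the synonym text file (one group per general topic/target; the first line of a
    # group is the general label and the last synonym of the group ends with the "####" marker):
    #   economy
    #   taxes
    #   inflation####
    #   healthcare
    #   hospitals####
    # With that file, file_to_mat("topic") would return
    #   [["economy", "taxes", "inflation"], ["healthcare", "hospitals"]]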
def mat_to_list(self, mat):
"""
Converts a matrix from file_to_mat() into one list containing all topics and synonyms, and one list with
mappings for the synonyms.
:param mat: a matrix from file_to_mat()
:return:
"""
full_list = []
mapped_synonyms = []
for syns in mat:
for topic in syns:
full_list.append(topic)
mapped_synonyms.append(syns[0])
return full_list, mapped_synonyms
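    # Illustrative example: for the matrix above, mat_to_list() flattens the synonyms and maps each one
    # back to the first entry of its group:
    #   mat_to_list([["economy", "taxes"], ["healthcare"]])
    #   -> (["economy", "taxes", "healthcare"], ["economy", "economy", "healthcare"])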
def clean_party_names(self, old_topic_list):
"""
Encodes all party names to sentences that will yield a high cosine similarity value when merged with another
topic, without taking the actual party name into account. These sentences have deliberately been composed such
that they pose a low risk of being close (in the sentence embedding-space) to any possible merged topic or
target that may be encountered.
:param old_topic_list: list of topics
:return: list of encoded topics
"""
# Problem 1: When a party name is encountered, we want to bias the merging towards that party since the
# occurrence of a very general main topic (as in the example below) plus a party name as subtopic is frequent.
# Example: main_topic = "politics", sub_topic = "sweden democrats" ->
# combined_topics = "politics and sweden democrats"
# Problem 2: The party names themselves are biased towards certain topics/targets and lead to faulty merges.
# Example: Variations of words such as "Sweden" as the target/topic will be biased towards getting merged with
# "Sweden Democrats".
# Solution: Encode party names with sentences that are HIGHLY unlikely to be close to anything in the embedding
# space and thus enforcing a strong bias in the cosine similarity computation towards the party if encountered.
party_names = {}
party_names["m"] = "parrot computer is swimming as screen time"
party_names["moderaterna"] = "parrot computer is swimming as screen time"
party_names["moderates"] = "parrot computer is swimming as screen time"
party_names["the moderates"] = "parrot computer is swimming as screen time"
party_names["moderate party"] = "parrot computer is swimming as screen time"
party_names["the moderate party"] = "parrot computer is swimming as screen time"
party_names["the moderaterna party"] = "parrot computer is swimming as screen time"
party_names["sd"] = "keyboard can hire the yellow elephant in cosmos"
party_names["sverigedemokraterna"] = "keyboard can hire the yellow elephant in cosmos"
party_names["sweden democrats"] = "keyboard can hire the yellow elephant in cosmos"
party_names["the sweden democrats"] = "keyboard can hire the yellow elephant in cosmos"
party_names["the swedish democrats"] = "keyboard can hire the yellow elephant in cosmos"
party_names["swedish democrats"] = "keyboard can hire the yellow elephant in cosmos"
party_names["@jimmieakesson"] = "keyboard can hire the yellow elephant in cosmos"
party_names["l"] = "red weather jokes with music and the mathematician"
party_names["liberalerna"] = "red weather jokes with music and the mathematician"
party_names["liberals"] = "red weather jokes with music and the mathematician"
party_names["the liberals"] = "red weather jokes with music and the mathematician"
party_names["the liberal party"] = "red weather jokes with music and the mathematician"
party_names["liberal people's party"] = "red weather jokes with music and the mathematician"
party_names["@johanpehrson"] = "red weather jokes with music and the mathematician"
party_names["mp"] = "ice piano flies with pencil as direction"
party_names["miljöpartiet"] = "ice piano flies with pencil as direction"
party_names["de gröna"] = "ice piano flies with pencil as direction"
party_names["green party"] = "ice piano flies with pencil as direction"
party_names["the green party"] = "ice piano flies with pencil as direction"
party_names["miljopartiet"] = "ice piano flies with pencil as direction"
party_names["@bolund"] = "ice piano flies with pencil as direction"
party_names["@martastenevi"] = "ice piano flies with pencil as direction"
party_names["s"] = "lamp of fire walks bird gladly tomorrow"
party_names["socialdemokraterna"] = "lamp of fire walks bird gladly tomorrow"
party_names["social democratic party"] = "lamp of fire walks bird gladly tomorrow"
party_names["the social democratic party"] = "lamp of fire walks bird gladly tomorrow"
party_names["social democrats"] = "lamp of fire walks bird gladly tomorrow"
party_names["the social democrats"] = "lamp of fire walks bird gladly tomorrow"
party_names["sosse"] = "lamp of fire walks bird gladly tomorrow"
party_names["sossen"] = "lamp of fire walks bird gladly tomorrow"
party_names["sossar"] = "lamp of fire walks bird gladly tomorrow"
party_names["sossarna"] = "lamp of fire walks bird gladly tomorrow"
party_names["sossarnas"] = "lamp of fire walks bird gladly tomorrow"
party_names["swedish social democrats"] = "lamp of fire walks bird gladly tomorrow"
party_names["@swedishpm"] = "lamp of fire walks bird gladly tomorrow"
party_names["v"] = "rooftop cats play physics with cardboard fire"
party_names["vänsterpartiet"] = "rooftop cats play physics with cardboard fire"
party_names["left party"] = "rooftop cats play physics with cardboard fire"
party_names["the left party"] = "rooftop cats play physics with cardboard fire"
party_names["@dadgostarnooshi"] = "rooftop cats play physics with cardboard fire"
party_names["c"] = "differential donuts program sunny waters"
party_names["centerpartiet"] = "differential donuts program sunny waters"
party_names["center party"] = "differential donuts program sunny waters"
party_names["centre party"] = "differential donuts program sunny waters"
party_names["the center party"] = "differential donuts program sunny waters"
party_names["@annieloof"] = "differential donuts program sunny waters"
party_names["kd"] = "cauchy-riemann met sunglasses after rolling yellow"
party_names["kristdemokraterna"] = "cauchy-riemann met sunglasses after rolling yellow"
party_names["christian democrats"] = "cauchy-riemann met sunglasses after rolling yellow"
party_names["the christian democrats"] = "cauchy-riemann met sunglasses after rolling yellow"
party_names["@buschebba"] = "cauchy-riemann met sunglasses after rolling yellow"
for i, topic in enumerate(old_topic_list):
topic = topic.lower()
            topic = topic.replace("  ", " ")  # collapse double spaces
topic = topic.strip()
if topic in party_names:
old_topic_list[i] = party_names.get(topic)
return old_topic_list
def reset_party_names(self, old_topic_list):
"""
        Decodes the encoded party names back to their party abbreviations.
        :param old_topic_list: list of topics
        :return: list of decoded topics
"""
party_names = {}
party_names["m"] = "parrot computer is swimming as screen time"
party_names["sd"] = "keyboard can hire the yellow elephant in cosmos"
party_names["l"] = "red weather jokes with music and the mathematician"
party_names["mp"] = "ice piano flies with pencil as direction"
party_names["s"] = "lamp of fire walks bird gladly tomorrow"
party_names["v"] = "rooftop cats play physics with cardboard fire"
party_names["c"] = "differential donuts program sunny waters"
party_names["kd"] = "cauchy-riemann met sunglasses after rolling yellow"
inverted_dict = {}
# Invert dictionary
for k, v in party_names.items():
if v not in inverted_dict:
inverted_dict[v] = k
# Update values in old_topic_list
for i, topic in enumerate(old_topic_list):
if topic in inverted_dict.keys():
old_topic_list[i] = inverted_dict.get(topic)
return old_topic_list
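    # Illustrative round trip (hypothetical topic list): a party name is first encoded by
    # clean_party_names() and later decoded back to its abbreviation by reset_party_names():
    #   clean_party_names(["sweden democrats", "economy"])
    #   -> ["keyboard can hire the yellow elephant in cosmos", "economy"]
    #   reset_party_names(["keyboard can hire the yellow elephant in cosmos", "economy"])
    #   -> ["sd", "economy"]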
def merge_classifications(self, old_list, classification_type):
"""
Merges topics/targets from GPT-3 according to a list of predefined topics/targets.
:param old_list: list of the topics/targets to be merged
:param classification_type: type of classifications: topic or target
:return: list of new topics/targets
"""
# Get the tuple of lists containing all synonyms and general topics/targets
tup_list = self.mat_to_list(self.file_to_mat(classification_type))
# Save list of synonyms
synonym_list = tup_list[0]
# Save list of mappings between synonym and general topic/target
synonym_mappings = tup_list[1]
# Load embedding model-names
model_list = ['sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', 'all-MiniLM-L6-v2']
result_dict = {}
# Encode party names
old_list = self.clean_party_names(old_list)
for model_name in model_list:
model = SentenceTransformer(model_name)
# Encode the topics/targets with the sentence transformer model
old_list_embeddings = model.encode(old_list, batch_size=64, show_progress_bar=True,
convert_to_tensor=True)
# Encode the synonyms with the sentence transformer model
synonym_list_embeddings = model.encode(synonym_list, batch_size=64, show_progress_bar=True,
convert_to_tensor=True)
for i, embedded_classification in enumerate(old_list_embeddings):
result_list = []
for embedded_synonyms in synonym_list_embeddings:
# Compute the cosine similarity between every classification and synonym
result = 1 - spatial.distance.cosine(embedded_classification, embedded_synonyms)
result_list.append(result)
max_value = max(result_list)
max_index = result_list.index(max_value)
old_classification = old_list[i]
# Extract the general topic/target
new_classification = synonym_mappings[max_index]
# Save the topic/target that yielded the highest cosine similarity value
if old_classification not in result_dict:
result_dict[old_classification] = [(new_classification, max_value, synonym_list[max_index])]
# When we have found the best topics/targets after using the first transformer model
else:
# Append the results from the next model
result_dict[old_classification].append((new_classification, max_value, synonym_list[max_index]))
new_dict = {}
# Time to replace the old values with the new ones
for old_values in result_dict:
tup_list = result_dict[old_values]
max_tup = max(tup_list, key=lambda item: item[1])
if classification_type == "topic":
limit = 0.4
else:
limit = 0.75
# Discard classification if the old topic/target is not similar to anything in our synonym lists
if max_tup[1] < limit:
max_tup = ("ERROR_9000", "{:.2f}".format(round(max_tup[1], 2)), "none")
else:
max_tup = (max_tup[0], "{:.2f}".format(round(max_tup[1], 2)), max_tup[2])
new_classification = max_tup
if old_values not in new_dict:
new_dict[old_values] = new_classification
new_list = []
for old_value in old_list:
new_list.append(new_dict[old_value])
return new_list
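    # Note on the return format produced above: each element of the returned list is a tuple of the form
    #   (general_topic_or_target, cosine_similarity_as_string, matched_synonym)
    # e.g. ("economy", "0.83", "taxes"), or ("ERROR_9000", "0.21", "none") when nothing in the synonym
    # list is similar enough (below the 0.4 topic / 0.75 target threshold).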
def merge_all(self):
"""
        Merges main topics with subtopics, merges targets, and updates the dataframe.
        :return: None
"""
df_topics = self.df.copy()
sub_topics = df_topics['sub_topic']
sub_topics = sub_topics.tolist()
sub_topics = self.cleanup_list(sub_topics)
main_topics = df_topics['main_topic']
main_topics = main_topics.tolist()
main_topics = self.cleanup_list(main_topics)
merged_topic_list = self.merge_lists(main_topics, sub_topics)
targets = df_topics['target']
targets = targets.tolist()
targets = self.cleanup_list(targets)
merged_topics = self.merge_classifications(merged_topic_list, "topic")
merged_targets = self.merge_classifications(targets, "target")
print("The following merges were made: ")
for i, top in enumerate(merged_topic_list):
print("TOPICS: ", top, " -> ", merged_topics[i])
t_list = []
for i in range(len(merged_topics)):
t_list.append(tuple(merged_topics[i]) + tuple(merged_targets[i]))
merged_tuples = t_list
df_topics['merged_tuple'] = merged_tuples
df = self.split_merged_tuple_into_columns(df_topics)
print("Merging finished...")
self.df = df
def split_merged_tuple_into_columns(self, df):
"""
        Splits the merged tuple (merged topic, topic cosine similarity, topic synonym, merged target,
        target cosine similarity, target synonym) into columns and updates self.df.
        :param df: dataframe containing the 'merged_tuple' column
        :return: the updated dataframe
"""
df_topic = df.copy()
df_topic_split = pd.DataFrame(df_topic['merged_tuple'].tolist(),
columns=['merged_topic', 'cos_sim_topic', 'synonym_topic', 'merged_target',
'cos_sim_target', 'synonym_target'])
self.df['merged_tuple'] = df_topic['merged_tuple'].tolist()
# Manually add columns to self.df
self.df['merged_topic'] = df_topic_split['merged_topic'].tolist()
self.df['cos_sim_topic'] = df_topic_split['cos_sim_topic'].tolist()
self.df['synonym_topic'] = self.reset_party_names(df_topic_split['synonym_topic'].tolist())
self.df['merged_target'] = df_topic_split['merged_target'].tolist()
self.df['cos_sim_target'] = df_topic_split['cos_sim_target'].tolist()
self.df['synonym_target'] = self.reset_party_names(df_topic_split['synonym_target'].tolist())
return self.df
def run_main_pipeline(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
"""
        Classifies the topics/sentiments of the users' tweets.
        We presume that all tweets already inside the twitterdata.csv file have been classified.
        :param filename: path of the csv file used as a database of classified tweets
        :return: None
"""
# Check if file exists, if not, create it
if os.path.exists(filename):
# Fetch tweets from csv file
already_classified_df = pd.read_csv(filename, on_bad_lines='skip')
print("Already classified tweets: {}".format(already_classified_df.shape[0]))
            # Keep in temp_df the already-classified rows whose ids also appear in self.df
            temp_df = already_classified_df[already_classified_df['id'].isin(self.df['id'])]
            # Drop from self.df the rows that have already been classified
self.df = self.df[~self.df['id'].isin(already_classified_df['id'])]
# Only classify non-empty rows
if self.df.shape[0] > 0:
time.sleep(10)
print("Classifying topic, subtopic, sentiment and target of {} tweets...".format(self.df.shape[0]))
self.df = self.classify_all_list()
self.df = self.df.replace({'': 'none'}, regex=True)
self.df = self.df.replace({' ': 'none'}, regex=True)
print("Merging topics...")
self.merge_all()
print("Writing to csv...")
self.df_to_csv(filename)
# Concatenate temp_df and self.df
self.df = pd.concat([temp_df, self.df], ignore_index=True)
print("Appended {}.".format(filename))
return None
else:
self.df = pd.concat([temp_df, self.df], ignore_index=True)
print("No new tweets to classify.")
return None
else:
print("No csv file found. Continuing without removing already classified tweets.")
print("Classifying topic, subtopic, sentiment and target of {} tweets...".format(self.df.shape[0]))
self.df = self.classify_all_list()
self.df = self.df.replace({'': 'none'}, regex=True)
self.df = self.df.replace({' ': 'none'}, regex=True)
print("Merging topics...")
self.merge_all()
print("Writing to csv file...")
self.df_to_csv(filename)
print("Created {}.".format(filename))
return None
if __name__ == "__main__":
# $6.39 @ 3431 tweets
# $18.00 @ 4608 tweets
# $11.61 to classify 1177 tweets ~ $0.01 / tweet
# This code snippet allows for scraping and classifying by simply specifying a start and end date.
USER_LIST = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi', 'SwedishPM',
'dadgostarnooshi']
start_date = date(2022, 8, 4)
end_date = date(2022, 8, 4)
delta = timedelta(days=1)
while start_date <= end_date:
from_date = start_date.strftime("%Y-%m-%d")
start_date += delta
to_date = start_date.strftime("%Y-%m-%d")
print("curr_date: ", from_date)
tc = TextClassifier(from_date=from_date, to_date=to_date, user_list=USER_LIST, num_tweets=6000)
tc.run_main_pipeline()
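    # A minimal single-run sketch (assuming working Twitter scraping and an OPENAI_API_KEY in the
    # environment); shown commented out so the date loop above remains the default entry point:
    # tc = TextClassifier(from_date="2022-08-01", to_date="2022-08-02",
    #                     user_list=["SwedishPM"], num_tweets=100)
    # tc.run_main_pipeline()
    # print(tc.get_dataframe().head())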