politweet / app.py
Demea9000's picture
changed topic so that topics are divided into columns
5214b07
import gradio as gr
from textclassifier import TextClassifier as tc
from functions import *
import re
def main(from_date, to_date, user_name):
    """
    Main function. Runs the classification pipeline for one user and date range.
    :param from_date: start of the date range, forwarded unchanged to TextClassifier
        (the script section of this file passes strings such as '2020-01-01')
    :param to_date: end of the date range, forwarded unchanged to TextClassifier
    :param user_name: user name forwarded unchanged to TextClassifier
    :return: tuple of (dataframe produced by the pipeline, summary statistics string)
    """
    # The number of tweets is fixed at 20 for every run started through this entry point.
    text_classifier = tc.TextClassifier(from_date=from_date, to_date=to_date, user_name=user_name, num_tweets=20)
    text_classifier.run_main_pipeline()
    dataframe = text_classifier.dataframe
    return dataframe, get_summary_statistics(dataframe)
def get_summary_statistics(dataframe):
    """
    Render the descriptive statistics of a dataframe as plain text.
    :param dataframe: dataframe whose describe() table is rendered
    :return: str, the describe() table converted with to_string()
    """
    return dataframe.describe().to_string()
def separate_string(string):
    """
    Split a numbered list such as '1. foo 2. bar 3. ' into its cleaned text items.
    One item is produced for every '.' in the input, so an enumerator with no text
    after it yields an empty string.
    :param string: str, the numbered list
    :return: list of str, one cleaned item per '.' found in the input
    """
    # Splitting on '.' leaves each item's text in front of the NEXT item's number,
    # e.g. ' swedish 2'. The first segment only holds what precedes the first '.'
    # (the leading enumerator), so it is skipped.
    segments = string.split('.')
    items = []
    for segment in segments[1:]:
        # Keep what follows the first space. partition() yields '' when the segment
        # has no space at all, where indexing the result of split() raised IndexError.
        _, _, text = segment.partition(' ')
        # Every character that is not a lowercase ASCII letter becomes a space (this
        # removes the trailing enumerator digits); the padding is then stripped.
        # NOTE(review): uppercase and non-ASCII letters are blanked as well - confirm
        # that this is intended for the topics being parsed.
        items.append(re.sub(r'[^a-z]', ' ', text).strip())
    return items
def summary_categorical(dataframe):
    """
    Build a text summary of the categorical (object dtype) columns of a dataframe.
    The numeric and categorical column indexes are also printed to stdout.
    :param dataframe: dataframe to summarise
    :return: str, a header line followed by the value counts of each categorical column
    """
    numeric_cols = dataframe._get_numeric_data().columns
    categorical_cols = dataframe.select_dtypes(include=['object']).columns
    print("Numeric columns: " + str(numeric_cols) + "\n" + "Categorical columns: " + str(categorical_cols))

    # These columns are left out of the summary.
    skipped = ('tweet', 'date', 'urls')
    sections = [
        column + ": " + str(dataframe[column].value_counts()) + "\n"
        for column in categorical_cols
        if column not in skipped
    ]
    return "Summary of categorical variables:\n" + "".join(sections)
if __name__ == "__main__":
    from datetime import date

    # Gradio front end, currently disabled in favour of the console run below:
    # demo = gr.Interface(
    #     fn=main,
    #     inputs=['text', 'text', 'text'],
    #     outputs=["dataframe", "text"],
    # )
    # demo.launch()

    # Console run: classify 20 tweets of one account for January 2020 and print
    # the numeric summary, its type, and the categorical summary.
    text_classifier = tc.TextClassifier(
        from_date='2020-01-01',
        to_date='2020-01-31',
        user_name="jimmieakesson",
        num_tweets=20,
    )
    text_classifier.run_main_pipeline()
    print(get_summary_statistics(text_classifier.get_dataframe()))
    print(type(get_summary_statistics(text_classifier.get_dataframe())))
    print(summary_categorical(text_classifier.get_dataframe()))

    # Quick manual check of the numbered-list parser.
    string = '1. swedish 2. nuclear 3. hello world 4. uha yhd ikv hahd vva 5. '
    print(separate_string(string))