File size: 2,604 Bytes
15ca093
e515291
297c37f
 
 
 
 
e515291
 
15ca093
e515291
 
 
 
15ca093
 
 
e515291
ec6943b
297c37f
 
15ca093
 
5214b07
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ec6943b
 
297c37f
 
ec6943b
5214b07
15ca093
5214b07
 
 
 
 
 
 
38b2250
5214b07
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import gradio as gr
from textclassifier import TextClassifier as tc
from functions import functions as f
import time

# Account names that can be supplied as `user_name` to the text classifier;
# the script entry point at the bottom of this file uses USER_LIST[1].
# NOTE(review): these look like Twitter handles of Swedish politicians — confirm.
USER_LIST = ['jimmieakesson', 'BuschEbba', 'annieloof', 'JohanPehrson', 'bolund', 'martastenevi', 'SwedishPM',
             'dadgostarnooshi']


def main(from_date, to_date, user_name):
    """
    Run the classification pipeline for one user and date range.

    Builds a ``TextClassifier`` with ``num_tweets=20``, runs its main pipeline
    and returns the resulting dataframe together with its summary statistics.

    :param from_date: start of the date range, forwarded unchanged to ``TextClassifier``
    :param to_date: end of the date range, forwarded unchanged to ``TextClassifier``
    :param user_name: account to analyse, forwarded unchanged to ``TextClassifier``
    :return: tuple ``(dataframe, summary)`` where ``summary`` is the string
        returned by ``get_summary_statistics(dataframe)``
    """
    text_classifier = tc.TextClassifier(from_date=from_date, to_date=to_date, user_name=user_name, num_tweets=20)
    text_classifier.run_main_pipeline()
    dataframe = text_classifier.get_dataframe()
    return dataframe, get_summary_statistics(dataframe)


def get_summary_statistics(dataframe):
    """
    Render the descriptive statistics of a dataframe as plain text.

    :param dataframe: dataframe to describe
    :return: str, the result of ``dataframe.describe()`` formatted with ``to_string()``
    """
    return dataframe.describe().to_string()


def summary_categorical(dataframe):
    """
    Build a text report with the value counts of each categorical column.

    Columns of dtype ``object`` are treated as categorical; the 'tweet', 'date'
    and 'urls' columns are left out of the report. As a side effect the numeric
    and categorical column indexes are printed to stdout.

    :param dataframe: dataframe to summarise
    :return: str, a header line followed by one ``value_counts()`` dump per
        reported column
    """
    # Columns that are never reported even though they are object-typed
    skipped = ('tweet', 'date', 'urls')

    numeric_cols = dataframe._get_numeric_data().columns
    object_cols = dataframe.select_dtypes(include=['object']).columns
    print(f"Numeric columns: {numeric_cols!s}\nCategorical columns: {object_cols!s}")

    # One "<column>: <value counts>" entry per remaining categorical column
    report_parts = [
        name + ": " + str(dataframe[name].value_counts()) + "\n"
        for name in object_cols
        if name not in skipped
    ]
    return "Summary of categorical variables:\n" + "".join(report_parts)




if __name__ == "__main__":
    # The Gradio front end is currently disabled; uncomment it to expose `main`
    # as a web form instead of running the fixed example below.
    # demo = gr.Interface(
    #     fn=main,
    #     inputs=['text', 'text', 'text'],
    #     outputs=["dataframe", "text"],
    # )
    # demo.launch()

    text_classifier = tc.TextClassifier(from_date='2019-07-01', to_date='2022-07-31', user_name=USER_LIST[1], num_tweets=20)
    text_classifier.run_main_pipeline()

    # Fetch the results once and reuse them for every report below.
    # NOTE(review): assumes get_dataframe() returns the same data on each call — confirm.
    dataframe = text_classifier.get_dataframe()
    summary_statistics = get_summary_statistics(dataframe)
    print(summary_statistics)
    print(type(summary_statistics))
    print(summary_categorical(dataframe))