File size: 5,826 Bytes
9cd5d9e
317f161
11067be
 
68b374e
 
 
317f161
 
68b374e
317f161
68b374e
 
1fd02a5
68b374e
 
 
 
 
 
 
1fd02a5
68b374e
 
 
 
 
 
 
 
 
 
 
317f161
 
68b374e
11067be
 
 
68b374e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11067be
68b374e
11067be
 
 
 
68b374e
317f161
11067be
 
 
68b374e
 
 
 
 
 
11067be
 
 
 
68b374e
 
 
 
 
 
 
 
 
11067be
1fd02a5
11067be
1fd02a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f239c56
68b374e
9cd5d9e
f239c56
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import gradio as gr
from datasets import load_dataset
from nltk.util import ngrams
from collections import Counter
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

# Load the dataset and convert it to a Pandas dataframe
sotu_dataset = "jsulz/state-of-the-union-addresses"
dataset = load_dataset(sotu_dataset)
df = dataset["train"].to_pandas()
# NOTE(review): "tokens-nostop" is used exactly as loaded; the n-gram chart
# iterates each cell directly, so each cell is presumably already a sequence
# of token strings -- TODO confirm against the dataset schema.


def _automated_readability_index(text: str) -> float:
    """Return the Automated Readability Index (ARI) of ``text``.

    ARI = 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43

    Characters are every character except spaces, words are the
    whitespace-separated tokens, and sentences are the non-blank fragments
    between periods (so the empty fragment after a trailing "." is not
    counted as a sentence).

    Args:
        text: The text of one address.

    Returns:
        The ARI score, or NaN when the text contains no words or no
        sentences (instead of raising ZeroDivisionError).
    """
    words = text.split()
    sentences = [part for part in text.split(".") if part.strip()]
    if not words or not sentences:
        return float("nan")
    characters = len(text.replace(" ", ""))
    return (
        4.71 * (characters / len(words))
        + 0.5 * (len(words) / len(sentences))
        - 21.43
    )


# number of whitespace-separated tokens in each address
df["word_count"] = df["speech_html"].apply(lambda x: len(x.split()))
# automated readability index for each address
df["ari"] = df["no-contractions"].apply(_automated_readability_index)

written = df[df["categories"] == "Written"]
spoken = df[df["categories"] == "Spoken"]

# Create a Gradio interface with blocks
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # A Dashboard to Analyze the State of the Union Addresses
        """
    )
    # word count of every address by date, colored by the "categories" column
    gr.BarPlot(
        df,
        x="date",
        y="word_count",
        title="Total Number of Words in the Speeches",
        color="categories",
    )
    # Average word count per president and category. "date" must NOT be a
    # grouping key: with it every group is a single address, so nothing is
    # averaged. The earliest date of each group is kept only so the rows can
    # be ordered chronologically.
    avg_word_count = (
        df.groupby(["potus", "categories"])
        .agg(word_count=("word_count", "mean"), date=("date", "min"))
        .sort_values("date")
        .reset_index()
    )
    # create a bar chart
    gr.BarPlot(
        avg_word_count,
        x="potus",
        y="word_count",
        title="Average Number of Words in the Speeches",
        color="categories",
        x_label_angle=-45,
        height=400,
        min_width=160,
        fill_height=True,
        container=True,
        scale=2,
    )
    with gr.Row():
        # readability score of every address by date
        ari = df[["potus", "date", "ari", "categories"]]
        gr.LinePlot(
            ari,
            x="date",
            y="ari",
            title="Automated Readability Index",
        )
    # unique president names, as a plain list for the dropdown choices
    presidents = df["potus"].unique().tolist()
    # controls that drive the per-president renders below; the dropdown has
    # no initial value, so those renders receive None until a choice is made
    president = gr.Dropdown(label="Select a President", choices=["All"] + presidents)
    grams = gr.Slider(minimum=1, maximum=4, step=1, label="N-grams", interactive=True)
    with gr.Row():
        # per-president details; each render below draws nothing unless a single president is selected
        @gr.render(inputs=president)
        def show_text(potus):
            """Render a table of the selected president's addresses.

            The table shows the date, category, word count and ARI of each
            address. Nothing is rendered while the dropdown value is ``None``
            or ``"All"``.
            """
            if potus is None or potus == "All":
                return
            stat_columns = ["date", "categories", "word_count", "ari"]
            potus_rows = df.loc[df["potus"] == potus, stat_columns]
            gr.DataFrame(potus_rows, height=200)

        @gr.render(inputs=president)
        def word_length_bar(potus):
            """Render a line plot of word count by date for the selected president.

            Plots the ``word_count`` column of the rows whose ``potus`` matches
            the dropdown value. Nothing is rendered while the dropdown value is
            ``None`` or ``"All"``.
            """
            if potus is None or potus == "All":
                return
            potus_rows = df[df["potus"] == potus]
            gr.LinePlot(
                potus_rows,
                x="date",
                y="word_count",
                title="Total Number of Words in the Speeches",
            )

    with gr.Row():
        with gr.Column():

            @gr.render(inputs=[president, grams])
            def ngram_bar(potus, n_grams):
                """Render a chart of the 20 most frequent n-grams for one president.

                Args:
                    potus: Value of the president dropdown. Nothing is rendered
                        while it is ``None`` or ``"All"``.
                    n_grams: Value of the n-gram slider, i.e. the number of
                        tokens per n-gram. Values that cannot be converted to
                        an integer fall back to 1.
                """
                if potus is None or potus == "All":
                    return
                # The slider value may arrive as a float (e.g. 2.0); coerce it
                # rather than silently resetting every non-int value to 1.
                try:
                    n_grams = max(1, int(n_grams))
                except (TypeError, ValueError):
                    n_grams = 1
                # Tally n-grams across all of the president's addresses in one
                # Counter instead of building and summing one Counter per row.
                # NOTE(review): assumes each "tokens-nostop" cell is a sequence
                # of token strings -- TODO confirm.
                ngram_counts = Counter()
                for tokens in df.loc[df["potus"] == potus, "tokens-nostop"]:
                    ngram_counts.update(ngrams(tokens, n_grams))
                top_ngrams = ngram_counts.most_common(20)
                if not top_ngrams:
                    # no n-grams at all (e.g. fewer than n tokens per address):
                    # there is nothing to plot
                    return
                # one label per n-gram: its tokens joined by single spaces
                labels = [" ".join(gram) for gram, _ in top_ngrams]
                counts = [count for _, count in top_ngrams]
                ngram_df = pd.DataFrame({"ngram": labels, "counts": counts})
                # name the chart after the selected n instead of always
                # calling it "Trigrams"
                ngram_names = {1: "Unigrams", 2: "Bigrams", 3: "Trigrams"}
                ngram_name = ngram_names.get(n_grams, f"{n_grams}-grams")
                fig = px.scatter(
                    ngram_df,
                    x="counts",
                    y="ngram",
                    title=f"Top 20 {ngram_name}",
                    orientation="h",
                )
                gr.Plot(value=fig, container=True, visible=True)


# Launch only when this file is executed as a script, so that importing the
# module (for reuse or tooling) does not start a server as a side effect.
if __name__ == "__main__":
    demo.launch(share=True)