Spaces:

jsulz
/

sotu-analysis

Running

File size: 5,826 Bytes

import gradio as gr
from datasets import load_dataset
from nltk.util import ngrams
from collections import Counter
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

# Load the dataset and convert it to a Pandas dataframe
sotu_dataset = "jsulz/state-of-the-union-addresses"
dataset = load_dataset(sotu_dataset)
df = dataset["train"].to_pandas()
# decode the tokens-nostop column from a byte array to a list of string
"""
df["tokens-nostop"] = df["tokens-nostop"].apply(
    lambda x: x.decode("utf-8")
    .replace('"', "")
    .replace("[", "")
    .replace("]", "")
    .split(",")
)
"""
df["word_count"] = df["speech_html"].apply(lambda x: len(x.split()))
# calculate the automated readibility index reading ease score for each address
# automated readability index = 4.71 * (characters/words) + 0.5 * (words/sentences) - 21.43
df["ari"] = df["no-contractions"].apply(
    lambda x: (4.71 * (len(x.replace(" ", "")) / len(x.split())))
    + (0.5 * (len(x.split()) / len(x.split("."))))
    - 21.43
)

written = df[df["categories"] == "Written"]
spoken = df[df["categories"] == "Spoken"]

# Create a Gradio interface with blocks
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # A Dashboard to Analyze the State of the Union Addresses
        """
    )
    gr.BarPlot(
        df,
        x="date",
        y="word_count",
        title="Total Number of Words in the Speeches",
        color="categories",
    )
    # group by president and category and calculate the average word count sort by date
    avg_word_count = (
        df.groupby(["date", "potus", "categories"])["word_count"].mean().reset_index()
    )
    # create a bar chart
    gr.BarPlot(
        avg_word_count,
        x="potus",
        y="word_count",
        title="Average Number of Words in the Speeches",
        color="categories",
        x_label_angle=-45,
        height=400,
        min_width=160,
        fill_height=True,
        container=True,
        scale=2,
    )
    with gr.Row():
        ari = df[["potus", "date", "ari", "categories"]]
        gr.LinePlot(
            ari,
            x="date",
            y="ari",
            title="Automated Readability Index",
        )
    # get all unique president names
    presidents = df["potus"].unique()
    # convert presidents to a list
    presidents = presidents.tolist()
    # create a dropdown to select a president
    president = gr.Dropdown(label="Select a President", choices=["All"] + presidents)
    grams = gr.Slider(minimum=1, maximum=4, step=1, label="N-grams", interactive=True)
    with gr.Row():
        # if president is not of type string
        @gr.render(inputs=president)
        def show_text(potus):
            if potus != "All" and potus is not None:
                ari = df[df["potus"] == potus][
                    ["date", "categories", "word_count", "ari"]
                ]
                gr.DataFrame(ari, height=200)

        @gr.render(inputs=president)
        def word_length_bar(potus):
            # calculate the total number of words in the speech_html column and add it to a new column
            # if the president is "All", show the word count for all presidents
            # if the president is not "All", show the word count for the selected president
            if potus != "All" and potus is not None:
                gr.LinePlot(
                    df[df["potus"] == potus],
                    x="date",
                    y="word_count",
                    title="Total Number of Words in the Speeches",
                )

    with gr.Row():
        with gr.Column():

            @gr.render(inputs=[president, grams])
            def ngram_bar(potus, n_grams):
                if potus != "All" and potus is not None:
                    if type(n_grams) is not int:
                        n_grams = 1
                    print(n_grams)
                    # create a Counter object from the trigrams
                    potus_df = df[df["potus"] == potus]
                    # decode the tokens-nostop column from a byte array to a list of string
                    trigrams = (
                        potus_df["tokens-nostop"]
                        .apply(lambda x: list(ngrams(x, n_grams)))
                        .apply(Counter)
                        .sum()
                    )
                    # get the most common trigrams
                    common_trigrams = trigrams.most_common(20)
                    # unzip the list of tuples and plot the trigrams and counts as a bar chart
                    trigrams, counts = zip(*common_trigrams)
                    # join the trigrams into a single string
                    trigrams = [" ".join(trigram) for trigram in trigrams]
                    # create a dataframe from the trigrams and counts
                    trigrams_df = pd.DataFrame({"trigrams": trigrams, "counts": counts})
                    # plot the trigrams and counts as a bar chart from matplotlib
                    """
                    fig, ax = plt.subplots(figsize=(12, 4))
                    ax.barh(trigrams_df["trigrams"], trigrams_df["counts"])
                    ax.set_title("Top 20 Trigrams")
                    ax.set_ylabel("Count")
                    ax.set_xlabel("Trigrams")
                    plt.xticks(rotation=45)
                    # make it tight layout
                    plt.tight_layout()
                    """
                    fig = px.scatter(
                        trigrams_df,
                        x="counts",
                        y="trigrams",
                        title="Top 20 Trigrams",
                        orientation="h",
                    )
                    print(fig)
                    gr.Plot(value=fig, container=True, visible=True)


demo.launch(share=True)