Spaces:

jsulz
/

sotu-analysis

Running

App Files Files Community

jsulz HF staff committed on Aug 29

Commit

68b374e

•

1 Parent(s): b09504e

done with most charts; just need to add words

Browse files

Files changed (4) hide show

app.py +111 -33
poetry.lock +0 -0
pyproject.toml +19 -0
requirements.txt +75 -1

app.py CHANGED Viewed

@@ -2,61 +2,139 @@ import gradio as gr
 from datasets import load_dataset
 from nltk.util import ngrams
 from collections import Counter
 # Load the dataset and convert it to a Pandas dataframe
-sotu_dataset = 'jsulz/state-of-the-union-addresses'
 dataset = load_dataset(sotu_dataset)
-df = dataset['train'].to_pandas()
-df['word_count'] = df['speech_html'].apply(lambda x: len(x.split()))
-written = df[df['categories'] == 'Written']
-spoken = df[df['categories'] == 'Spoken']
 # Create a Gradio interface with blocks
-with  gr.Blocks() as demo:
     gr.Markdown(
         """
         # A Dashboard to Analyze the State of the Union Addresses
-        """)
     # get all unique president names
-    presidents = df['potus'].unique()
     # convert presidents to a list
     presidents = presidents.tolist()
     # create a dropdown to select a president
     president = gr.Dropdown(label="Select a President", choices=["All"] + presidents)
     with gr.Row():
         # if president is not of type string
         @gr.render(inputs=president)
         def show_text(potus):
-            if potus is not None:
-                gr.Markdown(f"{potus} was the first president of the United States.")
         @gr.render(inputs=president)
         def word_length_bar(potus):
             # calculate the total number of words in the speech_html column and add it to a new column
             # if the president is "All", show the word count for all presidents
-            if potus == "All":
-                gr.BarPlot(df, x="date", y="word_count", title="Total Number of Words in the Speeches")
-            else:
-                # if the president is not "All", show the word count for the selected president
-                gr.BarPlot(df[df['potus'] == potus], x="date", y="word_count", title="Total Number of Words in the Speeches")
     with gr.Row():
-        @gr.render(inputs=president)
-        def ngram_bar(potus):
-            # create a Counter object from the trigrams
-            potus_df = df[df["potus"] == potus]
-            trigrams = (
-                potus_df["tokens-nostop"].apply(lambda x: list(ngrams(x, 3))).apply(Counter).sum()
-            )
-            # get the most common trigrams
-            common_trigrams = trigrams.most_common(20)
-            # unzip the list of tuples and plot the trigrams and counts as a bar chart
-            trigrams, counts = zip(*common_trigrams)
-            # join the trigrams into a single string
-            trigrams = [" ".join(trigram) for trigram in trigrams]
-            # create a dataframe from the trigrams and counts
-            trigrams_df = pd.DataFrame({"trigrams": trigrams, "counts": counts})
-            # plot the trigrams and counts as a bar chart
-            gr.BarPlot(trigrams_df, x="trigrams", y="counts", title="Most Common Trigrams")
 demo.launch()

 from datasets import load_dataset
 from nltk.util import ngrams
 from collections import Counter
+import pandas as pd
+import plotly.express as px
+import matplotlib.pyplot as plt
 # Load the dataset and convert it to a Pandas dataframe
+sotu_dataset = "jsulz/state-of-the-union-addresses"
 dataset = load_dataset(sotu_dataset)
+df = dataset["train"].to_pandas()
+# decode the tokens-nostop column from bytes into a list of strings
+# NOTE(review): the quotes and brackets are stripped and the text is split on ",",
+# so a token keeps any whitespace that followed a comma and a token containing ","
+# would be cut in two — presumably the bytes hold a JSON-style list; confirm the
+# stored format and consider a real parser (e.g. json.loads) if so
+df["tokens-nostop"] = df["tokens-nostop"].apply(
+    lambda x: x.decode("utf-8")
+    .replace('"', "")
+    .replace("[", "")
+    .replace("]", "")
+    .split(",")
+)
+df["word_count"] = df["speech_html"].apply(lambda x: len(x.split()))
+# calculate the Automated Readability Index (ARI) for each address
+# ARI = 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43
+def automated_readability_index(text):
+    """Return the ARI of *text*, or NaN when *text* contains no words.
+
+    Words are whitespace-separated runs, characters are counted without any
+    whitespace, and sentences are the non-empty segments between periods.
+    """
+    words = text.split()
+    if not words:
+        # nothing to score; avoids a ZeroDivisionError on empty text
+        return float("nan")
+    characters = sum(len(word) for word in words)
+    # skip empty segments so a trailing "." or an ellipsis adds no extra sentence
+    sentences = sum(1 for segment in text.split(".") if segment.strip()) or 1
+    return 4.71 * (characters / len(words)) + 0.5 * (len(words) / sentences) - 21.43
+
+
+df["ari"] = df["no-contractions"].apply(automated_readability_index)
+written = df[df["categories"] == "Written"]
+spoken = df[df["categories"] == "Spoken"]
 # Create a Gradio interface with blocks
+with gr.Blocks() as demo:
     gr.Markdown(
         """
         # A Dashboard to Analyze the State of the Union Addresses
+        """
+    )
+    gr.BarPlot(
+        df,
+        x="date",
+        y="word_count",
+        title="Total Number of Words in the Speeches",
+        color="categories",
+    )
+    # mean word count for each (date, president, category) group; groupby sorts by
+    # its keys by default, so the rows come out ordered by date first
+    # NOTE(review): "date" is part of the key, so each group probably holds a single
+    # address and the mean is just that address's own count — confirm whether a
+    # per-president average (grouping by potus and categories only) was intended
+    avg_word_count = (
+        df.groupby(["date", "potus", "categories"])["word_count"].mean().reset_index()
+    )
+    # bar chart of the grouped word counts per president, coloured by category
+    gr.BarPlot(
+        avg_word_count,
+        x="potus",
+        y="word_count",
+        title="Average Number of Words in the Speeches",
+        color="categories",
+        x_label_angle=-45,
+        height=400,
+        min_width=160,
+        fill_height=True,
+        container=True,
+        scale=2,
+    )
+    with gr.Row():
+        ari = df[["potus", "date", "ari", "categories"]]
+        gr.LinePlot(
+            ari,
+            x="date",
+            y="ari",
+            title="Automated Readability Index",
+        )
     # get all unique president names
+    presidents = df["potus"].unique()
     # convert presidents to a list
     presidents = presidents.tolist()
     # create a dropdown to select a president
     president = gr.Dropdown(label="Select a President", choices=["All"] + presidents)
+    grams = gr.Slider(minimum=1, maximum=4, step=1, label="N-grams", interactive=True)
     with gr.Row():
         # if president is not of type string
         @gr.render(inputs=president)
         def show_text(potus):
+            if potus != "All" and potus is not None:
+                ari = df[df["potus"] == potus][
+                    ["date", "categories", "word_count", "ari"]
+                ]
+                gr.DataFrame(ari, height=200)
         @gr.render(inputs=president)
         def word_length_bar(potus):
             # calculate the total number of words in the speech_html column and add it to a new column
             # if the president is "All", show the word count for all presidents
+            # if the president is not "All", show the word count for the selected president
+            if potus != "All" and potus is not None:
+                gr.LinePlot(
+                    df[df["potus"] == potus],
+                    x="date",
+                    y="word_count",
+                    title="Total Number of Words in the Speeches",
+                )
     with gr.Row():
+        @gr.render(inputs=[president, grams])
+        def ngram_bar(potus, n_grams):
+            """Plot the 20 most frequent n-grams used by the selected president.
+
+            Renders nothing until a specific president (not "All") is chosen.
+            ``n_grams`` is the slider value; a non-numeric value falls back to 1.
+            """
+            if potus is None or potus == "All":
+                return
+            # the slider may deliver its value as a float (e.g. 2.0); coerce it
+            # rather than discarding every non-int and silently using 1
+            if isinstance(n_grams, (int, float)):
+                n_grams = max(1, int(n_grams))
+            else:
+                n_grams = 1
+            # tally n-grams across all of this president's addresses
+            gram_counts = Counter()
+            for tokens in df.loc[df["potus"] == potus, "tokens-nostop"]:
+                gram_counts.update(ngrams(tokens, n_grams))
+            top_grams = gram_counts.most_common(20)
+            if not top_grams:
+                # nothing to plot (e.g. every address is shorter than n tokens)
+                gr.Markdown(f"No {n_grams}-grams found for {potus}.")
+                return
+            # join each n-gram tuple into a single label for the axis
+            labels = [" ".join(gram) for gram, _ in top_grams]
+            counts = [count for _, count in top_grams]
+            fig, ax = plt.subplots(figsize=(12, 4))
+            ax.barh(labels, counts)
+            ax.set_title(f"Top 20 {n_grams}-grams")
+            # horizontal bars: counts run along x, the n-grams label the y axis
+            ax.set_xlabel("Count")
+            ax.set_ylabel(f"{n_grams}-grams")
+            ax.tick_params(axis="x", labelrotation=45)
+            fig.tight_layout()
+            gr.Plot(value=fig, container=True)
+            # drop the figure from pyplot's registry so re-renders do not pile up
+            plt.close(fig)
 demo.launch()

poetry.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml ADDED Viewed

	@@ -0,0 +1,19 @@

+[tool.poetry]
+name = "sotu-analysis"
+version = "0.1.0"
+description = ""
+authors = ["jsulz <j.sulzdorf@gmail.com>"]
+readme = "README.md"
+[tool.poetry.dependencies]
+python = "^3.12"
+gradio = "^4.42.0"
+datasets = "^2.21.0"
+pandas = "^2.2.2"
+nltk = "^3.9.1"
+plotly = "^5.23.0"
+matplotlib = "^3.9.2"
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"

requirements.txt CHANGED Viewed

	@@ -1 +1,75 @@
1	- ~~nltk~~==3.9.1

+aiofiles==23.2.1
+aiohappyeyeballs==2.4.0
+aiohttp==3.10.5
+aiosignal==1.3.1
+annotated-types==0.7.0
+anyio==4.4.0
+attrs==24.2.0
+certifi==2024.7.4
+charset-normalizer==3.3.2
+click==8.1.7
+colorama==0.4.6
+contourpy==1.3.0
+cycler==0.12.1
+datasets==2.21.0
+dill==0.3.8
+fastapi==0.112.2
+ffmpy==0.4.0
+filelock==3.15.4
+fonttools==4.53.1
+frozenlist==1.4.1
+fsspec==2024.6.1
+fsspec[http]==2024.6.1
+gradio-client==1.3.0
+gradio==4.42.0
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.2
+huggingface-hub==0.24.6
+idna==3.8
+importlib-resources==6.4.4
+jinja2==3.1.4
+joblib==1.4.2
+kiwisolver==1.4.5
+markdown-it-py==3.0.0
+markupsafe==2.1.5
+matplotlib==3.9.2
+mdurl==0.1.2
+multiprocess==0.70.16
+nltk==3.9.1
+numpy==2.1.0
+orjson==3.10.7
+packaging==24.1
+pandas==2.2.2
+pillow==10.4.0
+plotly==5.23.0
+pyarrow==17.0.0
+pydantic-core==2.20.1
+pydantic==2.8.2
+pydub==0.25.1
+pygments==2.18.0
+pyparsing==3.1.4
+python-dateutil==2.9.0.post0
+python-multipart==0.0.9
+pytz==2024.1
+pyyaml==6.0.2
+regex==2024.7.24
+requests==2.32.3
+rich==13.8.0
+ruff==0.6.2
+semantic-version==2.10.0
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+starlette==0.38.2
+tenacity==9.0.0
+tomlkit==0.12.0
+tqdm==4.66.5
+typer==0.12.5
+typing-extensions==4.12.2
+tzdata==2024.1
+urllib3==2.2.2
+uvicorn==0.30.6
+websockets==12.0
+xxhash==3.5.0
+yarl==1.9.4