done with most charts; just need to add words
Files changed:
- app.py +111 -33
- poetry.lock +0 -0
- pyproject.toml +19 -0
- requirements.txt +75 -1
app.py
CHANGED
@@ -2,61 +2,139 @@ import gradio as gr
(The previous version of the file is shown in the split view with its removed lines truncated, so it is not reproduced here. The recoverable fragments show that the n-gram renderer previously took only the president dropdown as input — @gr.render(inputs=president) / def ngram_bar(potus): — and that the per-president checks were simply "if potus is not None:". The full new version of app.py follows.)
import gradio as gr
from datasets import load_dataset
from nltk.util import ngrams
from collections import Counter
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

# Load the dataset and convert it to a Pandas dataframe
sotu_dataset = "jsulz/state-of-the-union-addresses"
dataset = load_dataset(sotu_dataset)
df = dataset["train"].to_pandas()
# decode the tokens-nostop column from a byte array to a list of strings
df["tokens-nostop"] = df["tokens-nostop"].apply(
    lambda x: x.decode("utf-8")
    .replace('"', "")
    .replace("[", "")
    .replace("]", "")
    .split(",")
)
df["word_count"] = df["speech_html"].apply(lambda x: len(x.split()))
# calculate the automated readability index reading ease score for each address
# automated readability index = 4.71 * (characters/words) + 0.5 * (words/sentences) - 21.43
df["ari"] = df["no-contractions"].apply(
    lambda x: (4.71 * (len(x.replace(" ", "")) / len(x.split())))
    + (0.5 * (len(x.split()) / len(x.split("."))))
    - 21.43
)

written = df[df["categories"] == "Written"]
spoken = df[df["categories"] == "Spoken"]

# Create a Gradio interface with blocks
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # A Dashboard to Analyze the State of the Union Addresses
        """
    )
    gr.BarPlot(
        df,
        x="date",
        y="word_count",
        title="Total Number of Words in the Speeches",
        color="categories",
    )
    # group by president and category and calculate the average word count sort by date
    avg_word_count = (
        df.groupby(["date", "potus", "categories"])["word_count"].mean().reset_index()
    )
    # create a bar chart
    gr.BarPlot(
        avg_word_count,
        x="potus",
        y="word_count",
        title="Average Number of Words in the Speeches",
        color="categories",
        x_label_angle=-45,
        height=400,
        min_width=160,
        fill_height=True,
        container=True,
        scale=2,
    )
    with gr.Row():
        ari = df[["potus", "date", "ari", "categories"]]
        gr.LinePlot(
            ari,
            x="date",
            y="ari",
            title="Automated Readability Index",
        )
    # get all unique president names
    presidents = df["potus"].unique()
    # convert presidents to a list
    presidents = presidents.tolist()
    # create a dropdown to select a president
    president = gr.Dropdown(label="Select a President", choices=["All"] + presidents)
    grams = gr.Slider(minimum=1, maximum=4, step=1, label="N-grams", interactive=True)
    with gr.Row():
        # if president is not of type string
        @gr.render(inputs=president)
        def show_text(potus):
            if potus != "All" and potus is not None:
                ari = df[df["potus"] == potus][
                    ["date", "categories", "word_count", "ari"]
                ]
                gr.DataFrame(ari, height=200)

        @gr.render(inputs=president)
        def word_length_bar(potus):
            # calculate the total number of words in the speech_html column and add it to a new column
            # if the president is "All", show the word count for all presidents
            # if the president is not "All", show the word count for the selected president
            if potus != "All" and potus is not None:
                gr.LinePlot(
                    df[df["potus"] == potus],
                    x="date",
                    y="word_count",
                    title="Total Number of Words in the Speeches",
                )

    with gr.Row():

        @gr.render(inputs=[president, grams])
        def ngram_bar(potus, n_grams):
            if potus != "All" and potus is not None:
                if type(n_grams) is not int:
                    n_grams = 1
                print(n_grams)
                potus_df = df[df["potus"] == potus]
                # count the n-grams in each address and merge the per-address Counters
                trigrams = (
                    potus_df["tokens-nostop"]
                    .apply(lambda x: list(ngrams(x, n_grams)))
                    .apply(Counter)
                    .sum()
                )
                # get the most common n-grams
                common_trigrams = trigrams.most_common(20)
                # unzip the list of tuples into n-grams and counts
                trigrams, counts = zip(*common_trigrams)
                # join each n-gram's tokens into a single string
                trigrams = [" ".join(trigram) for trigram in trigrams]
                # create a dataframe from the n-grams and counts
                trigrams_df = pd.DataFrame({"trigrams": trigrams, "counts": counts})
                # plot the n-grams and counts as a horizontal bar chart with matplotlib
                fig, ax = plt.subplots(figsize=(12, 4))
                ax.barh(trigrams_df["trigrams"], trigrams_df["counts"])
                ax.set_title("Top 20 N-grams")
                ax.set_xlabel("Count")
                ax.set_ylabel("N-grams")
                plt.xticks(rotation=45)
                # use a tight layout
                plt.tight_layout()
                gr.Plot(value=fig, container=True)


demo.launch()
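Two short sketches, not part of the commit, that illustrate the logic added above.

First, the automated readability index. app.py counts characters as the non-space characters of the no-contractions text, words by whitespace splitting, and sentences by splitting on "." (a trailing period therefore adds one empty split). A minimal, self-contained illustration of the same arithmetic, using a made-up sentence:

    text = "The quick brown fox jumps over the lazy dog. It was not amused."
    characters = len(text.replace(" ", ""))  # non-space characters, punctuation included
    words = len(text.split())                # whitespace-separated tokens
    sentences = len(text.split("."))         # trailing "." yields one extra empty element
    ari = 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43
    print(round(ari, 2))

Second, the n-gram aggregation in ngram_bar. Each row of tokens-nostop becomes a Counter of n-grams, and Series.sum() merges those Counters with Counter addition, so most_common(20) ranks n-grams across all of a president's addresses. A small standalone sketch with made-up token lists:

    from collections import Counter

    import pandas as pd
    from nltk.util import ngrams

    speeches = pd.Series(
        [
            ["union", "strong", "union", "strong"],
            ["union", "strong", "economy", "growing"],
        ]
    )
    n = 2
    counts = speeches.apply(lambda toks: Counter(ngrams(toks, n))).sum()
    print(counts.most_common(3))  # top entry: (('union', 'strong'), 3)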
poetry.lock
ADDED
The diff for this file is too large to render.
See raw diff
pyproject.toml
ADDED
@@ -0,0 +1,19 @@
[tool.poetry]
name = "sotu-analysis"
version = "0.1.0"
description = ""
authors = ["jsulz <j.sulzdorf@gmail.com>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.12"
gradio = "^4.42.0"
datasets = "^2.21.0"
pandas = "^2.2.2"
nltk = "^3.9.1"
plotly = "^5.23.0"
matplotlib = "^3.9.2"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
requirements.txt
CHANGED
@@ -1 +1,75 @@
aiofiles==23.2.1
aiohappyeyeballs==2.4.0
aiohttp==3.10.5
aiosignal==1.3.1
annotated-types==0.7.0
anyio==4.4.0
attrs==24.2.0
certifi==2024.7.4
charset-normalizer==3.3.2
click==8.1.7
colorama==0.4.6
contourpy==1.3.0
cycler==0.12.1
datasets==2.21.0
dill==0.3.8
fastapi==0.112.2
ffmpy==0.4.0
filelock==3.15.4
fonttools==4.53.1
frozenlist==1.4.1
fsspec==2024.6.1
fsspec[http]==2024.6.1
gradio-client==1.3.0
gradio==4.42.0
h11==0.14.0
httpcore==1.0.5
httpx==0.27.2
huggingface-hub==0.24.6
idna==3.8
importlib-resources==6.4.4
jinja2==3.1.4
joblib==1.4.2
kiwisolver==1.4.5
markdown-it-py==3.0.0
markupsafe==2.1.5
matplotlib==3.9.2
mdurl==0.1.2
multiprocess==0.70.16
nltk==3.9.1
numpy==2.1.0
orjson==3.10.7
packaging==24.1
pandas==2.2.2
pillow==10.4.0
plotly==5.23.0
pyarrow==17.0.0
pydantic-core==2.20.1
pydantic==2.8.2
pydub==0.25.1
pygments==2.18.0
pyparsing==3.1.4
python-dateutil==2.9.0.post0
python-multipart==0.0.9
pytz==2024.1
pyyaml==6.0.2
regex==2024.7.24
requests==2.32.3
rich==13.8.0
ruff==0.6.2
semantic-version==2.10.0
shellingham==1.5.4
six==1.16.0
sniffio==1.3.1
starlette==0.38.2
tenacity==9.0.0
tomlkit==0.12.0
tqdm==4.66.5
typer==0.12.5
typing-extensions==4.12.2
tzdata==2024.1
urllib3==2.2.2
uvicorn==0.30.6
websockets==12.0
xxhash==3.5.0
yarl==1.9.4