jsulz HF staff committed on
Commit
68b374e
1 Parent(s): b09504e

done with most charts; just need to add words

Browse files
Files changed (4) hide show
  1. app.py +111 -33
  2. poetry.lock +0 -0
  3. pyproject.toml +19 -0
  4. requirements.txt +75 -1
app.py CHANGED
@@ -2,61 +2,139 @@ import gradio as gr
2
  from datasets import load_dataset
3
  from nltk.util import ngrams
4
  from collections import Counter
 
 
 
5
 
6
  # Load the dataset and convert it to a Pandas dataframe
7
- sotu_dataset = 'jsulz/state-of-the-union-addresses'
8
  dataset = load_dataset(sotu_dataset)
9
- df = dataset['train'].to_pandas()
10
- df['word_count'] = df['speech_html'].apply(lambda x: len(x.split()))
11
- written = df[df['categories'] == 'Written']
12
- spoken = df[df['categories'] == 'Spoken']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  # Create a Gradio interface with blocks
15
- with gr.Blocks() as demo:
16
  gr.Markdown(
17
  """
18
  # A Dashboard to Analyze the State of the Union Addresses
19
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  # get all unique president names
21
- presidents = df['potus'].unique()
22
  # convert presidents to a list
23
  presidents = presidents.tolist()
24
  # create a dropdown to select a president
25
  president = gr.Dropdown(label="Select a President", choices=["All"] + presidents)
 
26
  with gr.Row():
27
  # if president is not of type string
28
  @gr.render(inputs=president)
29
  def show_text(potus):
30
- if potus is not None:
31
- gr.Markdown(f"{potus} was the first president of the United States.")
32
-
 
 
 
33
  @gr.render(inputs=president)
34
  def word_length_bar(potus):
35
  # calculate the total number of words in the speech_html column and add it to a new column
36
  # if the president is "All", show the word count for all presidents
37
- if potus == "All":
38
- gr.BarPlot(df, x="date", y="word_count", title="Total Number of Words in the Speeches")
39
- else:
40
- # if the president is not "All", show the word count for the selected president
41
- gr.BarPlot(df[df['potus'] == potus], x="date", y="word_count", title="Total Number of Words in the Speeches")
 
 
 
 
42
  with gr.Row():
43
 
44
- @gr.render(inputs=president)
45
- def ngram_bar(potus):
46
- # create a Counter object from the trigrams
47
- potus_df = df[df["potus"] == potus]
48
- trigrams = (
49
- potus_df["tokens-nostop"].apply(lambda x: list(ngrams(x, 3))).apply(Counter).sum()
50
- )
51
- # get the most common trigrams
52
- common_trigrams = trigrams.most_common(20)
53
- # unzip the list of tuples and plot the trigrams and counts as a bar chart
54
- trigrams, counts = zip(*common_trigrams)
55
- # join the trigrams into a single string
56
- trigrams = [" ".join(trigram) for trigram in trigrams]
57
- # create a dataframe from the trigrams and counts
58
- trigrams_df = pd.DataFrame({"trigrams": trigrams, "counts": counts})
59
- # plot the trigrams and counts as a bar chart
60
- gr.BarPlot(trigrams_df, x="trigrams", y="counts", title="Most Common Trigrams")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  demo.launch()
 
2
  from datasets import load_dataset
3
  from nltk.util import ngrams
4
  from collections import Counter
5
+ import pandas as pd
6
+ import plotly.express as px
7
+ import matplotlib.pyplot as plt
8
 
9
  # Load the dataset and convert it to a Pandas dataframe
10
+ sotu_dataset = "jsulz/state-of-the-union-addresses"
11
  dataset = load_dataset(sotu_dataset)
12
+ df = dataset["train"].to_pandas()
13
+ # decode the tokens-nostop column from a byte array to a list of strings
14
+ df["tokens-nostop"] = df["tokens-nostop"].apply(
15
+ lambda x: x.decode("utf-8")
16
+ .replace('"', "")
17
+ .replace("[", "")
18
+ .replace("]", "")
19
+ .split(",")
20
+ )
21
+ df["word_count"] = df["speech_html"].apply(lambda x: len(x.split()))
22
+ # calculate the automated readability index (ARI) score for each address
23
+ # automated readability index = 4.71 * (characters/words) + 0.5 * (words/sentences) - 21.43
24
+ df["ari"] = df["no-contractions"].apply(
25
+ lambda x: (4.71 * (len(x.replace(" ", "")) / len(x.split())))
26
+ + (0.5 * (len(x.split()) / len(x.split("."))))
27
+ - 21.43
28
+ )
29
+
30
+ written = df[df["categories"] == "Written"]
31
+ spoken = df[df["categories"] == "Spoken"]
32
 
33
  # Create a Gradio interface with blocks
34
+ with gr.Blocks() as demo:
35
  gr.Markdown(
36
  """
37
  # A Dashboard to Analyze the State of the Union Addresses
38
+ """
39
+ )
40
+ gr.BarPlot(
41
+ df,
42
+ x="date",
43
+ y="word_count",
44
+ title="Total Number of Words in the Speeches",
45
+ color="categories",
46
+ )
47
+ # group by date, president, and category, then calculate the average word count for each group
48
+ avg_word_count = (
49
+ df.groupby(["date", "potus", "categories"])["word_count"].mean().reset_index()
50
+ )
51
+ # create a bar chart
52
+ gr.BarPlot(
53
+ avg_word_count,
54
+ x="potus",
55
+ y="word_count",
56
+ title="Average Number of Words in the Speeches",
57
+ color="categories",
58
+ x_label_angle=-45,
59
+ height=400,
60
+ min_width=160,
61
+ fill_height=True,
62
+ container=True,
63
+ scale=2,
64
+ )
65
+ with gr.Row():
66
+ ari = df[["potus", "date", "ari", "categories"]]
67
+ gr.LinePlot(
68
+ ari,
69
+ x="date",
70
+ y="ari",
71
+ title="Automated Readability Index",
72
+ )
73
  # get all unique president names
74
+ presidents = df["potus"].unique()
75
  # convert presidents to a list
76
  presidents = presidents.tolist()
77
  # create a dropdown to select a president
78
  president = gr.Dropdown(label="Select a President", choices=["All"] + presidents)
79
+ grams = gr.Slider(minimum=1, maximum=4, step=1, label="N-grams", interactive=True)
80
  with gr.Row():
81
  # if president is not of type string
82
  @gr.render(inputs=president)
83
  def show_text(potus):
84
+ if potus != "All" and potus is not None:
85
+ ari = df[df["potus"] == potus][
86
+ ["date", "categories", "word_count", "ari"]
87
+ ]
88
+ gr.DataFrame(ari, height=200)
89
+
90
  @gr.render(inputs=president)
91
  def word_length_bar(potus):
92
  # calculate the total number of words in the speech_html column and add it to a new column
93
  # if the president is "All", show the word count for all presidents
94
+ # if the president is not "All", show the word count for the selected president
95
+ if potus != "All" and potus is not None:
96
+ gr.LinePlot(
97
+ df[df["potus"] == potus],
98
+ x="date",
99
+ y="word_count",
100
+ title="Total Number of Words in the Speeches",
101
+ )
102
+
103
  with gr.Row():
104
 
105
+ @gr.render(inputs=[president, grams])
106
+ def ngram_bar(potus, n_grams):
107
+ if potus != "All" and potus is not None:
108
+ if type(n_grams) is not int:
109
+ n_grams = 1
110
+ print(n_grams)
111
+ # create a Counter object from the trigrams
112
+ potus_df = df[df["potus"] == potus]
113
+ # build the n-grams from each speech's tokens-nostop list and sum the per-speech counts
114
+ trigrams = (
115
+ potus_df["tokens-nostop"]
116
+ .apply(lambda x: list(ngrams(x, n_grams)))
117
+ .apply(Counter)
118
+ .sum()
119
+ )
120
+ # get the most common trigrams
121
+ common_trigrams = trigrams.most_common(20)
122
+ # unzip the list of tuples and plot the trigrams and counts as a bar chart
123
+ trigrams, counts = zip(*common_trigrams)
124
+ # join the trigrams into a single string
125
+ trigrams = [" ".join(trigram) for trigram in trigrams]
126
+ # create a dataframe from the trigrams and counts
127
+ trigrams_df = pd.DataFrame({"trigrams": trigrams, "counts": counts})
128
+ # plot the trigrams and counts as a bar chart from matplotlib
129
+ fig, ax = plt.subplots(figsize=(12, 4))
130
+ ax.barh(trigrams_df["trigrams"], trigrams_df["counts"])
131
+ ax.set_title("Top 20 Trigrams")
132
+ ax.set_ylabel("Count")
133
+ ax.set_xlabel("Trigrams")
134
+ plt.xticks(rotation=45)
135
+ # make it tight layout
136
+ plt.tight_layout()
137
+ gr.Plot(value=fig, container=True)
138
+
139
 
140
  demo.launch()
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "sotu-analysis"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = ["jsulz <j.sulzdorf@gmail.com>"]
6
+ readme = "README.md"
7
+
8
+ [tool.poetry.dependencies]
9
+ python = "^3.12"
10
+ gradio = "^4.42.0"
11
+ datasets = "^2.21.0"
12
+ pandas = "^2.2.2"
13
+ nltk = "^3.9.1"
14
+ plotly = "^5.23.0"
15
+ matplotlib = "^3.9.2"
16
+
17
+ [build-system]
18
+ requires = ["poetry-core"]
19
+ build-backend = "poetry.core.masonry.api"
requirements.txt CHANGED
@@ -1 +1,75 @@
1
- nltk==3.9.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ aiohappyeyeballs==2.4.0
3
+ aiohttp==3.10.5
4
+ aiosignal==1.3.1
5
+ annotated-types==0.7.0
6
+ anyio==4.4.0
7
+ attrs==24.2.0
8
+ certifi==2024.7.4
9
+ charset-normalizer==3.3.2
10
+ click==8.1.7
11
+ colorama==0.4.6
12
+ contourpy==1.3.0
13
+ cycler==0.12.1
14
+ datasets==2.21.0
15
+ dill==0.3.8
16
+ fastapi==0.112.2
17
+ ffmpy==0.4.0
18
+ filelock==3.15.4
19
+ fonttools==4.53.1
20
+ frozenlist==1.4.1
21
+ fsspec==2024.6.1
22
+ fsspec[http]==2024.6.1
23
+ gradio-client==1.3.0
24
+ gradio==4.42.0
25
+ h11==0.14.0
26
+ httpcore==1.0.5
27
+ httpx==0.27.2
28
+ huggingface-hub==0.24.6
29
+ idna==3.8
30
+ importlib-resources==6.4.4
31
+ jinja2==3.1.4
32
+ joblib==1.4.2
33
+ kiwisolver==1.4.5
34
+ markdown-it-py==3.0.0
35
+ markupsafe==2.1.5
36
+ matplotlib==3.9.2
37
+ mdurl==0.1.2
38
+ multiprocess==0.70.16
39
+ nltk==3.9.1
40
+ numpy==2.1.0
41
+ orjson==3.10.7
42
+ packaging==24.1
43
+ pandas==2.2.2
44
+ pillow==10.4.0
45
+ plotly==5.23.0
46
+ pyarrow==17.0.0
47
+ pydantic-core==2.20.1
48
+ pydantic==2.8.2
49
+ pydub==0.25.1
50
+ pygments==2.18.0
51
+ pyparsing==3.1.4
52
+ python-dateutil==2.9.0.post0
53
+ python-multipart==0.0.9
54
+ pytz==2024.1
55
+ pyyaml==6.0.2
56
+ regex==2024.7.24
57
+ requests==2.32.3
58
+ rich==13.8.0
59
+ ruff==0.6.2
60
+ semantic-version==2.10.0
61
+ shellingham==1.5.4
62
+ six==1.16.0
63
+ sniffio==1.3.1
64
+ starlette==0.38.2
65
+ tenacity==9.0.0
66
+ tomlkit==0.12.0
67
+ tqdm==4.66.5
68
+ typer==0.12.5
69
+ typing-extensions==4.12.2
70
+ tzdata==2024.1
71
+ urllib3==2.2.2
72
+ uvicorn==0.30.6
73
+ websockets==12.0
74
+ xxhash==3.5.0
75
+ yarl==1.9.4