Mark Febrizio committed
Commit fe4f734
Parent(s): a0ce115

Documentation (#24)

* Update significant.py
* docstrings for grouping
* Update plotting.py: docstrings and remove inoperable code
* docstrings
* Update dependabot.yml
* Update significant.py: bugfix where clean_cols was still being passed even after being removed
* Update test_get_data.py: update test end date
* Update README.md: add summary of potential uses
* Update app.py: adjust sig tooltip language

Files changed:
- .github/dependabot.yml +1 -2
- README.md +11 -0
- app.py +3 -3
- cra_window_rules.py +12 -1
- modules/grouping.py +45 -15
- modules/plotting.py +30 -18
- modules/search_columns.py +8 -6
- modules/significant.py +53 -20
- modules/utils.py +3 -0
- tests/test_get_data.py +2 -2
.github/dependabot.yml
CHANGED

@@ -8,9 +8,8 @@ updates:
   - package-ecosystem: "pip"
     directory: "/" # Location of package manifests
     schedule:
-      interval: "
+      interval: "monthly"
     allow:
-      - dependency-name: "faicons"
       - dependency-name: "fr-toolbelt"
       - dependency-name: "pandas"
       - dependency-name: "plotnine"

README.md
CHANGED

@@ -20,3 +20,14 @@ Dashboard app tracking rules falling within the Congressional Review Act (CRA) window
 **[Link to HuggingFace Space that hosts the dashboard app.](https://huggingface.co/spaces/regulatorystudies/cra-window-rules)**
 
 Developed by the [GW Regulatory Studies Center](https://go.gwu.edu/regstudies).
+
+## Potential Uses
+
+The CRA lookback window is a hot topic going into a presidential election year. One natural question building on the opening of the window is: *what rules might fall into the window and could be subject to the CRA next year?*
+
+This information may be valuable for different stakeholders in different ways:
+
+1. Industry observers and other regulated entities may want to be aware of what rules might or might not be durable given an administration change;
+2. Advocacy groups might want to know how to advocate for or against such policies;
+3. Federal policymakers can have a better sense of what policies may or may not carry over given a change in administration;
+4. Journalists may want to focus on what highly salient policies might be reversed depending on the results of the election. They also may want to know more broadly how many rules fall into the window and from which agencies.

app.py
CHANGED

@@ -18,7 +18,7 @@ from modules import (
     AGENCIES,
     groupby_agency,
     groupby_date,
-    add_weeks_to_data,
+    add_week_info_to_data,
     pad_missing_dates,
     plot_agency,
     plot_tf,
@@ -85,7 +85,7 @@ with ui.sidebar(open={"desktop": "open", "mobile": "closed"}, fg="#033C5A"):
 
     with ui.tooltip(placement="right", id="sig_tooltip"):
         ui.input_select("menu_significant", "Select rule significance", choices=["all", "3f1-significant", "other-significant"], selected="all", multiple=True, size=3)
-        "Rule significance as defined in Executive Order 12866."
+        "Rule significance as defined in Executive Order 12866, as amended by Executive Order 14094."
 
     with ui.tooltip(placement="right", id="agency_tooltip"):
         ui.input_select("menu_agency", "Select agencies", choices=["all"] + AGENCIES, selected=["all"], multiple=True, size=6)
@@ -348,7 +348,7 @@ def grouped_df_day():
 @reactive.calc
 def grouped_df_week():
     filt_df = filter_significance()
-    filt_df = add_weeks_to_data(filt_df)
+    filt_df = add_week_info_to_data(filt_df)
     try:
         grouped = groupby_date(filt_df, group_col=("week_number", "week_of"), significant=GET_SIGNIFICANT)
         grouped = pad_missing_dates(
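
A note on the tooltip hunk above: the bare string after `ui.input_select(...)` is not dead code. In Shiny for Python's express mode (which the `with`-block style here suggests), the first element inside `with ui.tooltip(...)` is the trigger and subsequent children become the tooltip body. A minimal sketch, assuming the app's `ui` is `shiny.express.ui` and the file is launched with `shiny run`:

```python
from shiny.express import ui

# The input_select is the tooltip's trigger; the bare string that follows
# becomes the tooltip body when the file runs in express mode.
with ui.tooltip(placement="right", id="sig_tooltip"):
    ui.input_select(
        "menu_significant",
        "Select rule significance",
        choices=["all", "3f1-significant", "other-significant"],
        selected="all",
        multiple=True,
        size=3,
    )
    "Rule significance as defined in Executive Order 12866, as amended by Executive Order 14094."
```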
cra_window_rules.py
CHANGED

@@ -13,6 +13,15 @@ from modules import (
 
 
 def save_csv(path: Path, df_all: DataFrame, df_agency: DataFrame, df_ym: DataFrame, transition_year: int):
+    """Save output as CSV files.
+
+    Args:
+        path (Path): Save data here.
+        df_all (DataFrame): Data at the rule level.
+        df_agency (DataFrame): Data grouped by agency.
+        df_ym (DataFrame): Data grouped by publication year and month.
+        transition_year (int): Presidential transition year.
+    """
     files = (
         f"rules_{transition_year - 1}_{transition_year}.csv",
         f"rules_by_agency_{transition_year - 1}_{transition_year}.csv",
@@ -23,7 +32,9 @@ def save_csv(path: Path, df_all: DataFrame, df_agency: DataFrame, df_ym: DataFrame, transition_year: int):
         data.to_csv(path / file, index=False)
 
 
-def main(start_date, save_data: bool = True, path: Path | None = None, metadata: dict | None = None, significant: bool = True):
+def main(start_date: str, save_data: bool = True, path: Path | None = None, metadata: dict | None = None, significant: bool = True):
+    """Retrieve rules in CRA window and save resulting data.
+    """
     if date.fromisoformat(start_date) < date(2023, 4, 6):
         significant = False
     date_range = get_date_range(start_date)
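
For context, a hypothetical invocation of the new `main` signature (the date and path below are illustrative, not from the commit). The `date(2023, 4, 6)` guard presumably reflects the signing date of Executive Order 14094, so runs starting before it skip significance data:

```python
from pathlib import Path

from cra_window_rules import main  # assumes the script is importable from the repo root

# Start dates before 2023-04-06 force significant=False inside main();
# this illustrative run keeps significance data enabled.
main("2024-08-01", save_data=True, path=Path("data"))
```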
modules/grouping.py
CHANGED

@@ -87,7 +87,7 @@ def _get_weeks(dates: list[date], end_date: date | None = None, **kwargs) -> list:
     return results
 
 
-def add_weeks_to_data(df: DataFrame, date_column: str = "publication_date", new_columns: tuple[str] = ("week_number", "week_of")):
+def add_week_info_to_data(df: DataFrame, date_column: str = "publication_date", new_columns: tuple[str] = ("week_number", "week_of")):
     """Add week number and week start date to input data.
 
     Args:
@@ -108,7 +108,8 @@ def add_weeks_to_data(df: DataFrame, date_column: str = "publication_date", new_columns: tuple[str] = ("week_number", "week_of")):
 
 
 def _pad_missing_weeks(timeframe_list: list[date], **kwargs):
-
+    """Pad dataframe with weeks missing from retrieved data (i.e., weeks without qualifying rule data).
+    """
     # get the start date for the first week
     first_week_start = _get_first_week_start(timeframe_list)
 
@@ -117,7 +118,8 @@ def _pad_missing_weeks(timeframe_list: list[date], **kwargs):
 
 
 def _pad_missing_days(timeframe_list: list[date], end_date: date | None = None):
-
+    """Pad dataframe with days missing from retrieved data (i.e., days without qualifying rule data).
+    """
     start_date = min(timeframe_list)
     if end_date is None:
         end_date = date.today()
@@ -132,7 +134,20 @@ def _pad_missing_days(timeframe_list: list[date], end_date: date | None = None):
 
 
 def pad_missing_dates(df: DataFrame, pad_column: str, how: str, fill_padded_values: dict | None = None, **kwargs):
+    """Add missing dates (either weeks or days) to the dataset.
 
+    Args:
+        df (DataFrame): Input data.
+        pad_column (str): Date column to pad.
+        how (str): Whether to pad by "days" or "weeks".
+        fill_padded_values (dict | None, optional): Dictionary of columns and values to fill for padded observations (e.g., {"column": 0}). Defaults to None.
+
+    Raises:
+        ValueError: Must pass 'days' or 'weeks' to parameter 'how'.
+
+    Returns:
+        DataFrame: Padded data.
+    """
     df_copy = df.copy()
     timeframe_list = [d.date() if isinstance(d, (Timestamp, datetime)) else d for d in df_copy[pad_column].to_list()]
     df_copy = df_copy.astype({pad_column: "object"})
@@ -148,7 +163,7 @@ def pad_missing_dates(df: DataFrame, pad_column: str, how: str, fill_padded_values: dict | None = None, **kwargs):
     elif how == "weeks":
         week_numbers, padded_timeframes = zip(*_pad_missing_weeks(timeframe_list, **kwargs))
     else:
-        raise ValueError
+        raise ValueError("Must pass 'days' or 'weeks' to parameter 'how'.")
 
     # incorporate extended dates into dataframe
     df_merge = DataFrame({pad_column: padded_timeframes})
@@ -174,19 +189,19 @@ def groupby_agency(
     metadata: dict | None = None,
     metadata_value: str = "acronym",
 ):
-    """
+    """Group data by agencies and aggregate the values.
 
     Args:
-        df (DataFrame):
-        group_col (str, optional):
-        value_col (str, optional):
-        aggfunc (str, optional):
-        significant (bool, optional):
-        metadata (dict | None, optional):
-        metadata_value (str, optional):
+        df (DataFrame): Input data.
+        group_col (str, optional): Column to group by. Defaults to "parent_slug".
+        value_col (str, optional): Column for values for grouping and aggregation. Defaults to "document_number".
+        aggfunc (str, optional): Aggregation function. Defaults to "count".
+        significant (bool, optional): Whether to include significance data in values. Defaults to True.
+        metadata (dict | None, optional): Agency metadata. Defaults to None.
+        metadata_value (str, optional): Metadata value to add to output data. Defaults to "acronym".
 
     Returns:
-
+        DataFrame: Grouped and aggregated data.
     """
     aggfunc_dict = {value_col: aggfunc, }
     if significant:
@@ -225,12 +240,27 @@ def groupby_date(
     aggfunc: str = "count",
     significant: bool = True
 ):
+    """Group data by a given date frequency and aggregate the values.
+
+    Args:
+        df (DataFrame): Input data.
+        group_col (str | tuple | list, optional): Columns to group by. Defaults to ("publication_year", "publication_month", ).
+        value_col (str, optional): Column for values for grouping and aggregation. Defaults to "document_number".
+        aggfunc (str, optional): Aggregation function. Defaults to "count".
+        significant (bool, optional): Whether to include significance data in values. Defaults to True.
+
+    Raises:
+        TypeError: Parameter 'group_col' must be type `str`, `list`, or `tuple`.
+
+    Returns:
+        DataFrame: Grouped and aggregated data.
+    """
     if isinstance(group_col, str):
         group_col = [group_col]
     elif isinstance(group_col, (list, tuple)):
         group_col = list(group_col)
     else:
-        raise TypeError
+        raise TypeError("Parameter 'group_col' must be type `str`, `list`, or `tuple`.")
 
     aggfunc_dict = {value_col: aggfunc, }
     if significant:
@@ -263,7 +293,7 @@ if __name__ == "__main__":
     df_a = pad_missing_dates(df, "dates", "days", fill_padded_values={"values": 0})
     print(df_a.head(10))
 
-    df = add_weeks_to_data(df, date_column="dates")
+    df = add_week_info_to_data(df, date_column="dates")
     print(df.head(10))
 
     grouped = groupby_date(df, group_col=("week_number", "week_of"), value_col="values", significant=False)
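
The bodies of `add_week_info_to_data` and the padding helpers fall mostly outside this diff, so here is a self-contained pandas sketch of the same two ideas: derive a week number plus a week-start ("week_of") column, then reindex on a complete weekly range so weeks with zero rules still appear. It assumes ISO-style weeks beginning on Monday, which may not match `_get_first_week_start` exactly:

```python
import pandas as pd

# Toy data: two rules published two weeks apart, with an empty week between.
df = pd.DataFrame({
    "publication_date": pd.to_datetime(["2024-05-28", "2024-06-10"]),
    "rules": [1, 1],
})

# Week number and week start, analogous to what add_week_info_to_data adds.
df["week_number"] = df["publication_date"].dt.isocalendar().week
df["week_of"] = (
    df["publication_date"] - pd.to_timedelta(df["publication_date"].dt.weekday, unit="D")
).dt.date

# Padding: reindex on a complete weekly range so weeks without rules show
# up as explicit zero rows (the role pad_missing_dates plays above).
weekly = df.groupby("week_of")["rules"].sum()
full_range = pd.date_range(min(weekly.index), max(weekly.index), freq="W-MON").date
print(weekly.reindex(full_range, fill_value=0))
```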
modules/plotting.py
CHANGED

@@ -22,7 +22,8 @@ class DataAvailabilityError(Exception):
 
 
 def plot_NA(placeholder_text: str = "Not enough data available to visualize.", placeholder_size: int = 14):
-    """Placeholder plot for when there is not enough data available to visualize."""
+    """Placeholder plot for when there is not enough data available to visualize.
+    """
     return (
         ggplot()
         + annotate("text", x=0, y=0, label=placeholder_text, size=placeholder_size)
@@ -32,7 +33,8 @@ def plot_NA(placeholder_text: str = "Not enough data available to visualize.", placeholder_size: int = 14):
 
 
 def generate_rule_axis_label(rule_types: list | None = None):
-    """Generate axis label for rules, accounting for rule type ("all", "3f1", or "other")."""
+    """Generate axis label for rules, accounting for rule type ("all", "3f1-significant", or "other-significant").
+    """
     categories = ""
     if (rule_types is None) or ("all" in rule_types):
         pass
@@ -52,7 +54,9 @@ def plot_agency(df, group_col = "acronym", value_col = "rules", color="#033C5A",
         df (DataFrame): Input data.
         group_col (str, optional): Column on which the data are grouped. Defaults to "acronym".
         value_col (str, optional): Column of values to be plotted. Defaults to "rules".
-
+        color (str, optional): Color of values in plot. Defaults to "#033C5A" ([GW Blue](https://communications.gwu.edu/visual-identity/color-palette)).
+        rule_types (list | None, optional): One or more rule types to include in plot. Accepts "all", "3f1-significant", or "other-significant". Defaults to None.
+
     Returns:
         ggplot: Plotted data.
     """
@@ -71,7 +75,6 @@ def plot_agency(df, group_col = "acronym", value_col = "rules", color="#033C5A",
         + labs(y=y_lab, x="", title="Rules Published by Agency")
         + theme_light()
     )
-
     return plot
 
 
@@ -89,7 +92,10 @@ def plot_month(
         df (DataFrame): Input data.
         group_cols (tuple, optional): Columns on which the data are grouped. Defaults to ("publication_year", "publication_month").
         value_col (str, optional): Column of values to be plotted. Defaults to "rules".
-
+        color (str, optional): Color of values in plot. Defaults to "#033C5A" ([GW Blue](https://communications.gwu.edu/visual-identity/color-palette)).
+        title (str | None, optional): Plot title. Defaults to None.
+        y_lab (str, optional): Plot y label. Defaults to "" (empty string).
+
     Returns:
         ggplot: Plotted data.
     """
@@ -123,8 +129,11 @@ def plot_day(
 
     Args:
         df (DataFrame): Input data.
-        group_col (str, optional): Column on which the data are grouped. Defaults to
+        group_col (str, optional): Column on which the data are grouped. Defaults to "publication_date".
         value_col (str, optional): Column of values to be plotted. Defaults to "rules".
+        color (str, optional): Color of values in plot. Defaults to "#033C5A" ([GW Blue](https://communications.gwu.edu/visual-identity/color-palette)).
+        title (str | None, optional): Plot title. Defaults to None.
+        y_lab (str, optional): Plot y label. Defaults to "" (empty string).
 
     Returns:
         ggplot: Plotted data.
@@ -167,6 +176,19 @@ def plot_week(
     y_lab: str = "",
     show_significant: bool = False,
 ):
+    """Plot rules by week.
+
+    Args:
+        df (DataFrame): Input data.
+        group_col (str, optional): Column on which the data are grouped. Defaults to "week_of".
+        value_col (str, optional): Column of values to be plotted. Defaults to "rules".
+        color (str, optional): Color of values in plot. Defaults to "#033C5A" ([GW Blue](https://communications.gwu.edu/visual-identity/color-palette)).
+        title (str | None, optional): Plot title. Defaults to None.
+        y_lab (str, optional): Plot y label. Defaults to "" (empty string).
+
+    Returns:
+        ggplot: Plotted data.
+    """
     max_value = df.loc[:, value_col].max()
 
     date_values = df[group_col].to_list()
@@ -199,26 +221,16 @@ def plot_week(
         + labs(y=y_lab, x="", title=title)
         + theme_light()
     )
-    if show_significant:
-        # trying to add significant rules as additional lines
-        # but getting "TypeError: Discrete value supplied to continuous scale"
-        # for 3f1 sig rules
-        df = df.astype({"3f1_significant": "float"})
-        plot = (
-            plot
-            #+ geom_line(aes(x=group_col, y="3f1_significant"), inherit_aes=False, group=1, color="#AA9868", linetype="dotted")
-            + geom_line(aes(x=group_col, y="other_significant"), inherit_aes=False, group=1, color="#0190DB", linetype="dashed")
-            #+ guide_legend()
-        )
     return plot
 
 
-def plot_tf(df: DataFrame, frequency: str, rule_types:
+def plot_tf(df: DataFrame, frequency: str, rule_types: list | None = None, **kwargs) -> ggplot:
     """Plot rules over time by given frequency.
 
     Args:
         df (DataFrame): Input data.
         frequency (str): Frequency of time for aggregating rules. Accepts "monthly" or "daily".
+        rule_types (list | None, optional): One or more rule types to include in plot. Accepts "all", "3f1-significant", or "other-significant". Defaults to None.
 
     Raises:
         ValueError: Frequency parameter received invalid value.
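
The plotting functions share one plotnine recipe: a grouped dataframe, a geometry, the GW Blue fill, and `theme_light()`. A runnable toy example in that spirit (the agency counts are invented, and this is a sketch, not the module's exact `plot_agency` body):

```python
import pandas as pd
from plotnine import aes, geom_col, ggplot, labs, theme_light

# Invented agency counts, shaped like the output of groupby_agency.
df = pd.DataFrame({"acronym": ["EPA", "DOT", "HHS"], "rules": [12, 7, 9]})

plot = (
    ggplot(df, aes(x="acronym", y="rules"))
    + geom_col(fill="#033C5A")  # GW Blue, matching the docstrings above
    + labs(y="Rules", x="", title="Rules Published by Agency")
    + theme_light()
)
plot.save("rules_by_agency.png")  # in a notebook, evaluating `plot` draws it
```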
modules/search_columns.py
CHANGED

@@ -11,12 +11,14 @@ class SearchError(Exception):
 
 
 # Defining a function to search for string patterns within dataframe columns
-def search_columns(
-
-
-
-
-
+def search_columns(
+    df: DataFrame,
+    patterns: list,
+    columns: list,
+    return_as: str = "indicator_column",
+    return_column: str = "indicator",
+    re_flags = re.I | re.X
+):
     """Search columns for string patterns within dataframe columns.
 
     Args:
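
Only the new signature appears in this hunk; the function body is unchanged and not shown. The following is therefore a guess at comparable behavior rather than the module's implementation: combine the patterns into one regex, test the chosen columns, and write a 0/1 indicator column. Note the new default flags, `re.I` (ignore case) and `re.X` (verbose patterns):

```python
import re

import pandas as pd

# Hypothetical data; flag rows where any column matches any pattern.
df = pd.DataFrame({
    "title": ["Endangered Species Rule", "Tax Guidance"],
    "abstract": ["habitat protections", "withholding tables"],
})
patterns = ["species", "habitat"]
flags = re.I | re.X  # the signature's new defaults

regex = "|".join(patterns)
matched = df[["title", "abstract"]].apply(
    lambda col: col.str.contains(regex, flags=flags, regex=True)
)
df["indicator"] = matched.any(axis=1).astype(int)
print(df)
```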
modules/significant.py
CHANGED

@@ -21,7 +21,17 @@ def read_csv_data(
         "Major"
     ),
     url: str = r"https://raw.githubusercontent.com/regulatorystudies/Reg-Stats/main/data/fr_tracking/fr_tracking.csv"
-):
+) -> tuple[pd_DataFrame | None, list, date]:
+    """Read CSV data from GitHub file.
+
+    Args:
+        start_date (date | str): Start date of read data.
+        retrieve_columns (list | tuple, optional): Get select columns. Defaults to ( "publication_date", "document_number", "significant", "econ_significant", "3(f)(1) significant", "Major" ).
+        url (str, optional): URL where data are located. Defaults to r"https://raw.githubusercontent.com/regulatorystudies/Reg-Stats/main/data/fr_tracking/fr_tracking.csv".
+
+    Returns:
+        tuple: Data, column names, max date in dataset
+    """
     # handle dates formatted as str
     if isinstance(start_date, str):
         start_date = date.fromisoformat(start_date)
@@ -56,13 +66,22 @@ def read_csv_data(
     return None, cols, max_date
 
 
-def clean_data(df: pl.DataFrame,
-
-
-
-
-
-
+def clean_data(
+    df: pl.DataFrame,
+    document_numbers: list,
+    *,
+    return_optimized_plan: bool = False
+):
+    """Clean data.
+
+    Args:
+        df (pl.DataFrame): Input polars dataframe.
+        document_numbers (list): List of document numbers to keep.
+        return_optimized_plan (bool, optional): Return optimized query plan rather than dataframe. Defaults to False.
+
+    Returns:
+        DataFrame | str: Cleaned data (or string representation of the query plan)
+    """
     # start a lazy query
     lf = (
         df.lazy()
@@ -70,10 +89,6 @@ def clean_data(df: pl.DataFrame,
         .with_columns(pl.col("document_number").str.strip_chars())
         # only keep document_numbers from input
        .filter(pl.col("document_number").is_in(document_numbers))
-        # temporarily format "not available" data (input as dots)
-        #.with_columns(pl.col(c for c in clean_columns if c != "document_number").str.replace_all(".", f"{format_not_available_values}", literal=True))
-        # cast to nullable int dtype
-        #.with_columns(pl.col(c for c in clean_columns if c != "document_number").cast(pl.Int64, strict=False))
     )
 
     # return optimized query plan instead of df
@@ -84,22 +99,40 @@ def clean_data(df: pl.DataFrame,
     return lf.collect()
 
 
-def merge_with_api_results(
-
-
-
+def merge_with_api_results(
+    pd_df: pd_DataFrame,
+    pl_df: pl.DataFrame
+):
+    """Merge significance data with FR API data.
+
+    Args:
+        pd_df (pd_DataFrame): Main dataset of FR rules.
+        pl_df (pl.DataFrame): Significance data.
+
+    Returns:
+        DataFrame: Merged data.
+    """
     main_df = pl.from_pandas(pd_df)
     df = main_df.join(pl_df, on="document_number", how="left", validate="1:1", coalesce=True)
     return df.to_pandas()
 
 
-def get_significant_info(input_df, start_date, document_numbers):
-
-
+def get_significant_info(input_df: pd_DataFrame, start_date: str, document_numbers: list):
+    """Retrieve significance information for input data.
+
+    Args:
+        input_df (pd.DataFrame): Input data.
+        start_date (str): Start date of data.
+        document_numbers (list): Documents to keep.
+
+    Returns:
+        tuple[DataFrame, datetime.date]: Data with significance information, max date in dataset
+    """
+    pl_df, _, max_date = read_csv_data(start_date)
     if pl_df is None:
         print("Failed to integrate significance tracking data with retrieved documents.")
         return input_df
-    pl_df = clean_data(pl_df, document_numbers
+    pl_df = clean_data(pl_df, document_numbers)
    pd_df = merge_with_api_results(input_df, pl_df)
     return pd_df, max_date
 
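
Together, `clean_data` and `merge_with_api_results` implement a small polars/pandas interop pipeline: build a lazy polars query (nothing executes until `.collect()`), then join the cleaned significance data back onto the pandas data. A toy, self-contained sketch of that flow (document numbers invented; the real code also passes `validate="1:1"` and `coalesce=True` to `join`):

```python
import pandas as pd
import polars as pl  # pl.from_pandas also requires pyarrow

# Toy stand-ins: API results (pandas) and significance tracking (polars).
api_df = pd.DataFrame({"document_number": ["2024-001", "2024-002"], "title": ["A", "B"]})
tracking = pl.DataFrame({"document_number": [" 2024-001 ", "2024-003"], "significant": [1, 0]})

# Lazy query mirroring clean_data: strip whitespace from the join key and
# keep only the requested documents; nothing runs until .collect().
cleaned = (
    tracking.lazy()
    .with_columns(pl.col("document_number").str.strip_chars())
    .filter(pl.col("document_number").is_in(api_df["document_number"].to_list()))
    .collect()
)

# Left join and convert back to pandas, as in merge_with_api_results.
merged = pl.from_pandas(api_df).join(cleaned, on="document_number", how="left")
print(merged.to_pandas())
```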
modules/utils.py
CHANGED

@@ -2,6 +2,9 @@ from pandas import DataFrame
 
 
 def _get_nested_metadata(metadata_key: str, metadata: dict[dict], metadata_value: str):
+    """Get nested metadata from `dict[dict, Any]` structure.
+    Returns "metadata_key" as default value.
+    """
     getter = metadata.get(metadata_key, {})
     return getter.get(metadata_value, metadata_key)
 
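
Since `_get_nested_metadata` is only two lines of logic, its fallback behavior is easiest to see by example (the agency keys below are hypothetical): a missing outer key and a missing inner value both fall back to returning `metadata_key` itself:

```python
metadata = {"treasury-department": {"acronym": "TREAS"}}

def _get_nested_metadata(metadata_key: str, metadata: dict, metadata_value: str):
    getter = metadata.get(metadata_key, {})
    return getter.get(metadata_value, metadata_key)

print(_get_nested_metadata("treasury-department", metadata, "acronym"))  # TREAS
print(_get_nested_metadata("gw-center", metadata, "acronym"))            # gw-center (fallback)
```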
tests/test_get_data.py
CHANGED

@@ -6,7 +6,7 @@ from modules.get_rules_in_window import (
 )
 
 
-def test_get_date_range(start_str: str = "2024-05-01"):
+def test_get_date_range(start_str: str = "2024-05-01", end_mmdd: str = "01-03"):
 
     start_date = date.fromisoformat(start_str)
     end_year = start_date.year + 1
@@ -15,7 +15,7 @@ def test_get_date_range(start_str: str = "2024-05-01"):
     assert isinstance(dates_str, dict)
     assert (
         dates_str.get("start") == start_str
-        and dates_str.get("end") == f"{end_year}-
+        and dates_str.get("end") == f"{end_year}-{end_mmdd}"
        and dates_str.get("transition_year") == end_year
     )
 
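
The new `end_mmdd` parameter moves the expected window-end date into a keyword default. If more cases accumulate, the same check could be parametrized; the sketch below assumes `get_date_range(start_str)` returns the string dict asserted above, which the elided test lines may actually derive differently:

```python
from datetime import date

import pytest

from modules.get_rules_in_window import get_date_range  # per the test module's import block

@pytest.mark.parametrize("start_str, end_mmdd", [("2024-05-01", "01-03"), ("2023-06-15", "01-03")])
def test_get_date_range_parametrized(start_str: str, end_mmdd: str):
    end_year = date.fromisoformat(start_str).year + 1
    dates_str = get_date_range(start_str)  # assumption: returns the dict checked above
    assert dates_str.get("end") == f"{end_year}-{end_mmdd}"
```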