Mark Febrizio committed
Commit fe4f734
Parent: a0ce115

Documentation (#24)


* Update significant.py

* docstrings for grouping

* Update plotting.py

docstrings and remove inoperable code

* docstrings

* Update dependabot.yml

* Update significant.py

bug fix: clean_cols was still being passed even after being removed

* Update test_get_data.py

update test end date

* Update README.md

Add summary of potential uses

* Update app.py

adjust sig tooltip language

.github/dependabot.yml CHANGED
@@ -8,9 +8,8 @@ updates:
   - package-ecosystem: "pip"
     directory: "/" # Location of package manifests
     schedule:
-      interval: "weekly"
+      interval: "monthly"
     allow:
-      - dependency-name: "faicons"
       - dependency-name: "fr-toolbelt"
       - dependency-name: "pandas"
       - dependency-name: "plotnine"
README.md CHANGED
@@ -20,3 +20,14 @@ Dashboard app tracking rules falling within the Congressional Review Act (CRA) w
 **[Link to HuggingFace Space that hosts the dashboard app.](https://huggingface.co/spaces/regulatorystudies/cra-window-rules)**
 
 Developed by the [GW Regulatory Studies Center](https://go.gwu.edu/regstudies).
+
+## Potential Uses
+
+The CRA lookback window is a hot topic going into a presidential election year. One natural question raised by the opening of the window is: *what rules might fall into the window and could be subject to the CRA next year?*
+
+This information may be valuable to different stakeholders in different ways:
+
+1. Industry observers and other regulated entities may want to know which rules might or might not be durable given a change in administration;
+2. Advocacy groups might want to know how to advocate for or against such policies;
+3. Federal policymakers can get a better sense of which policies may or may not carry over after a change in administration;
+4. Journalists may want to focus on which highly salient policies could be reversed depending on the results of the election. They also may want to know, more broadly, how many rules fall into the window and from which agencies.
app.py CHANGED
@@ -18,7 +18,7 @@ from modules import (
     AGENCIES,
     groupby_agency,
     groupby_date,
-    add_weeks_to_data,
+    add_week_info_to_data,
     pad_missing_dates,
     plot_agency,
     plot_tf,
@@ -85,7 +85,7 @@ with ui.sidebar(open={"desktop": "open", "mobile": "closed"}, fg="#033C5A"):
 
     with ui.tooltip(placement="right", id="sig_tooltip"):
         ui.input_select("menu_significant", "Select rule significance", choices=["all", "3f1-significant", "other-significant"], selected="all", multiple=True, size=3)
-        "Rule significance as defined in Executive Order 12866."
+        "Rule significance as defined in Executive Order 12866, as amended by Executive Order 14094."
 
     with ui.tooltip(placement="right", id="agency_tooltip"):
         ui.input_select("menu_agency", "Select agencies", choices=["all"] + AGENCIES, selected=["all"], multiple=True, size=6)
@@ -348,7 +348,7 @@ def grouped_df_day():
 @reactive.calc
 def grouped_df_week():
     filt_df = filter_significance()
-    filt_df = add_weeks_to_data(filt_df)
+    filt_df = add_week_info_to_data(filt_df)
     try:
         grouped = groupby_date(filt_df, group_col=("week_number", "week_of"), significant=GET_SIGNIFICANT)
         grouped = pad_missing_dates(
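Note on the rename above: `add_weeks_to_data` becomes `add_week_info_to_data` in both the import block and the weekly reactive calc. Below is a minimal, self-contained sketch of what that pipeline step does, assuming the helper derives ISO week numbers and week-start dates; that shape is inferred from the function name and the grouping columns, not taken from the module's actual code.

    from datetime import date

    import pandas as pd

    def add_week_info_to_data(df: pd.DataFrame, date_column: str = "publication_date") -> pd.DataFrame:
        # hypothetical stand-in: attach ISO week number and the Monday starting each week
        dates = pd.to_datetime(df[date_column])
        out = df.copy()
        out["week_number"] = dates.dt.isocalendar().week
        out["week_of"] = (dates - pd.to_timedelta(dates.dt.weekday, unit="D")).dt.date
        return out

    df = pd.DataFrame({
        "publication_date": [date(2024, 5, 1), date(2024, 5, 2), date(2024, 5, 9)],
        "document_number": ["2024-00001", "2024-00002", "2024-00003"],
    })
    # mirrors grouped_df_week: add week info, then group on ("week_number", "week_of")
    grouped = (
        add_week_info_to_data(df)
        .groupby(["week_number", "week_of"], as_index=False)
        .agg(rules=("document_number", "count"))
    )
    print(grouped)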
cra_window_rules.py CHANGED
@@ -13,6 +13,15 @@ from modules import (
 
 
 def save_csv(path: Path, df_all: DataFrame, df_agency: DataFrame, df_ym: DataFrame, transition_year: int):
+    """Save output as CSV files.
+
+    Args:
+        path (Path): Save data here.
+        df_all (DataFrame): Data at the rule level.
+        df_agency (DataFrame): Data grouped by agency.
+        df_ym (DataFrame): Data grouped by publication year and month.
+        transition_year (int): Presidential transition year.
+    """
     files = (
         f"rules_{transition_year - 1}_{transition_year}.csv",
         f"rules_by_agency_{transition_year - 1}_{transition_year}.csv",
@@ -23,7 +32,9 @@ def save_csv(path: Path, df_all: DataFrame, df_agency: DataFrame, df_ym: DataFra
         data.to_csv(path / file, index=False)
 
 
-def main(start_date, save_data: bool = True, path: Path | None = None, metadata: dict | None = None, significant: bool = True):
+def main(start_date: str, save_data: bool = True, path: Path | None = None, metadata: dict | None = None, significant: bool = True):
+    """Retrieve rules in CRA window and save resulting data.
+    """
     if date.fromisoformat(start_date) < date(2023, 4, 6):
         significant = False
     date_range = get_date_range(start_date)
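The new `save_csv` docstring documents a naming convention driven by `transition_year`. A quick illustration of the f-strings visible in the diff, with an example value:

    transition_year = 2025  # example value only
    for stem in ("rules", "rules_by_agency"):
        print(f"{stem}_{transition_year - 1}_{transition_year}.csv")
    # rules_2024_2025.csv
    # rules_by_agency_2024_2025.csv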
modules/grouping.py CHANGED
@@ -87,7 +87,7 @@ def _get_weeks(dates: list[date], end_date: date | None = None, **kwargs) -> lis
     return results
 
 
-def add_weeks_to_data(df: DataFrame, date_column: str = "publication_date", new_columns: tuple[str] = ("week_number", "week_of")):
+def add_week_info_to_data(df: DataFrame, date_column: str = "publication_date", new_columns: tuple[str] = ("week_number", "week_of")):
     """Add week number and week start date to input data.
 
     Args:
@@ -108,7 +108,8 @@ def add_weeks_to_data(df: DataFrame, date_column: str = "publication_date", new_
 
 
 def _pad_missing_weeks(timeframe_list: list[date], **kwargs):
-
+    """Pad dataframe with weeks missing from retrieved data (i.e., weeks without qualifying rule data).
+    """
     # get the start date for the first week
     first_week_start = _get_first_week_start(timeframe_list)
 
@@ -117,7 +118,8 @@ def _pad_missing_weeks(timeframe_list: list[date], **kwargs):
 
 
 def _pad_missing_days(timeframe_list: list[date], end_date: date | None = None):
-
+    """Pad dataframe with days missing from retrieved data (i.e., days without qualifying rule data).
+    """
     start_date = min(timeframe_list)
     if end_date is None:
         end_date = date.today()
@@ -132,7 +134,20 @@ def _pad_missing_days(timeframe_list: list[date], end_date: date | None = None):
 
 
 def pad_missing_dates(df: DataFrame, pad_column: str, how: str, fill_padded_values: dict | None = None, **kwargs):
-
+    """Add missing dates (either weeks or days) to the dataset.
+
+    Args:
+        df (DataFrame): Input data.
+        pad_column (str): Date column to pad.
+        how (str): Whether to pad by "days" or "weeks".
+        fill_padded_values (dict | None, optional): Dictionary of columns and values to fill for padded observations (e.g., {"column": 0}). Defaults to None.
+
+    Raises:
+        ValueError: Must pass 'days' or 'weeks' to parameter 'how'.
+
+    Returns:
+        DataFrame: Padded data.
+    """
     df_copy = df.copy()
     timeframe_list = [d.date() if isinstance(d, (Timestamp, datetime)) else d for d in df_copy[pad_column].to_list()]
     df_copy = df_copy.astype({pad_column: "object"})
@@ -148,7 +163,7 @@ def pad_missing_dates(df: DataFrame, pad_column: str, how: str, fill_padded_valu
     elif how == "weeks":
         week_numbers, padded_timeframes = zip(*_pad_missing_weeks(timeframe_list, **kwargs))
     else:
-        raise ValueError
+        raise ValueError("Must pass 'days' or 'weeks' to parameter 'how'.")
 
     # incorporate extended dates into dataframe
     df_merge = DataFrame({pad_column: padded_timeframes})
@@ -174,19 +189,19 @@ def groupby_agency(
     metadata: dict | None = None,
     metadata_value: str = "acronym",
     ):
-    """_summary_
+    """Group data by agencies and aggregate the values.
 
     Args:
-        df (DataFrame): _description_
-        group_col (str, optional): _description_. Defaults to "parent_slug".
-        value_col (str, optional): _description_. Defaults to "document_number".
-        aggfunc (str, optional): _description_. Defaults to "count".
-        significant (bool, optional): _description_. Defaults to True.
-        metadata (dict | None, optional): _description_. Defaults to None.
-        metadata_value (str, optional): _description_. Defaults to "acronym".
+        df (DataFrame): Input data.
+        group_col (str, optional): Column to group by. Defaults to "parent_slug".
+        value_col (str, optional): Column for values for grouping and aggregation. Defaults to "document_number".
+        aggfunc (str, optional): Aggregation function. Defaults to "count".
+        significant (bool, optional): Whether to include significance data in values. Defaults to True.
+        metadata (dict | None, optional): Agency metadata. Defaults to None.
+        metadata_value (str, optional): Metadata value to add to output data. Defaults to "acronym".
 
     Returns:
-        _type_: _description_
+        DataFrame: Grouped and aggregated data.
     """
     aggfunc_dict = {value_col: aggfunc, }
     if significant:
@@ -225,12 +240,27 @@ def groupby_date(
     aggfunc: str = "count",
     significant: bool = True
     ):
+    """Group data by a given date frequency and aggregate the values.
+
+    Args:
+        df (DataFrame): Input data.
+        group_col (str | tuple | list, optional): Columns to group by. Defaults to ("publication_year", "publication_month", ).
+        value_col (str, optional): Column for values for grouping and aggregation. Defaults to "document_number".
+        aggfunc (str, optional): Aggregation function. Defaults to "count".
+        significant (bool, optional): Whether to include significance data in values. Defaults to True.
+
+    Raises:
+        TypeError: Parameter 'group_col' must be type `str`, `list`, or `tuple`.
+
+    Returns:
+        DataFrame: Grouped and aggregated data.
+    """
     if isinstance(group_col, str):
         group_col = [group_col]
     elif isinstance(group_col, (list, tuple)):
         group_col = list(group_col)
     else:
-        raise TypeError
+        raise TypeError("Parameter 'group_col' must be type `str`, `list`, or `tuple`.")
 
     aggfunc_dict = {value_col: aggfunc, }
     if significant:
@@ -263,7 +293,7 @@ if __name__ == "__main__":
     df_a = pad_missing_dates(df, "dates", "days", fill_padded_values={"values": 0})
     print(df_a.head(10))
 
-    df = add_weeks_to_data(df, date_column="dates")
+    df = add_week_info_to_data(df, date_column="dates")
     print(df.head(10))
 
     grouped = groupby_date(df, group_col=("week_number", "week_of"), value_col="values", significant=False)
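To make the padding contract concrete, here is a standalone sketch of the "days" behavior the new `pad_missing_dates` docstring describes: build the full calendar, left-join observed rows onto it, and fill padded gaps. This is an illustration written for this note, not the module's implementation.

    from datetime import date, timedelta

    import pandas as pd

    def pad_missing_days(df: pd.DataFrame, pad_column: str, fill_padded_values: dict | None = None) -> pd.DataFrame:
        # build the full calendar from the first observed date through today
        start = min(df[pad_column])
        calendar = [start + timedelta(days=n) for n in range((date.today() - start).days + 1)]
        # left-join observed rows onto the calendar, then fill the padded gaps
        padded = pd.DataFrame({pad_column: calendar}).merge(df, on=pad_column, how="left")
        return padded.fillna(fill_padded_values) if fill_padded_values else padded

    df = pd.DataFrame({"dates": [date.today() - timedelta(days=4), date.today()], "values": [2, 1]})
    print(pad_missing_days(df, "dates", fill_padded_values={"values": 0}))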
modules/plotting.py CHANGED
@@ -22,7 +22,8 @@ class DataAvailabilityError(Exception):
 
 
 def plot_NA(placeholder_text: str = "Not enough data available to visualize.", placeholder_size: int = 14):
-    """Placeholder plot for when there is not enough data available to visualize."""
+    """Placeholder plot for when there is not enough data available to visualize.
+    """
     return (
         ggplot()
         + annotate("text", x=0, y=0, label=placeholder_text, size=placeholder_size)
@@ -32,7 +33,8 @@ def plot_NA(placeholder_text: str = "Not enough data available to visualize.", p
 
 
 def generate_rule_axis_label(rule_types: list | None = None):
-    """Generate axis label for rules, accounting for rule type ("all", "3f1", or "other")."""
+    """Generate axis label for rules, accounting for rule type ("all", "3f1-significant", or "other-significant").
+    """
     categories = ""
     if (rule_types is None) or ("all" in rule_types):
         pass
@@ -52,7 +54,9 @@ def plot_agency(df, group_col = "acronym", value_col = "rules", color="#033C5A",
         df (DataFrame): Input data.
         group_col (str, optional): Column on which the data are grouped. Defaults to "acronym".
         value_col (str, optional): Column of values to be plotted. Defaults to "rules".
-
+        color (str, optional): Color of values in plot. Defaults to "#033C5A" ([GW Blue](https://communications.gwu.edu/visual-identity/color-palette)).
+        rule_types (list | None, optional): One or more rule types to include in plot. Accepts "all", "3f1-significant", or "other-significant". Defaults to None.
+
     Returns:
         ggplot: Plotted data.
     """
@@ -71,7 +75,6 @@
         + labs(y=y_lab, x="", title="Rules Published by Agency")
         + theme_light()
     )
-
     return plot
 
 
@@ -89,7 +92,10 @@ def plot_month(
         df (DataFrame): Input data.
         group_cols (tuple, optional): Columns on which the data are grouped. Defaults to ("publication_year", "publication_month").
         value_col (str, optional): Column of values to be plotted. Defaults to "rules".
-
+        color (str, optional): Color of values in plot. Defaults to "#033C5A" ([GW Blue](https://communications.gwu.edu/visual-identity/color-palette)).
+        title (str | None, optional): Plot title. Defaults to None.
+        y_lab (str, optional): Plot y label. Defaults to "" (empty string).
+
     Returns:
         ggplot: Plotted data.
     """
@@ -123,8 +129,11 @@ def plot_day(
 
     Args:
         df (DataFrame): Input data.
-        group_col (str, optional): Column on which the data are grouped. Defaults to ("publication_year", "publication_month").
+        group_col (str, optional): Column on which the data are grouped. Defaults to "publication_date".
         value_col (str, optional): Column of values to be plotted. Defaults to "rules".
+        color (str, optional): Color of values in plot. Defaults to "#033C5A" ([GW Blue](https://communications.gwu.edu/visual-identity/color-palette)).
+        title (str | None, optional): Plot title. Defaults to None.
+        y_lab (str, optional): Plot y label. Defaults to "" (empty string).
 
     Returns:
         ggplot: Plotted data.
@@ -167,6 +176,19 @@
     y_lab: str = "",
     show_significant: bool = False,
     ):
+    """Plot rules by week.
+
+    Args:
+        df (DataFrame): Input data.
+        group_col (str, optional): Column on which the data are grouped. Defaults to "week_of".
+        value_col (str, optional): Column of values to be plotted. Defaults to "rules".
+        color (str, optional): Color of values in plot. Defaults to "#033C5A" ([GW Blue](https://communications.gwu.edu/visual-identity/color-palette)).
+        title (str | None, optional): Plot title. Defaults to None.
+        y_lab (str, optional): Plot y label. Defaults to "" (empty string).
+
+    Returns:
+        ggplot: Plotted data.
+    """
     max_value = df.loc[:, value_col].max()
 
     date_values = df[group_col].to_list()
@@ -199,26 +221,16 @@ def plot_week(
         + labs(y=y_lab, x="", title=title)
         + theme_light()
     )
-    if show_significant:
-        # trying to add significant rules as additional lines
-        # but getting "TypeError: Discrete value supplied to continuous scale"
-        # for 3f1 sig rules
-        df = df.astype({"3f1_significant": "float"})
-        plot = (
-            plot
-            #+ geom_line(aes(x=group_col, y="3f1_significant"), inherit_aes=False, group=1, color="#AA9868", linetype="dotted")
-            + geom_line(aes(x=group_col, y="other_significant"), inherit_aes=False, group=1, color="#0190DB", linetype="dashed")
-            #+ guide_legend()
-        )
     return plot
 
 
-def plot_tf(df: DataFrame, frequency: str, rule_types: str | None = None, **kwargs) -> ggplot:
+def plot_tf(df: DataFrame, frequency: str, rule_types: list | None = None, **kwargs) -> ggplot:
     """Plot rules over time by given frequency.
 
     Args:
         df (DataFrame): Input data.
         frequency (str): Frequency of time for aggregating rules. Accepts "monthly" or "daily".
+        rule_types (list | None, optional): One or more rule types to include in plot. Accepts "all", "3f1-significant", or "other-significant". Defaults to None.
 
     Raises:
         ValueError: Frequency parameter received invalid value.
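The `plot_tf` annotation fix (`rule_types: str | None` to `list | None`) matches how the app passes a list of selections. Its docstring implies a simple dispatch on `frequency`; the sketch below shows that routing with stand-in plotters rather than the module's plotnine code, so only the control flow should be taken as representative.

    def plot_month(df, **kwargs):
        return f"monthly plot of {len(df)} rows"  # stand-in for the real plotnine figure

    def plot_day(df, **kwargs):
        return f"daily plot of {len(df)} rows"  # stand-in

    def plot_tf(df, frequency: str, rule_types: list | None = None, **kwargs):
        # route on frequency; anything else is invalid, per the documented ValueError
        if frequency == "monthly":
            return plot_month(df, rule_types=rule_types, **kwargs)
        if frequency == "daily":
            return plot_day(df, rule_types=rule_types, **kwargs)
        raise ValueError(f"Frequency parameter received invalid value: {frequency!r}")

    print(plot_tf([1, 2, 3], "monthly"))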
modules/search_columns.py CHANGED
@@ -11,12 +11,14 @@ class SearchError(Exception):
 
 
 # Defining a function to search for string patterns within dataframe columns
-def search_columns(df: DataFrame,
-                   patterns: list,
-                   columns: list,
-                   return_as: str = "indicator_column",
-                   return_column: str = "indicator",
-                   re_flags = re.I | re.X):
+def search_columns(
+        df: DataFrame,
+        patterns: list,
+        columns: list,
+        return_as: str = "indicator_column",
+        return_column: str = "indicator",
+        re_flags = re.I | re.X
+    ):
     """Search columns for string patterns within dataframe columns.
 
     Args:
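The reflowed signature keeps the default flags `re.I | re.X` (case-insensitive, verbose mode). A quick self-contained illustration of why those defaults are handy when patterns carry comments and whitespace; the pattern and data here are invented for the example:

    import re

    import pandas as pd

    pattern = r"""
        congressional\s+review    # tolerate case and spacing variation
    """
    df = pd.DataFrame({"title": ["Congressional  Review Act rule", "Unrelated notice"]})
    # the same flags search_columns defaults to
    df["indicator"] = df["title"].str.contains(pattern, flags=re.I | re.X)
    print(df)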
modules/significant.py CHANGED
@@ -21,7 +21,17 @@ def read_csv_data(
         "Major"
     ),
     url: str = r"https://raw.githubusercontent.com/regulatorystudies/Reg-Stats/main/data/fr_tracking/fr_tracking.csv"
-    ):
+    ) -> tuple[pd_DataFrame | None, list, date]:
+    """Read CSV data from GitHub file.
+
+    Args:
+        start_date (date | str): Start date of read data.
+        retrieve_columns (list | tuple, optional): Get select columns. Defaults to ("publication_date", "document_number", "significant", "econ_significant", "3(f)(1) significant", "Major").
+        url (str, optional): URL where data are located. Defaults to r"https://raw.githubusercontent.com/regulatorystudies/Reg-Stats/main/data/fr_tracking/fr_tracking.csv".
+
+    Returns:
+        tuple: Data, column names, and max date in dataset.
+    """
     # handle dates formatted as str
     if isinstance(start_date, str):
         start_date = date.fromisoformat(start_date)
@@ -56,13 +66,22 @@ def read_csv_data(
     return None, cols, max_date
 
 
-def clean_data(df: pl.DataFrame,
-               document_numbers: list,
-               clean_columns: list | tuple,
-               #format_not_available_values: str = ".",
-               return_optimized_plan = False
-               ):
-
+def clean_data(
+        df: pl.DataFrame,
+        document_numbers: list,
+        *,
+        return_optimized_plan: bool = False
+    ):
+    """Clean data.
+
+    Args:
+        df (pl.DataFrame): Input polars dataframe.
+        document_numbers (list): List of document numbers to keep.
+        return_optimized_plan (bool, optional): Return optimized query plan rather than dataframe. Defaults to False.
+
+    Returns:
+        DataFrame | str: Cleaned data (or string representation of the query plan).
+    """
     # start a lazy query
     lf = (
         df.lazy()
@@ -70,10 +89,6 @@ def clean_data(df: pl.DataFrame,
         .with_columns(pl.col("document_number").str.strip_chars())
         # only keep document_numbers from input
        .filter(pl.col("document_number").is_in(document_numbers))
-        # temporarily format "not available" data (input as dots)
-        #.with_columns(pl.col(c for c in clean_columns if c != "document_number").str.replace_all(".", f"{format_not_available_values}", literal=True))
-        # cast to nullable int dtype
-        #.with_columns(pl.col(c for c in clean_columns if c != "document_number").cast(pl.Int64, strict=False))
     )
 
     # return optimized query plan instead of df
@@ -84,22 +99,40 @@ def clean_data(df: pl.DataFrame,
     return lf.collect()
 
 
-def merge_with_api_results(pd_df: pd_DataFrame,
-                           pl_df: pl.DataFrame
-                           ):
-
+def merge_with_api_results(
+        pd_df: pd_DataFrame,
+        pl_df: pl.DataFrame
+    ):
+    """Merge significance data with FR API data.
+
+    Args:
+        pd_df (pd_DataFrame): Main dataset of FR rules.
+        pl_df (pl.DataFrame): Significance data.
+
+    Returns:
+        DataFrame: Merged data.
+    """
     main_df = pl.from_pandas(pd_df)
     df = main_df.join(pl_df, on="document_number", how="left", validate="1:1", coalesce=True)
     return df.to_pandas()
 
 
-def get_significant_info(input_df, start_date, document_numbers):
-
-    pl_df, clean_cols, max_date = read_csv_data(start_date)
+def get_significant_info(input_df: pd_DataFrame, start_date: str, document_numbers: list):
+    """Retrieve significance information for input data.
+
+    Args:
+        input_df (pd_DataFrame): Input data.
+        start_date (str): Start date of data.
+        document_numbers (list): Documents to keep.
+
+    Returns:
+        tuple[DataFrame, datetime.date]: Data with significance information and max date in dataset.
+    """
+    pl_df, _, max_date = read_csv_data(start_date)
     if pl_df is None:
         print("Failed to integrate significance tracking data with retrieved documents.")
         return input_df
-    pl_df = clean_data(pl_df, document_numbers, clean_cols)
+    pl_df = clean_data(pl_df, document_numbers)
    pd_df = merge_with_api_results(input_df, pl_df)
    return pd_df, max_date
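For reference, the lazy-query pattern `clean_data` keeps (strip whitespace from document numbers, then filter to the requested set) runs like this in polars. This is a minimal reproduction with toy data, not the module itself:

    import polars as pl

    df = pl.DataFrame({"document_number": [" 2024-00001 ", "2024-00002"], "significant": [1, 0]})
    document_numbers = ["2024-00001"]

    lf = (
        df.lazy()
        .with_columns(pl.col("document_number").str.strip_chars())
        .filter(pl.col("document_number").is_in(document_numbers))
    )
    print(lf.explain())  # the optimized query plan, cf. return_optimized_plan=True
    print(lf.collect())  # the cleaned frame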
 
modules/utils.py CHANGED
@@ -2,6 +2,9 @@ from pandas import DataFrame
 
 
 def _get_nested_metadata(metadata_key: str, metadata: dict[dict], metadata_value: str):
+    """Get nested metadata from a `dict[dict, Any]` structure.
+    Returns `metadata_key` as the default value.
+    """
     getter = metadata.get(metadata_key, {})
     return getter.get(metadata_value, metadata_key)
 
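The helper's two-level lookup with the key itself as fallback, shown end to end using the code from the diff and toy metadata:

    metadata = {"epa": {"acronym": "EPA", "name": "Environmental Protection Agency"}}

    def _get_nested_metadata(metadata_key: str, metadata: dict, metadata_value: str):
        getter = metadata.get(metadata_key, {})
        return getter.get(metadata_value, metadata_key)

    print(_get_nested_metadata("epa", metadata, "acronym"))           # EPA
    print(_get_nested_metadata("unknown-slug", metadata, "acronym"))  # unknown-slug (fallback)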
 
tests/test_get_data.py CHANGED
@@ -6,7 +6,7 @@ from modules.get_rules_in_window import (
 )
 
 
-def test_get_date_range(start_str: str = "2024-05-01"):
+def test_get_date_range(start_str: str = "2024-05-01", end_mmdd: str = "01-03"):
 
     start_date = date.fromisoformat(start_str)
     end_year = start_date.year + 1
@@ -15,7 +15,7 @@ def test_get_date_range(start_str: str = "2024-05-01"):
     assert isinstance(dates_str, dict)
     assert (
         dates_str.get("start") == start_str
-        and dates_str.get("end") == f"{end_year}-01-31"
+        and dates_str.get("end") == f"{end_year}-{end_mmdd}"
         and dates_str.get("transition_year") == end_year
     )
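Since the expected end date has now changed once ("01-31" to "01-03"), parametrizing it would keep future updates to one line. A sketch assuming `get_date_range` is the imported function whose returned dict the original test inspects:

    from datetime import date

    import pytest

    from modules.get_rules_in_window import get_date_range  # assumed to be the tested function

    @pytest.mark.parametrize("start_str, end_mmdd", [("2024-05-01", "01-03")])
    def test_get_date_range_end(start_str, end_mmdd):
        end_year = date.fromisoformat(start_str).year + 1
        dates_str = get_date_range(start_str)
        assert dates_str.get("end") == f"{end_year}-{end_mmdd}"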