Mark Febrizio committed
Commit fe4f734
Parent: a0ce115

Documentation (#24)


* Update significant.py

* docstrings for grouping

* Update plotting.py

docstrings and remove inoperable code

* docstrings

* Update dependabot.yml

* Update significant.py

bug fix: clean_cols was still being passed even after being removed

* Update test_get_data.py

update test end date

* Update README.md

Add summary of potential uses

* Update app.py

adjust sig tooltip language

.github/dependabot.yml CHANGED
@@ -8,9 +8,8 @@ updates:
   - package-ecosystem: "pip"
     directory: "/" # Location of package manifests
     schedule:
-      interval: "weekly"
+      interval: "monthly"
     allow:
-      - dependency-name: "faicons"
       - dependency-name: "fr-toolbelt"
       - dependency-name: "pandas"
       - dependency-name: "plotnine"
README.md CHANGED
@@ -20,3 +20,14 @@ Dashboard app tracking rules falling within the Congressional Review Act (CRA) w
 **[Link to HuggingFace Space that hosts the dashboard app.](https://huggingface.co/spaces/regulatorystudies/cra-window-rules)**
 
 Developed by the [GW Regulatory Studies Center](https://go.gwu.edu/regstudies).
+
+## Potential Uses
+
+The CRA lookback window is a hot topic going into a presidential election year. One natural question raised by the opening of the window is: *what rules might fall into the window and could be subject to the CRA next year?*
+
+This information may be valuable to different stakeholders in different ways:
+
+1. Industry observers and other regulated entities may want to know which rules might or might not be durable given a change in administration;
+2. Advocacy groups might want to know how to advocate for or against such policies;
+3. Federal policymakers can get a better sense of which policies may or may not carry over after a change in administration;
+4. Journalists may want to focus on which highly salient policies could be reversed depending on the results of the election. They also may want to know, more broadly, how many rules fall into the window and from which agencies.
app.py CHANGED
@@ -18,7 +18,7 @@ from modules import (
     AGENCIES,
     groupby_agency,
     groupby_date,
-    add_weeks_to_data,
+    add_week_info_to_data,
     pad_missing_dates,
     plot_agency,
     plot_tf,
@@ -85,7 +85,7 @@ with ui.sidebar(open={"desktop": "open", "mobile": "closed"}, fg="#033C5A"):
 
     with ui.tooltip(placement="right", id="sig_tooltip"):
         ui.input_select("menu_significant", "Select rule significance", choices=["all", "3f1-significant", "other-significant"], selected="all", multiple=True, size=3)
-        "Rule significance as defined in Executive Order 12866."
+        "Rule significance as defined in Executive Order 12866, as amended by Executive Order 14094."
 
     with ui.tooltip(placement="right", id="agency_tooltip"):
         ui.input_select("menu_agency", "Select agencies", choices=["all"] + AGENCIES, selected=["all"], multiple=True, size=6)
@@ -348,7 +348,7 @@ def grouped_df_day():
 @reactive.calc
 def grouped_df_week():
     filt_df = filter_significance()
-    filt_df = add_weeks_to_data(filt_df)
+    filt_df = add_week_info_to_data(filt_df)
     try:
         grouped = groupby_date(filt_df, group_col=("week_number", "week_of"), significant=GET_SIGNIFICANT)
         grouped = pad_missing_dates(
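Note on the rename above: `add_weeks_to_data` becomes `add_week_info_to_data` in both the import block and the weekly reactive calc. Below is a minimal, self-contained sketch of what that pipeline step does, assuming the helper derives ISO week numbers and week-start dates; that shape is inferred from the function name and the grouping columns, not taken from the module's actual code.

    from datetime import date

    import pandas as pd

    def add_week_info_to_data(df: pd.DataFrame, date_column: str = "publication_date") -> pd.DataFrame:
        # hypothetical stand-in: attach ISO week number and the Monday starting each week
        dates = pd.to_datetime(df[date_column])
        out = df.copy()
        out["week_number"] = dates.dt.isocalendar().week
        out["week_of"] = (dates - pd.to_timedelta(dates.dt.weekday, unit="D")).dt.date
        return out

    df = pd.DataFrame({
        "publication_date": [date(2024, 5, 1), date(2024, 5, 2), date(2024, 5, 9)],
        "document_number": ["2024-00001", "2024-00002", "2024-00003"],
    })
    # mirrors grouped_df_week: add week info, then group on ("week_number", "week_of")
    grouped = (
        add_week_info_to_data(df)
        .groupby(["week_number", "week_of"], as_index=False)
        .agg(rules=("document_number", "count"))
    )
    print(grouped)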
cra_window_rules.py CHANGED
@@ -13,6 +13,15 @@ from modules import (
 
 
 def save_csv(path: Path, df_all: DataFrame, df_agency: DataFrame, df_ym: DataFrame, transition_year: int):
+    """Save output as CSV files.
+
+    Args:
+        path (Path): Save data here.
+        df_all (DataFrame): Data at the rule level.
+        df_agency (DataFrame): Data grouped by agency.
+        df_ym (DataFrame): Data grouped by publication year and month.
+        transition_year (int): Presidential transition year.
+    """
     files = (
         f"rules_{transition_year - 1}_{transition_year}.csv",
         f"rules_by_agency_{transition_year - 1}_{transition_year}.csv",
@@ -23,7 +32,9 @@ def save_csv(path: Path, df_all: DataFrame, df_agency: DataFrame, df_ym: DataFra
         data.to_csv(path / file, index=False)
 
 
-def main(start_date, save_data: bool = True, path: Path | None = None, metadata: dict | None = None, significant: bool = True):
+def main(start_date: str, save_data: bool = True, path: Path | None = None, metadata: dict | None = None, significant: bool = True):
+    """Retrieve rules in CRA window and save resulting data.
+    """
     if date.fromisoformat(start_date) < date(2023, 4, 6):
         significant = False
     date_range = get_date_range(start_date)
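The new `save_csv` docstring documents a naming convention driven by `transition_year`. A quick illustration of the f-strings visible in the diff, with an example value:

    transition_year = 2025  # example value only
    for stem in ("rules", "rules_by_agency"):
        print(f"{stem}_{transition_year - 1}_{transition_year}.csv")
    # rules_2024_2025.csv
    # rules_by_agency_2024_2025.csv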
modules/grouping.py CHANGED
@@ -87,7 +87,7 @@ def _get_weeks(dates: list[date], end_date: date | None = None, **kwargs) -> lis
     return results
 
 
-def add_weeks_to_data(df: DataFrame, date_column: str = "publication_date", new_columns: tuple[str] = ("week_number", "week_of")):
+def add_week_info_to_data(df: DataFrame, date_column: str = "publication_date", new_columns: tuple[str] = ("week_number", "week_of")):
     """Add week number and week start date to input data.
 
     Args:
@@ -108,7 +108,8 @@ def add_weeks_to_data(df: DataFrame, date_column: str = "publication_date", new_
 
 
 def _pad_missing_weeks(timeframe_list: list[date], **kwargs):
-
+    """Pad dataframe with weeks missing from retrieved data (i.e., weeks without qualifying rule data).
+    """
     # get the start date for the first week
     first_week_start = _get_first_week_start(timeframe_list)
 
@@ -117,7 +118,8 @@ def _pad_missing_weeks(timeframe_list: list[date], **kwargs):
 
 
 def _pad_missing_days(timeframe_list: list[date], end_date: date | None = None):
-
+    """Pad dataframe with days missing from retrieved data (i.e., days without qualifying rule data).
+    """
     start_date = min(timeframe_list)
     if end_date is None:
         end_date = date.today()
@@ -132,7 +134,20 @@ def _pad_missing_days(timeframe_list: list[date], end_date: date | None = None):
 
 
 def pad_missing_dates(df: DataFrame, pad_column: str, how: str, fill_padded_values: dict | None = None, **kwargs):
-
+    """Add missing dates (either weeks or days) to the dataset.
+
+    Args:
+        df (DataFrame): Input data.
+        pad_column (str): Date column to pad.
+        how (str): Whether to pad by "days" or "weeks".
+        fill_padded_values (dict | None, optional): Dictionary of columns and values to fill for padded observations (e.g., {"column": 0}). Defaults to None.
+
+    Raises:
+        ValueError: Must pass 'days' or 'weeks' to parameter 'how'.
+
+    Returns:
+        DataFrame: Padded data.
+    """
     df_copy = df.copy()
     timeframe_list = [d.date() if isinstance(d, (Timestamp, datetime)) else d for d in df_copy[pad_column].to_list()]
     df_copy = df_copy.astype({pad_column: "object"})
@@ -148,7 +163,7 @@ def pad_missing_dates(df: DataFrame, pad_column: str, how: str, fill_padded_valu
     elif how == "weeks":
         week_numbers, padded_timeframes = zip(*_pad_missing_weeks(timeframe_list, **kwargs))
     else:
-        raise ValueError
+        raise ValueError("Must pass 'days' or 'weeks' to parameter 'how'.")
 
     # incorporate extended dates into dataframe
     df_merge = DataFrame({pad_column: padded_timeframes})
@@ -174,19 +189,19 @@ def groupby_agency(
     metadata: dict | None = None,
     metadata_value: str = "acronym",
     ):
-    """_summary_
+    """Group data by agencies and aggregate the values.
 
     Args:
-        df (DataFrame): _description_
-        group_col (str, optional): _description_. Defaults to "parent_slug".
-        value_col (str, optional): _description_. Defaults to "document_number".
-        aggfunc (str, optional): _description_. Defaults to "count".
-        significant (bool, optional): _description_. Defaults to True.
-        metadata (dict | None, optional): _description_. Defaults to None.
-        metadata_value (str, optional): _description_. Defaults to "acronym".
+        df (DataFrame): Input data.
+        group_col (str, optional): Column to group by. Defaults to "parent_slug".
+        value_col (str, optional): Column for values for grouping and aggregation. Defaults to "document_number".
+        aggfunc (str, optional): Aggregation function. Defaults to "count".
+        significant (bool, optional): Whether to include significance data in values. Defaults to True.
+        metadata (dict | None, optional): Agency metadata. Defaults to None.
+        metadata_value (str, optional): Metadata value to add to output data. Defaults to "acronym".
 
     Returns:
-        _type_: _description_
+        DataFrame: Grouped and aggregated data.
     """
     aggfunc_dict = {value_col: aggfunc, }
     if significant:
@@ -225,12 +240,27 @@ def groupby_date(
     aggfunc: str = "count",
     significant: bool = True
     ):
+    """Group data by a given date frequency and aggregate the values.
+
+    Args:
+        df (DataFrame): Input data.
+        group_col (str | tuple | list, optional): Columns to group by. Defaults to ("publication_year", "publication_month", ).
+        value_col (str, optional): Column for values for grouping and aggregation. Defaults to "document_number".
+        aggfunc (str, optional): Aggregation function. Defaults to "count".
+        significant (bool, optional): Whether to include significance data in values. Defaults to True.
+
+    Raises:
+        TypeError: Parameter 'group_col' must be type `str`, `list`, or `tuple`.
+
+    Returns:
+        DataFrame: Grouped and aggregated data.
+    """
     if isinstance(group_col, str):
         group_col = [group_col]
     elif isinstance(group_col, (list, tuple)):
         group_col = list(group_col)
     else:
-        raise TypeError
+        raise TypeError("Parameter 'group_col' must be type `str`, `list`, or `tuple`.")
 
     aggfunc_dict = {value_col: aggfunc, }
     if significant:
@@ -263,7 +293,7 @@ if __name__ == "__main__":
     df_a = pad_missing_dates(df, "dates", "days", fill_padded_values={"values": 0})
     print(df_a.head(10))
 
-    df = add_weeks_to_data(df, date_column="dates")
+    df = add_week_info_to_data(df, date_column="dates")
     print(df.head(10))
 
     grouped = groupby_date(df, group_col=("week_number", "week_of"), value_col="values", significant=False)
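To make the padding contract concrete, here is a standalone sketch of the "days" behavior the new `pad_missing_dates` docstring describes: build the full calendar, left-join observed rows onto it, and fill padded gaps. This is an illustration written for this note, not the module's implementation.

    from datetime import date, timedelta

    import pandas as pd

    def pad_missing_days(df: pd.DataFrame, pad_column: str, fill_padded_values: dict | None = None) -> pd.DataFrame:
        # build the full calendar from the first observed date through today
        start = min(df[pad_column])
        calendar = [start + timedelta(days=n) for n in range((date.today() - start).days + 1)]
        # left-join observed rows onto the calendar, then fill the padded gaps
        padded = pd.DataFrame({pad_column: calendar}).merge(df, on=pad_column, how="left")
        return padded.fillna(fill_padded_values) if fill_padded_values else padded

    df = pd.DataFrame({"dates": [date.today() - timedelta(days=4), date.today()], "values": [2, 1]})
    print(pad_missing_days(df, "dates", fill_padded_values={"values": 0}))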
modules/plotting.py CHANGED
@@ -22,7 +22,8 @@ class DataAvailabilityError(Exception):
 
 
 def plot_NA(placeholder_text: str = "Not enough data available to visualize.", placeholder_size: int = 14):
-    """Placeholder plot for when there is not enough data available to visualize."""
+    """Placeholder plot for when there is not enough data available to visualize.
+    """
     return (
         ggplot()
         + annotate("text", x=0, y=0, label=placeholder_text, size=placeholder_size)
@@ -32,7 +33,8 @@ def plot_NA(placeholder_text: str = "Not enough data available to visualize.", p
 
 
 def generate_rule_axis_label(rule_types: list | None = None):
-    """Generate axis label for rules, accounting for rule type ("all", "3f1", or "other")."""
+    """Generate axis label for rules, accounting for rule type ("all", "3f1-significant", or "other-significant").
+    """
     categories = ""
     if (rule_types is None) or ("all" in rule_types):
         pass
@@ -52,7 +54,9 @@ def plot_agency(df, group_col = "acronym", value_col = "rules", color="#033C5A",
         df (DataFrame): Input data.
         group_col (str, optional): Column on which the data are grouped. Defaults to "acronym".
         value_col (str, optional): Column of values to be plotted. Defaults to "rules".
-
+        color (str, optional): Color of values in plot. Defaults to "#033C5A" ([GW Blue](https://communications.gwu.edu/visual-identity/color-palette)).
+        rule_types (list | None, optional): One or more rule types to include in plot. Accepts "all", "3f1-significant", or "other-significant". Defaults to None.
+
     Returns:
         ggplot: Plotted data.
     """
@@ -71,7 +75,6 @@
         + labs(y=y_lab, x="", title="Rules Published by Agency")
         + theme_light()
     )
-
     return plot
 
 
@@ -89,7 +92,10 @@ def plot_month(
         df (DataFrame): Input data.
         group_cols (tuple, optional): Columns on which the data are grouped. Defaults to ("publication_year", "publication_month").
         value_col (str, optional): Column of values to be plotted. Defaults to "rules".
-
+        color (str, optional): Color of values in plot. Defaults to "#033C5A" ([GW Blue](https://communications.gwu.edu/visual-identity/color-palette)).
+        title (str | None, optional): Plot title. Defaults to None.
+        y_lab (str, optional): Plot y label. Defaults to "" (empty string).
+
     Returns:
         ggplot: Plotted data.
     """
@@ -123,8 +129,11 @@ def plot_day(
 
     Args:
         df (DataFrame): Input data.
-        group_col (str, optional): Column on which the data are grouped. Defaults to ("publication_year", "publication_month").
+        group_col (str, optional): Column on which the data are grouped. Defaults to "publication_date".
         value_col (str, optional): Column of values to be plotted. Defaults to "rules".
+        color (str, optional): Color of values in plot. Defaults to "#033C5A" ([GW Blue](https://communications.gwu.edu/visual-identity/color-palette)).
+        title (str | None, optional): Plot title. Defaults to None.
+        y_lab (str, optional): Plot y label. Defaults to "" (empty string).
 
     Returns:
         ggplot: Plotted data.
@@ -167,6 +176,19 @@
     y_lab: str = "",
     show_significant: bool = False,
     ):
+    """Plot rules by week.
+
+    Args:
+        df (DataFrame): Input data.
+        group_col (str, optional): Column on which the data are grouped. Defaults to "week_of".
+        value_col (str, optional): Column of values to be plotted. Defaults to "rules".
+        color (str, optional): Color of values in plot. Defaults to "#033C5A" ([GW Blue](https://communications.gwu.edu/visual-identity/color-palette)).
+        title (str | None, optional): Plot title. Defaults to None.
+        y_lab (str, optional): Plot y label. Defaults to "" (empty string).
+
+    Returns:
+        ggplot: Plotted data.
+    """
     max_value = df.loc[:, value_col].max()
 
     date_values = df[group_col].to_list()
@@ -199,26 +221,16 @@ def plot_week(
         + labs(y=y_lab, x="", title=title)
         + theme_light()
     )
-    if show_significant:
-        # trying to add significant rules as additional lines
-        # but getting "TypeError: Discrete value supplied to continuous scale"
-        # for 3f1 sig rules
-        df = df.astype({"3f1_significant": "float"})
-        plot = (
-            plot
-            #+ geom_line(aes(x=group_col, y="3f1_significant"), inherit_aes=False, group=1, color="#AA9868", linetype="dotted")
-            + geom_line(aes(x=group_col, y="other_significant"), inherit_aes=False, group=1, color="#0190DB", linetype="dashed")
-            #+ guide_legend()
-        )
     return plot
 
 
-def plot_tf(df: DataFrame, frequency: str, rule_types: str | None = None, **kwargs) -> ggplot:
+def plot_tf(df: DataFrame, frequency: str, rule_types: list | None = None, **kwargs) -> ggplot:
     """Plot rules over time by given frequency.
 
     Args:
         df (DataFrame): Input data.
         frequency (str): Frequency of time for aggregating rules. Accepts "monthly" or "daily".
+        rule_types (list | None, optional): One or more rule types to include in plot. Accepts "all", "3f1-significant", or "other-significant". Defaults to None.
 
     Raises:
         ValueError: Frequency parameter received invalid value.
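The `plot_tf` annotation fix (`rule_types: str | None` to `list | None`) matches how the app passes a list of selections. Its docstring implies a simple dispatch on `frequency`; the sketch below shows that routing with stand-in plotters rather than the module's plotnine code, so only the control flow should be taken as representative.

    def plot_month(df, **kwargs):
        return f"monthly plot of {len(df)} rows"  # stand-in for the real plotnine figure

    def plot_day(df, **kwargs):
        return f"daily plot of {len(df)} rows"  # stand-in

    def plot_tf(df, frequency: str, rule_types: list | None = None, **kwargs):
        # route on frequency; anything else is invalid, per the documented ValueError
        if frequency == "monthly":
            return plot_month(df, rule_types=rule_types, **kwargs)
        if frequency == "daily":
            return plot_day(df, rule_types=rule_types, **kwargs)
        raise ValueError(f"Frequency parameter received invalid value: {frequency!r}")

    print(plot_tf([1, 2, 3], "monthly"))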
modules/search_columns.py CHANGED
@@ -11,12 +11,14 @@ class SearchError(Exception):
 
 
 # Defining a function to search for string patterns within dataframe columns
-def search_columns(df: DataFrame,
-                   patterns: list,
-                   columns: list,
-                   return_as: str = "indicator_column",
-                   return_column: str = "indicator",
-                   re_flags = re.I | re.X):
+def search_columns(
+        df: DataFrame,
+        patterns: list,
+        columns: list,
+        return_as: str = "indicator_column",
+        return_column: str = "indicator",
+        re_flags = re.I | re.X
+    ):
     """Search columns for string patterns within dataframe columns.
 
     Args:
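The reflowed signature keeps the default flags `re.I | re.X` (case-insensitive, verbose mode). A quick self-contained illustration of why those defaults are handy when patterns carry comments and whitespace; the pattern and data here are invented for the example:

    import re

    import pandas as pd

    pattern = r"""
        congressional\s+review    # tolerate case and spacing variation
    """
    df = pd.DataFrame({"title": ["Congressional  Review Act rule", "Unrelated notice"]})
    # the same flags search_columns defaults to
    df["indicator"] = df["title"].str.contains(pattern, flags=re.I | re.X)
    print(df)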
modules/significant.py CHANGED
@@ -21,7 +21,17 @@ def read_csv_data(
         "Major"
     ),
     url: str = r"https://raw.githubusercontent.com/regulatorystudies/Reg-Stats/main/data/fr_tracking/fr_tracking.csv"
-    ):
+    ) -> tuple[pd_DataFrame | None, list, date]:
+    """Read CSV data from GitHub file.
+
+    Args:
+        start_date (date | str): Start date of read data.
+        retrieve_columns (list | tuple, optional): Get select columns. Defaults to ("publication_date", "document_number", "significant", "econ_significant", "3(f)(1) significant", "Major").
+        url (str, optional): URL where data are located. Defaults to r"https://raw.githubusercontent.com/regulatorystudies/Reg-Stats/main/data/fr_tracking/fr_tracking.csv".
+
+    Returns:
+        tuple: Data, column names, and max date in dataset.
+    """
     # handle dates formatted as str
     if isinstance(start_date, str):
         start_date = date.fromisoformat(start_date)
@@ -56,13 +66,22 @@ def read_csv_data(
     return None, cols, max_date
 
 
-def clean_data(df: pl.DataFrame,
-               document_numbers: list,
-               clean_columns: list | tuple,
-               #format_not_available_values: str = ".",
-               return_optimized_plan = False
-               ):
-
+def clean_data(
+        df: pl.DataFrame,
+        document_numbers: list,
+        *,
+        return_optimized_plan: bool = False
+    ):
+    """Clean data.
+
+    Args:
+        df (pl.DataFrame): Input polars dataframe.
+        document_numbers (list): List of document numbers to keep.
+        return_optimized_plan (bool, optional): Return optimized query plan rather than dataframe. Defaults to False.
+
+    Returns:
+        DataFrame | str: Cleaned data (or string representation of the query plan).
+    """
     # start a lazy query
     lf = (
         df.lazy()
@@ -70,10 +89,6 @@ def clean_data(df: pl.DataFrame,
         .with_columns(pl.col("document_number").str.strip_chars())
         # only keep document_numbers from input
        .filter(pl.col("document_number").is_in(document_numbers))
-        # temporarily format "not available" data (input as dots)
-        #.with_columns(pl.col(c for c in clean_columns if c != "document_number").str.replace_all(".", f"{format_not_available_values}", literal=True))
-        # cast to nullable int dtype
-        #.with_columns(pl.col(c for c in clean_columns if c != "document_number").cast(pl.Int64, strict=False))
     )
 
     # return optimized query plan instead of df
@@ -84,22 +99,40 @@ def clean_data(df: pl.DataFrame,
     return lf.collect()
 
 
-def merge_with_api_results(pd_df: pd_DataFrame,
-                           pl_df: pl.DataFrame
-                           ):
-
+def merge_with_api_results(
+        pd_df: pd_DataFrame,
+        pl_df: pl.DataFrame
+    ):
+    """Merge significance data with FR API data.
+
+    Args:
+        pd_df (pd_DataFrame): Main dataset of FR rules.
+        pl_df (pl.DataFrame): Significance data.
+
+    Returns:
+        DataFrame: Merged data.
+    """
     main_df = pl.from_pandas(pd_df)
     df = main_df.join(pl_df, on="document_number", how="left", validate="1:1", coalesce=True)
     return df.to_pandas()
 
 
-def get_significant_info(input_df, start_date, document_numbers):
-
-    pl_df, clean_cols, max_date = read_csv_data(start_date)
+def get_significant_info(input_df: pd_DataFrame, start_date: str, document_numbers: list):
+    """Retrieve significance information for input data.
+
+    Args:
+        input_df (pd_DataFrame): Input data.
+        start_date (str): Start date of data.
+        document_numbers (list): Documents to keep.
+
+    Returns:
+        tuple[DataFrame, datetime.date]: Data with significance information and max date in dataset.
+    """
+    pl_df, _, max_date = read_csv_data(start_date)
     if pl_df is None:
         print("Failed to integrate significance tracking data with retrieved documents.")
         return input_df
-    pl_df = clean_data(pl_df, document_numbers, clean_cols)
+    pl_df = clean_data(pl_df, document_numbers)
    pd_df = merge_with_api_results(input_df, pl_df)
    return pd_df, max_date
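For reference, the lazy-query pattern `clean_data` keeps (strip whitespace from document numbers, then filter to the requested set) runs like this in polars. This is a minimal reproduction with toy data, not the module itself:

    import polars as pl

    df = pl.DataFrame({"document_number": [" 2024-00001 ", "2024-00002"], "significant": [1, 0]})
    document_numbers = ["2024-00001"]

    lf = (
        df.lazy()
        .with_columns(pl.col("document_number").str.strip_chars())
        .filter(pl.col("document_number").is_in(document_numbers))
    )
    print(lf.explain())  # the optimized query plan, cf. return_optimized_plan=True
    print(lf.collect())  # the cleaned frame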
 
modules/utils.py CHANGED
@@ -2,6 +2,9 @@ from pandas import DataFrame
 
 
 def _get_nested_metadata(metadata_key: str, metadata: dict[dict], metadata_value: str):
+    """Get nested metadata from a `dict[dict, Any]` structure.
+    Returns `metadata_key` as the default value.
+    """
     getter = metadata.get(metadata_key, {})
     return getter.get(metadata_value, metadata_key)
 
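The helper's two-level lookup with the key itself as fallback, shown end to end using the code from the diff and toy metadata:

    metadata = {"epa": {"acronym": "EPA", "name": "Environmental Protection Agency"}}

    def _get_nested_metadata(metadata_key: str, metadata: dict, metadata_value: str):
        getter = metadata.get(metadata_key, {})
        return getter.get(metadata_value, metadata_key)

    print(_get_nested_metadata("epa", metadata, "acronym"))           # EPA
    print(_get_nested_metadata("unknown-slug", metadata, "acronym"))  # unknown-slug (fallback)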
 
tests/test_get_data.py CHANGED
@@ -6,7 +6,7 @@ from modules.get_rules_in_window import (
 )
 
 
-def test_get_date_range(start_str: str = "2024-05-01"):
+def test_get_date_range(start_str: str = "2024-05-01", end_mmdd: str = "01-03"):
 
     start_date = date.fromisoformat(start_str)
     end_year = start_date.year + 1
@@ -15,7 +15,7 @@ def test_get_date_range(start_str: str = "2024-05-01"):
     assert isinstance(dates_str, dict)
     assert (
         dates_str.get("start") == start_str
-        and dates_str.get("end") == f"{end_year}-01-31"
+        and dates_str.get("end") == f"{end_year}-{end_mmdd}"
         and dates_str.get("transition_year") == end_year
     )
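Since the expected end date has now changed once ("01-31" to "01-03"), parametrizing it would keep future updates to one line. A sketch assuming `get_date_range` is the imported function whose returned dict the original test inspects:

    from datetime import date

    import pytest

    from modules.get_rules_in_window import get_date_range  # assumed to be the tested function

    @pytest.mark.parametrize("start_str, end_mmdd", [("2024-05-01", "01-03")])
    def test_get_date_range_end(start_str, end_mmdd):
        end_year = date.fromisoformat(start_str).year + 1
        dates_str = get_date_range(start_str)
        assert dates_str.get("end") == f"{end_year}-{end_mmdd}"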