# gather details on rule significance from FR tracking document # see: https://github.com/regulatorystudies/Reg-Stats/blob/main/data/fr_tracking/fr_tracking.csv from datetime import date import polars as pl from pandas import ( DataFrame as pd_DataFrame, read_csv as pd_read_csv, to_datetime, ) def read_csv_data( start_date: date | str, retrieve_columns: list | tuple = ( "publication_date", "document_number", "significant", "econ_significant", "3(f)(1) significant", "Major" ), url: str = r"https://raw.githubusercontent.com/regulatorystudies/Reg-Stats/main/data/fr_tracking/fr_tracking.csv" ) -> tuple[pd_DataFrame | None, list, date]: """Read CSV data from GitHub file. Args: start_date (date | str): Start date of read data. retrieve_columns (list | tuple, optional): Get select columns. Defaults to ( "publication_date", "document_number", "significant", "econ_significant", "3(f)(1) significant", "Major" ). url (str, optional): URL where data are located. Defaults to r"https://raw.githubusercontent.com/regulatorystudies/Reg-Stats/main/data/fr_tracking/fr_tracking.csv". Returns: tuple: Data, column names, max date in dataset """ # handle dates formatted as str if isinstance(start_date, str): start_date = date.fromisoformat(start_date) # drop econ_significant column for dates on or after EO 14094 if start_date >= date.fromisoformat("2023-04-06"): cols = [col for col in retrieve_columns if col != "econ_significant"] else: cols = list(retrieve_columns) # read csv; try different encoding if raises error try: df_pd = pd_read_csv(url, usecols=cols) except UnicodeDecodeError: df_pd = pd_read_csv(url, usecols=cols, encoding="latin") # drop duplicates if any (to avoid dashboard crush; original data need to be revised) if len(df_pd[df_pd.duplicated(subset=['document_number'],keep=False)])>0: df_pd=df_pd.sort_values(['document_number','publication_date','significant','3(f)(1) significant','Major']).\ drop_duplicates(subset=['document_number'],keep='last',ignore_index=True) else: pass df_pd.loc[:, "publication_dt"] = to_datetime(df_pd["publication_date"], format="mixed", dayfirst=False, yearfirst=False) max_date = max(df_pd.loc[:, "publication_dt"].to_list()).date() #print(max_date) cols.remove("publication_date") df = pl.from_pandas(df_pd.loc[:, cols]) if df.shape[1] == len(cols): # rename columns if they exist rename_cols = {"3(f)(1) significant": "3f1_significant", "Major": "major"} if all(True if rename in cols else False for rename in rename_cols.keys()): df = df.rename(rename_cols) cols = [rename_cols.get(col, col) for col in cols] return df, cols, max_date else: return None, cols, max_date def clean_data( df: pl.DataFrame, document_numbers: list, *, return_optimized_plan: bool = False ): """Clean data. Args: df (pl.DataFrame): Input polars dataframe. document_numbers (list): List of document numbers to keep. return_optimized_plan (bool, optional): Return optimized query plan rather than dataframe. Defaults to False. Returns: DataFrame | str: Cleaned data (or string representation of the query plan) """ # start a lazy query lf = ( df.lazy() # strip whitespace .with_columns(pl.col("document_number").str.strip_chars()) # only keep document_numbers from input .filter(pl.col("document_number").is_in(document_numbers)) ) # return optimized query plan instead of df if return_optimized_plan: return lf.explain(optimized=True) # call collect to return df return lf.collect() def merge_with_api_results( pd_df: pd_DataFrame, pl_df: pl.DataFrame ): """Merge significance data with FR API data. Args: pd_df (pd_DataFrame): Main dataset of FR rules. pl_df (pl.DataFrame): Significance data. Returns: DataFrame: Merged data. """ main_df = pl.from_pandas(pd_df) df = main_df.join(pl_df, on="document_number", how="left", validate="1:1", coalesce=True) return df.to_pandas() def get_significant_info(input_df: pd_DataFrame, start_date: str, document_numbers: list): """Retrieve significance information for input data. Args: input_df (pd.DataFrame): Input data. start_date (str): Start date of data. document_numbers (list): Documents to keep. Returns: tuple[DataFrame, datetime.date]: Data with significance information, max date in dataset """ pl_df, _, max_date = read_csv_data(start_date) if pl_df is None: print("Failed to integrate significance tracking data with retrieved documents.") return input_df pl_df = clean_data(pl_df, document_numbers) pd_df = merge_with_api_results(input_df, pl_df) return pd_df, max_date if __name__ == "__main__": date_a = "2023-04-05" date_b = "2023-04-06" numbers = [ "2021-01303", '2023-28006', '2024-00149', '2024-00089', '2023-28828', '2024-00300', '2024-00045', '2024-00192', '2024-00228', '2024-00187' ] # test for dates before EO 14094 df_a, clean_cols = read_csv_data(date_a) df_a = clean_data(df_a, numbers, clean_cols) # test for dates after EO 14094 df_b, clean_cols = read_csv_data(date_b) df_b = clean_data(df_b, numbers, clean_cols) #df_b.rename({"test": "test1"}) #print(df_a.shape, df_b.shape)