# cra-window-rules/modules/significant.py
# gather details on rule significance from FR tracking document
# see: https://github.com/regulatorystudies/Reg-Stats/blob/main/data/fr_tracking/fr_tracking.csv
from datetime import date

import polars as pl
from pandas import (
    DataFrame as pd_DataFrame,
    read_csv as pd_read_csv,
    to_datetime,
)

def read_csv_data(
    start_date: date | str,
    retrieve_columns: list | tuple = (
        "publication_date",
        "document_number",
        "significant",
        "econ_significant",
        "3(f)(1) significant",
        "Major",
    ),
    url: str = r"https://raw.githubusercontent.com/regulatorystudies/Reg-Stats/main/data/fr_tracking/fr_tracking.csv",
) -> tuple[pd_DataFrame | None, list, date]:
    """Read CSV data from the FR tracking file on GitHub.

    Args:
        start_date (date | str): Start date of the data to read.
        retrieve_columns (list | tuple, optional): Columns to retrieve. Defaults to ("publication_date", "document_number", "significant", "econ_significant", "3(f)(1) significant", "Major").
        url (str, optional): URL where the data are located. Defaults to the raw GitHub URL of fr_tracking.csv.

    Returns:
        tuple[pd_DataFrame | None, list, date]: Data (None if the expected columns are missing), retained column names, and the max publication date in the dataset.
    """
    # handle dates formatted as str
    if isinstance(start_date, str):
        start_date = date.fromisoformat(start_date)
    # drop the econ_significant column for dates on or after EO 14094 took effect
    if start_date >= date.fromisoformat("2023-04-06"):
        cols = [col for col in retrieve_columns if col != "econ_significant"]
    else:
        cols = list(retrieve_columns)
    # read csv; retry with a different encoding if decoding fails
    try:
        df_pd = pd_read_csv(url, usecols=cols)
    except UnicodeDecodeError:
        df_pd = pd_read_csv(url, usecols=cols, encoding="latin-1")
    # drop duplicate document numbers if any (to avoid a dashboard crash; the original data need to be revised)
    if df_pd.duplicated(subset=["document_number"], keep=False).any():
        df_pd = (
            df_pd
            .sort_values(["document_number", "publication_date", "significant", "3(f)(1) significant", "Major"])
            .drop_duplicates(subset=["document_number"], keep="last", ignore_index=True)
        )
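    # note: keep="last" after the ascending sort retains, for each duplicated
    # document_number, the row with the latest publication_date (and, among
    # ties, the highest significance flags)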
    # parse publication dates and identify the most recent one in the dataset
    df_pd.loc[:, "publication_dt"] = to_datetime(df_pd["publication_date"], format="mixed", dayfirst=False, yearfirst=False)
    max_date = df_pd["publication_dt"].max().date()
    # publication_date is no longer needed once max_date is computed
    cols.remove("publication_date")
    df = pl.from_pandas(df_pd.loc[:, cols])
    if df.shape[1] == len(cols):
        # rename columns to code-friendly names if they exist
        rename_cols = {"3(f)(1) significant": "3f1_significant", "Major": "major"}
        if all(rename in cols for rename in rename_cols):
            df = df.rename(rename_cols)
            cols = [rename_cols.get(col, col) for col in cols]
        return df, cols, max_date
    else:
        return None, cols, max_date
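

# Illustrative usage (a sketch; assumes the remote CSV loads cleanly and the
# expected columns are present, otherwise the first returned element is None):
# >>> df, cols, max_date = read_csv_data("2023-04-06")
# >>> cols  # econ_significant dropped, remaining flags renamed
# ['document_number', 'significant', '3f1_significant', 'major']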


def clean_data(
    df: pl.DataFrame,
    document_numbers: list,
    *,
    return_optimized_plan: bool = False
):
    """Clean data.

    Args:
        df (pl.DataFrame): Input polars dataframe.
        document_numbers (list): List of document numbers to keep.
        return_optimized_plan (bool, optional): Return the optimized query plan rather than the dataframe. Defaults to False.

    Returns:
        DataFrame | str: Cleaned data (or a string representation of the query plan).
    """
    # start a lazy query
    lf = (
        df.lazy()
        # strip whitespace from document numbers
        .with_columns(pl.col("document_number").str.strip_chars())
        # only keep document_numbers from the input list
        .filter(pl.col("document_number").is_in(document_numbers))
    )
    # return the optimized query plan instead of the df
    if return_optimized_plan:
        return lf.explain(optimized=True)
    # call collect to materialize the df
    return lf.collect()
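

# Illustrative usage (a sketch): with return_optimized_plan=True, clean_data
# returns polars' optimized query plan as a string instead of collecting,
# which is handy for checking that the filter is applied as expected.
# >>> plan = clean_data(df, ["2024-00149"], return_optimized_plan=True)
# >>> print(plan)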


def merge_with_api_results(
    pd_df: pd_DataFrame,
    pl_df: pl.DataFrame
):
    """Merge significance data with FR API data.

    Args:
        pd_df (pd_DataFrame): Main dataset of FR rules.
        pl_df (pl.DataFrame): Significance data.

    Returns:
        DataFrame: Merged data.
    """
    main_df = pl.from_pandas(pd_df)
    # left join keeps every API document; validate="1:1" raises if document_number
    # is duplicated on either side, and coalesce=True folds the join keys into one column
    df = main_df.join(pl_df, on="document_number", how="left", validate="1:1", coalesce=True)
    return df.to_pandas()
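

# Minimal sketch of the merge on synthetic stand-in frames (hypothetical data,
# not from the FR tracking file):
# >>> left = pd_DataFrame({"document_number": ["2024-00149"], "title": ["A Rule"]})
# >>> right = pl.DataFrame({"document_number": ["2024-00149"], "significant": [1]})
# >>> merge_with_api_results(left, right).columns.tolist()
# ['document_number', 'title', 'significant']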


def get_significant_info(input_df: pd_DataFrame, start_date: str, document_numbers: list):
    """Retrieve significance information for input data.

    Args:
        input_df (pd_DataFrame): Input data.
        start_date (str): Start date of the data.
        document_numbers (list): Documents to keep.

    Returns:
        tuple[DataFrame, datetime.date]: Data with significance information, max date in the dataset.
    """
    pl_df, _, max_date = read_csv_data(start_date)
    if pl_df is None:
        # return the input unchanged so callers still receive a (data, date) tuple
        print("Failed to integrate significance tracking data with retrieved documents.")
        return input_df, max_date
    pl_df = clean_data(pl_df, document_numbers)
    pd_df = merge_with_api_results(input_df, pl_df)
    return pd_df, max_date


if __name__ == "__main__":
    date_a = "2023-04-05"
    date_b = "2023-04-06"
    numbers = [
        "2021-01303",
        "2023-28006",
        "2024-00149",
        "2024-00089",
        "2023-28828",
        "2024-00300",
        "2024-00045",
        "2024-00192",
        "2024-00228",
        "2024-00187",
    ]
    # test for dates before EO 14094 (econ_significant column retained)
    df_a, clean_cols, _ = read_csv_data(date_a)
    df_a = clean_data(df_a, numbers)
    # test for dates on or after EO 14094 (econ_significant column dropped)
    df_b, clean_cols, _ = read_csv_data(date_b)
    df_b = clean_data(df_b, numbers)
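    # hedged end-to-end sketch: get_significant_info expects the main FR API
    # dataset as a pandas DataFrame; the frame below is synthetic stand-in data
    api_df = pd_DataFrame({"document_number": numbers, "title": ["rule"] * len(numbers)})
    merged, latest = get_significant_info(api_df, date_b, numbers)
    print(merged.shape, latest)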