# cra-window-rules / modules / significant.py
# Author: Mark Febrizio
# Revisions: 2024-06 (#11), commit 58bb4c7
# gather details on rule significance from FR tracking document
# see: https://github.com/regulatorystudies/Reg-Stats/blob/main/data/fr_tracking/fr_tracking.csv
from datetime import date
import polars as pl
from pandas import (
DataFrame as pd_DataFrame,
read_csv as pd_read_csv,
to_datetime,
)
def read_csv_data(
        start_date: date | str,
        retrieve_columns: list | tuple = (
            "publication_date",
            "document_number",
            "significant",
            "econ_significant",
            "3(f)(1) significant",
            "Major"
        ),
        url: str = r"https://raw.githubusercontent.com/regulatorystudies/Reg-Stats/main/data/fr_tracking/fr_tracking.csv"
    ):
    """Read the FR tracking CSV and return it as a polars DataFrame.

    Args:
        start_date: Earliest publication date of interest (ISO-format
            string or `datetime.date`).
        retrieve_columns: Columns to pull from the CSV.
        url: Location of the tracking CSV.

    Returns:
        Tuple of (polars DataFrame or None on failure, list of retained —
        and possibly renamed — column names, most recent publication date
        present in the tracking data).
    """
    # handle dates formatted as str
    if isinstance(start_date, str):
        start_date = date.fromisoformat(start_date)
    # drop econ_significant column for dates on or after EO 14094,
    # which replaced "economically significant" with "3(f)(1) significant"
    if start_date >= date.fromisoformat("2023-04-06"):
        cols = [col for col in retrieve_columns if col != "econ_significant"]
    else:
        cols = list(retrieve_columns)
    # read csv; retry with latin-1 encoding if utf-8 decoding fails
    try:
        df_pd = pd_read_csv(url, usecols=cols)
    except UnicodeDecodeError:
        df_pd = pd_read_csv(url, usecols=cols, encoding="latin")
    df_pd.loc[:, "publication_dt"] = to_datetime(df_pd["publication_date"], format="mixed", dayfirst=False, yearfirst=False)
    # Series.max() skips missing values (NaT), unlike Python's max() over a list
    max_date = df_pd["publication_dt"].max().date()
    cols.remove("publication_date")
    df = pl.from_pandas(df_pd.loc[:, cols])
    if df.shape[1] != len(cols):
        # column selection failed; signal failure to the caller with None
        return None, cols, max_date
    # rename columns to code-friendly identifiers when all are present
    rename_cols = {"3(f)(1) significant": "3f1_significant", "Major": "major"}
    if all(rename in cols for rename in rename_cols):
        df = df.rename(rename_cols)
        cols = [rename_cols.get(col, col) for col in cols]
    return df, cols, max_date
def clean_data(df: pl.DataFrame,
               document_numbers: list,
               clean_columns: list | tuple,
               return_optimized_plan = False
    ):
    """Filter the tracking data down to the requested documents.

    Strips surrounding whitespace from the ``document_number`` column, then
    keeps only rows whose document number appears in `document_numbers`.

    Args:
        df: Tracking data as a polars DataFrame.
        document_numbers: FR document numbers to retain.
        clean_columns: Columns eligible for value cleaning; currently unused
            but retained for interface compatibility.
        return_optimized_plan: When True, return the optimized query plan as
            a string instead of executing the query.

    Returns:
        Filtered polars DataFrame, or the optimized plan string.
    """
    # build the query lazily so polars can optimize before execution
    query = (
        df.lazy()
        .with_columns(pl.col("document_number").str.strip_chars())
        .filter(pl.col("document_number").is_in(document_numbers))
    )
    if return_optimized_plan:
        # inspect the plan rather than running it
        return query.explain(optimized=True)
    return query.collect()
def merge_with_api_results(pd_df: pd_DataFrame,
                           pl_df: pl.DataFrame
    ):
    """Left-join significance data onto API results by document number.

    Args:
        pd_df: Documents retrieved from the API (pandas DataFrame).
        pl_df: Significance tracking data (polars DataFrame).

    Returns:
        Merged data converted back to a pandas DataFrame.
    """
    left = pl.from_pandas(pd_df)
    # 1:1 validation guards against duplicate document numbers on either side
    joined = left.join(
        pl_df,
        on="document_number",
        how="left",
        validate="1:1",
        coalesce=True,
    )
    return joined.to_pandas()
def get_significant_info(input_df, start_date, document_numbers):
    """Attach rule-significance columns from the FR tracking document.

    Args:
        input_df: pandas DataFrame of documents retrieved from the FR API.
        start_date: Earliest publication date (ISO string or date).
        document_numbers: FR document numbers to annotate.

    Returns:
        Tuple of (DataFrame, most recent tracked date). On failure the
        input DataFrame is returned unchanged with None as the date.
    """
    pl_df, clean_cols, max_date = read_csv_data(start_date)
    if pl_df is None:
        print("Failed to integrate significance tracking data with retrieved documents.")
        # bug fix: previously returned a bare DataFrame here while the
        # success path returned a 2-tuple, which crashed callers that
        # unpack two values; keep the return arity consistent
        return input_df, None
    pl_df = clean_data(pl_df, document_numbers, clean_cols)
    pd_df = merge_with_api_results(input_df, pl_df)
    return pd_df, max_date
if __name__ == "__main__":
    # smoke test: exercise the EO 14094 date cutoff on both sides
    date_a = "2023-04-05"
    date_b = "2023-04-06"
    numbers = [
        "2021-01303",
        "2023-28006",
        "2024-00149",
        "2024-00089",
        "2023-28828",
        "2024-00300",
        "2024-00045",
        "2024-00192",
        "2024-00228",
        "2024-00187",
    ]
    # test for dates before EO 14094
    # bug fix: read_csv_data returns a 3-tuple (df, cols, max_date);
    # unpacking only two values raised ValueError at runtime
    df_a, clean_cols, _ = read_csv_data(date_a)
    df_a = clean_data(df_a, numbers, clean_cols)
    # test for dates after EO 14094
    df_b, clean_cols, _ = read_csv_data(date_b)
    df_b = clean_data(df_b, numbers, clean_cols)
    #print(df_a.shape, df_b.shape)