# gather details on rule significance from FR tracking document
# see: https://github.com/regulatorystudies/Reg-Stats/blob/main/data/fr_tracking/fr_tracking.csv
from datetime import date
import polars as pl
from pandas import (
DataFrame as pd_DataFrame,
read_csv as pd_read_csv,
to_datetime,
)


def read_csv_data(
start_date: date | str,
retrieve_columns: list | tuple = (
"publication_date",
"document_number",
"significant",
"econ_significant",
"3(f)(1) significant",
"Major"
),
url: str = r"https://raw.githubusercontent.com/regulatorystudies/Reg-Stats/main/data/fr_tracking/fr_tracking.csv"
) -> tuple[pd_DataFrame | None, list, date]:
"""Read CSV data from GitHub file.
Args:
start_date (date | str): Start date of read data.
retrieve_columns (list | tuple, optional): Get select columns. Defaults to ( "publication_date", "document_number", "significant", "econ_significant", "3(f)(1) significant", "Major" ).
url (str, optional): URL where data are located. Defaults to r"https://raw.githubusercontent.com/regulatorystudies/Reg-Stats/main/data/fr_tracking/fr_tracking.csv".
Returns:
tuple: Data, column names, max date in dataset
"""
# handle dates formatted as str
if isinstance(start_date, str):
start_date = date.fromisoformat(start_date)
# drop econ_significant column for dates on or after EO 14094
if start_date >= date.fromisoformat("2023-04-06"):
cols = [col for col in retrieve_columns if col != "econ_significant"]
else:
cols = list(retrieve_columns)
    # read csv; fall back to latin-1 encoding if UTF-8 decoding fails
    try:
        df_pd = pd_read_csv(url, usecols=cols)
    except UnicodeDecodeError:
        df_pd = pd_read_csv(url, usecols=cols, encoding="latin-1")
    # drop duplicate document numbers if any (to avoid a dashboard crash;
    # the original data need to be revised)
    if df_pd.duplicated(subset=["document_number"]).any():
        df_pd = df_pd.sort_values(
            ["document_number", "publication_date", "significant", "3(f)(1) significant", "Major"]
        ).drop_duplicates(subset=["document_number"], keep="last", ignore_index=True)
df_pd.loc[:, "publication_dt"] = to_datetime(df_pd["publication_date"], format="mixed", dayfirst=False, yearfirst=False)
max_date = max(df_pd.loc[:, "publication_dt"].to_list()).date()
#print(max_date)
cols.remove("publication_date")
df = pl.from_pandas(df_pd.loc[:, cols])
    if df.shape[1] == len(cols):
        # rename columns if they exist
        rename_cols = {"3(f)(1) significant": "3f1_significant", "Major": "major"}
        if all(rename in cols for rename in rename_cols):
            df = df.rename(rename_cols)
            cols = [rename_cols.get(col, col) for col in cols]
        return df, cols, max_date
    else:
        return None, cols, max_date
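

# A minimal usage sketch: for a start date on or after EO 14094 (2023-04-06),
# the returned column list should omit "econ_significant" and reflect the
# renamed keys (assumes network access to the GitHub raw URL):
#
#   df, cols, max_date = read_csv_data("2023-04-06")
#   # cols -> ["document_number", "significant", "3f1_significant", "major"]
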
def clean_data(
df: pl.DataFrame,
document_numbers: list,
*,
return_optimized_plan: bool = False
):
"""Clean data.
Args:
df (pl.DataFrame): Input polars dataframe.
document_numbers (list): List of document numbers to keep.
return_optimized_plan (bool, optional): Return optimized query plan rather than dataframe. Defaults to False.
Returns:
DataFrame | str: Cleaned data (or string representation of the query plan)
"""
# start a lazy query
lf = (
df.lazy()
# strip whitespace
.with_columns(pl.col("document_number").str.strip_chars())
# only keep document_numbers from input
.filter(pl.col("document_number").is_in(document_numbers))
)
# return optimized query plan instead of df
if return_optimized_plan:
return lf.explain(optimized=True)
# call collect to return df
return lf.collect()
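

# A quick sketch of inspecting the optimized polars query plan rather than
# collecting results (document number below is illustrative):
#
#   plan = clean_data(df, ["2021-01303"], return_optimized_plan=True)
#   print(plan)  # string representation from LazyFrame.explain(optimized=True)
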
def merge_with_api_results(
pd_df: pd_DataFrame,
pl_df: pl.DataFrame
):
"""Merge significance data with FR API data.
Args:
pd_df (pd_DataFrame): Main dataset of FR rules.
pl_df (pl.DataFrame): Significance data.
Returns:
DataFrame: Merged data.
"""
main_df = pl.from_pandas(pd_df)
df = main_df.join(pl_df, on="document_number", how="left", validate="1:1", coalesce=True)
return df.to_pandas()
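

# A toy illustration of the left-join semantics (assumes a polars version
# that supports the `coalesce` keyword; data below are made up):
#
#   api_df = pd_DataFrame({"document_number": ["2024-00001"], "title": ["A Rule"]})
#   sig_df = pl.DataFrame({"document_number": ["2024-00001"], "major": [1]})
#   merged = merge_with_api_results(api_df, sig_df)
#   # every API row is kept; significance columns attach where a match exists
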
def get_significant_info(input_df: pd_DataFrame, start_date: str, document_numbers: list):
"""Retrieve significance information for input data.
Args:
input_df (pd.DataFrame): Input data.
start_date (str): Start date of data.
document_numbers (list): Documents to keep.
Returns:
tuple[DataFrame, datetime.date]: Data with significance information, max date in dataset
"""
pl_df, _, max_date = read_csv_data(start_date)
    if pl_df is None:
        print("Failed to integrate significance tracking data with retrieved documents.")
        return input_df, max_date
pl_df = clean_data(pl_df, document_numbers)
pd_df = merge_with_api_results(input_df, pl_df)
return pd_df, max_date
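

# A hedged end-to-end sketch (assumes `api_df` is a hypothetical pandas
# DataFrame of FR API results with a "document_number" column):
#
#   merged, max_date = get_significant_info(
#       api_df, "2023-04-06", api_df["document_number"].tolist()
#   )
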
if __name__ == "__main__":
date_a = "2023-04-05"
date_b = "2023-04-06"
numbers = [
"2021-01303",
'2023-28006',
'2024-00149',
'2024-00089',
'2023-28828',
'2024-00300',
'2024-00045',
'2024-00192',
'2024-00228',
'2024-00187'
]
# test for dates before EO 14094
df_a, clean_cols = read_csv_data(date_a)
df_a = clean_data(df_a, numbers, clean_cols)
# test for dates after EO 14094
df_b, clean_cols = read_csv_data(date_b)
df_b = clean_data(df_b, numbers, clean_cols)
#df_b.rename({"test": "test1"})
#print(df_a.shape, df_b.shape)