from datetime import date

import polars as pl

from pandas import (
    DataFrame as pd_DataFrame,
    read_csv as pd_read_csv,
    to_datetime,
)


def read_csv_data(
    start_date: date | str,
    retrieve_columns: list | tuple = (
        "publication_date",
        "document_number",
        "significant",
        "econ_significant",
        "3(f)(1) significant",
        "Major",
    ),
    url: str = r"https://raw.githubusercontent.com/regulatorystudies/Reg-Stats/main/data/fr_tracking/fr_tracking.csv",
) -> tuple[pd_DataFrame | None, list, date]:
    """Read CSV data from a GitHub-hosted file.

    Args:
        start_date (date | str): Start date of the data to read.
        retrieve_columns (list | tuple, optional): Columns to retrieve. Defaults to
            ("publication_date", "document_number", "significant", "econ_significant",
            "3(f)(1) significant", "Major").
        url (str, optional): URL where the data are located. Defaults to the raw
            GitHub URL of the Reg-Stats fr_tracking.csv file.

    Returns:
        tuple: Data (or None if column validation fails), column names, and the
        max publication date in the dataset.
    """
    if isinstance(start_date, str):
        start_date = date.fromisoformat(start_date)

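    # The 2023-04-06 cutoff reflects Executive Order 14094 (signed April 6,
    # 2023), which replaced the "economically significant" designation with
    # "Section 3(f)(1) significant," so econ_significant is only retrieved
    # for earlier data.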
    if start_date >= date.fromisoformat("2023-04-06"):
        cols = [col for col in retrieve_columns if col != "econ_significant"]
    else:
        cols = list(retrieve_columns)

    try:
        df_pd = pd_read_csv(url, usecols=cols)
    except UnicodeDecodeError:
        df_pd = pd_read_csv(url, usecols=cols, encoding="latin-1")

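    # Drop duplicate document numbers, keeping whichever row sorts last on the
    # publication date and significance columns.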
    if df_pd.duplicated(subset=["document_number"]).any():
        df_pd = df_pd.sort_values(
            ["document_number", "publication_date", "significant", "3(f)(1) significant", "Major"]
        ).drop_duplicates(subset=["document_number"], keep="last", ignore_index=True)

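    # publication_date arrives in mixed formats, so let pandas infer the format
    # of each entry individually.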
    df_pd.loc[:, "publication_dt"] = to_datetime(
        df_pd["publication_date"], format="mixed", dayfirst=False, yearfirst=False
    )
    max_date = df_pd["publication_dt"].max().date()

    cols.remove("publication_date")
    df = pl.from_pandas(df_pd.loc[:, cols])

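    # Sanity-check that every requested column survived the conversion, then
    # rename the awkward headers to Python-friendly identifiers.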
    if df.shape[1] == len(cols):
        rename_cols = {"3(f)(1) significant": "3f1_significant", "Major": "major"}
        if all(rename in cols for rename in rename_cols):
            df = df.rename(rename_cols)
            cols = [rename_cols.get(col, col) for col in cols]
        return df, cols, max_date
    else:
        return None, cols, max_date


def clean_data(
    df: pl.DataFrame,
    document_numbers: list,
    *,
    return_optimized_plan: bool = False,
):
    """Clean data, keeping only the requested documents.

    Args:
        df (pl.DataFrame): Input polars dataframe.
        document_numbers (list): List of document numbers to keep.
        return_optimized_plan (bool, optional): Return the optimized query plan
            rather than the dataframe. Defaults to False.

    Returns:
        pl.DataFrame | str: Cleaned data (or a string representation of the
        query plan).
    """
    lf = (
        df.lazy()
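        # strip stray whitespace from document numbers before matching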
        .with_columns(pl.col("document_number").str.strip_chars())
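        # keep only the documents requested by the caller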
        .filter(pl.col("document_number").is_in(document_numbers))
    )

    if return_optimized_plan:
        return lf.explain(optimized=True)

    return lf.collect()


def merge_with_api_results(
    pd_df: pd_DataFrame,
    pl_df: pl.DataFrame,
):
    """Merge significance data with FR API data.

    Args:
        pd_df (pd_DataFrame): Main dataset of FR rules.
        pl_df (pl.DataFrame): Significance data.

    Returns:
        pd_DataFrame: Merged data.
    """
    main_df = pl.from_pandas(pd_df)
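    # A left join keeps every row of the API data; validate="1:1" asserts that
    # document_number is unique on both sides.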
    df = main_df.join(pl_df, on="document_number", how="left", validate="1:1", coalesce=True)
    return df.to_pandas()


def get_significant_info(input_df: pd_DataFrame, start_date: str, document_numbers: list):
    """Retrieve significance information for input data.

    Args:
        input_df (pd_DataFrame): Input data.
        start_date (str): Start date of the data.
        document_numbers (list): Documents to keep.

    Returns:
        tuple[pd_DataFrame, date]: Data with significance information, and the
        max date in the dataset.
    """
    pl_df, _, max_date = read_csv_data(start_date)
    if pl_df is None:
        print("Failed to integrate significance tracking data with retrieved documents.")
        return input_df, max_date
    pl_df = clean_data(pl_df, document_numbers)
    pd_df = merge_with_api_results(input_df, pl_df)
    return pd_df, max_date


if __name__ == "__main__":
    date_a = "2023-04-05"
    date_b = "2023-04-06"
    numbers = [
        "2021-01303",
        "2023-28006",
        "2024-00149",
        "2024-00089",
        "2023-28828",
        "2024-00300",
        "2024-00045",
        "2024-00192",
        "2024-00228",
        "2024-00187",
    ]
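
    # date_a and date_b straddle the 2023-04-06 cutoff, so the two calls below
    # exercise both column selections in read_csv_data.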
    df_a, cols_a, max_date_a = read_csv_data(date_a)
    df_a = clean_data(df_a, numbers)

    df_b, cols_b, max_date_b = read_csv_data(date_b)
    df_b = clean_data(df_b, numbers)
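
    # Quick sanity check (illustrative): the pre-cutoff frame should include
    # econ_significant, while the post-cutoff frame should not.
    print(df_a.columns)
    print(df_b.columns)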