# gather details on rule significance from FR tracking document
# see: https://github.com/regulatorystudies/Reg-Stats/blob/main/data/fr_tracking/fr_tracking.csv

from datetime import date
import polars as pl
from pandas import (
    DataFrame as pd_DataFrame, 
    read_csv as pd_read_csv, 
    to_datetime, 
    )


def read_csv_data(
    start_date: date | str, 
    retrieve_columns: list | tuple = (
        "publication_date", 
        "document_number",
        "significant", 
        "econ_significant", 
        "3(f)(1) significant", 
        "Major"
        ), 
    url: str = r"https://raw.githubusercontent.com/regulatorystudies/Reg-Stats/main/data/fr_tracking/fr_tracking.csv"
    ) -> tuple[pd_DataFrame | None, list, date]:
    """Read CSV data from GitHub file.

    Args:
        start_date (date | str): Start date of the requested data; determines whether the discontinued econ_significant column is retrieved.
        retrieve_columns (list | tuple, optional): Columns to retrieve. Defaults to ("publication_date", "document_number", "significant", "econ_significant", "3(f)(1) significant", "Major").
        url (str, optional): URL where the data are located. Defaults to the raw GitHub URL for fr_tracking.csv.

    Returns:
        tuple: Data, column names, max date in dataset
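
    Example (illustrative sketch; reads the live GitHub CSV, so results change over time):
        df, cols, last_date = read_csv_data("2023-04-05")  # retains econ_significant
        df, cols, last_date = read_csv_data("2023-04-06")  # drops econ_significant (EO 14094)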
    """    
    # handle dates formatted as str
    if isinstance(start_date, str):
        start_date = date.fromisoformat(start_date)
    
    # drop econ_significant column for dates on or after EO 14094 
    if start_date >= date.fromisoformat("2023-04-06"):
        cols = [col for col in retrieve_columns if col != "econ_significant"]
    else:
        cols = list(retrieve_columns)
    
    # read csv; fall back to latin-1 encoding if utf-8 decoding fails
    try:
        df_pd = pd_read_csv(url, usecols=cols)
    except UnicodeDecodeError:
        df_pd = pd_read_csv(url, usecols=cols, encoding="latin-1")
    
    # drop duplicates, if any, to avoid a dashboard crash (the original data need to be revised)
    if df_pd.duplicated(subset=["document_number"]).any():
        df_pd = (
            df_pd
            .sort_values(["document_number", "publication_date", "significant", "3(f)(1) significant", "Major"])
            .drop_duplicates(subset=["document_number"], keep="last", ignore_index=True)
            )

    df_pd.loc[:, "publication_dt"] = to_datetime(df_pd["publication_date"], format="mixed", dayfirst=False, yearfirst=False)
    max_date = df_pd["publication_dt"].max().date()
    cols.remove("publication_date")
    df = pl.from_pandas(df_pd.loc[:, cols])
    
    if df.shape[1] == len(cols):
        # rename columns if they exist
        rename_cols = {"3(f)(1) significant": "3f1_significant", "Major": "major"}
        if all(rename in cols for rename in rename_cols):
            df = df.rename(rename_cols)
            cols = [rename_cols.get(col, col) for col in cols]
        
        return df, cols, max_date
    else:
        return None, cols, max_date


def clean_data(
        df: pl.DataFrame, 
        document_numbers: list, 
        *,
        return_optimized_plan: bool = False
    ):
    """Clean data.

    Args:
        df (pl.DataFrame): Input polars dataframe.
        document_numbers (list): List of document numbers to keep.
        return_optimized_plan (bool, optional): Return optimized query plan rather than dataframe. Defaults to False.

    Returns:
        DataFrame | str: Cleaned data (or string representation of the query plan)
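
    Example (illustrative sketch; `sample` is a made-up frame for demonstration):
        sample = pl.DataFrame({"document_number": [" 2024-00149 ", "2024-00300"]})
        print(clean_data(sample, ["2024-00149"], return_optimized_plan=True))
        cleaned = clean_data(sample, ["2024-00149"])  # 1-row DataFrame after stripping whitespace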
    """
    # start a lazy query
    lf = (
        df.lazy()
        # strip whitespace
        .with_columns(pl.col("document_number").str.strip_chars())
        # only keep document_numbers from input
        .filter(pl.col("document_number").is_in(document_numbers))
        )
    
    # return optimized query plan instead of df
    if return_optimized_plan:
        return lf.explain(optimized=True)
    
    # call collect to return df
    return lf.collect()


def merge_with_api_results(
        pd_df: pd_DataFrame, 
        pl_df: pl.DataFrame
    ):
    """Merge significance data with FR API data.

    Args:
        pd_df (pd_DataFrame): Main dataset of FR rules.
        pl_df (pl.DataFrame): Significance data.

    Returns:
        DataFrame: Merged data.
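
    Example (illustrative; made-up frames, each keyed uniquely by document_number
    because validate="1:1" raises on duplicate join keys):
        rules = pd_DataFrame({"document_number": ["2024-00149"], "title": ["Example Rule"]})
        sig = pl.DataFrame({"document_number": ["2024-00149"], "significant": [1]})
        merged = merge_with_api_results(rules, sig)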
    """
    main_df = pl.from_pandas(pd_df)
    df = main_df.join(pl_df, on="document_number", how="left", validate="1:1", coalesce=True)
    return df.to_pandas()


def get_significant_info(input_df: pd_DataFrame, start_date: str, document_numbers: list):
    """Retrieve significance information for input data.

    Args:
        input_df (pd_DataFrame): Input data.
        start_date (str): Start date of data.
        document_numbers (list): Documents to keep.

    Returns:
        tuple[DataFrame, datetime.date]: Data with significance information, max date in dataset
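
    Example (illustrative; performs a network read of the tracking CSV):
        rules = pd_DataFrame({"document_number": ["2024-00149"]})
        merged, last_date = get_significant_info(rules, "2024-01-01", ["2024-00149"])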
    """
    pl_df, _, max_date = read_csv_data(start_date)
    if pl_df is None:
        print("Failed to integrate significance tracking data with retrieved documents.")
        return input_df
    pl_df = clean_data(pl_df, document_numbers)
    pd_df = merge_with_api_results(input_df, pl_df)
    return pd_df, max_date


if __name__ == "__main__":
    
    date_a = "2023-04-05"
    date_b = "2023-04-06"
    numbers = [
        "2021-01303",
        "2023-28006",
        "2024-00149",
        "2024-00089",
        "2023-28828",
        "2024-00300",
        "2024-00045",
        "2024-00192",
        "2024-00228",
        "2024-00187",
        ]

    # test for dates before EO 14094 (econ_significant retained)
    df_a, cols_a, _ = read_csv_data(date_a)
    df_a = clean_data(df_a, numbers)

    # test for dates on or after EO 14094 (econ_significant dropped)
    df_b, cols_b, _ = read_csv_data(date_b)
    df_b = clean_data(df_b, numbers)

    # econ_significant should appear in cols_a but not cols_b
    print(cols_a, cols_b)
    print(df_a.shape, df_b.shape)