Spaces:

regulatorystudies
/

cra-window-rules

Running

File size: 8,320 Bytes

import pandas as pd
import subprocess
import datetime
from dateutil import parser
import re
import os

#%% Identify directory path
dir_path=os.path.dirname(os.path.realpath(__file__))

#%% Define RD stages and corresponding date columns
rd_stages={0:('RD Introduced','introdate'),
           1:('RD Passed One Chamber','chamber1passdate'),
           2:('RD Passed Two Chambers','chamber2passdate'),
           3:('RD Became Law','becamelawdate'),
           4:('RD Vetoed by President','vetodate'),
           9:('RD Failed','faildate'),
           100:('RD Passed Senate','senatevotedate'),
           200:('RD Passed House','housevotedate'),
           901:('RD Failed Senate','senatevotedate'),
           902:('RD Failed House','housevotedate')}

#%% Function to read and clean CRA RD dataset
def read_cra_data(file_path):
    
    # Import CRA data set
    df_rd=pd.read_csv(file_path)

    # Refine to 119th congress
    df_rd=df_rd[df_rd['congress']==119].reset_index(drop=True)

    # Remove entries with no FR citation
    df_rd=df_rd[(df_rd['ridentifier']!='.') & (df_rd['ridentifier'].notnull())].reset_index(drop=True)

    # Covert date format
    for d in ['introdate', 'housevotedate', 'senatevotedate', 'becamelawdate', 'vetodate', 'voteovervetodate']:
         df_rd.loc[df_rd[d]=='.',d]=None
         df_rd[d]=pd.to_datetime(df_rd[d]).dt.strftime('%Y-%m-%d')

    # Get revised stage dates
    df_rd.loc[df_rd['chamberpass_atleast1']==1,'chamber1passdate']=df_rd[['housevotedate','senatevotedate']].\
            astype('datetime64[ns]').max(axis=1).dt.strftime('%Y-%m-%d')
    df_rd.loc[df_rd['chamberpass_2']==1,'chamber2passdate']=df_rd[['housevotedate','senatevotedate']].\
            astype('datetime64[ns]').max(axis=1).dt.strftime('%Y-%m-%d')
    df_rd.loc[((df_rd['chambervote_atleast1']==1) & (df_rd['chamberpass_atleast1']==0)) | 
              ((df_rd['chambervote_2']==1) & (df_rd['chamberpass_2']==0)),
              'faildate']=df_rd[['housevotedate','senatevotedate']].astype('datetime64[ns]').max(axis=1).dt.strftime('%Y-%m-%d')

    # Combine detailed stages & dates for downloadable dataset
    def determine_stages(row):
        """Determine RD stages based on chamber and voting conditions."""
        if row['intro_chamber'] == 'Senate':
            if (row['senatevotebin'] == row['senatepassbin']) and (row['housevotebin'] == row['housepassbin']):
                select_stages = [0, 100, 200, 3, 4]
            elif row['senatevotebin'] != row['senatepassbin']:
                select_stages = [0, 901]
            else:
                select_stages = [0, 100, 902]
        elif row['intro_chamber'] == 'House':
            if (row['senatevotebin'] == row['senatepassbin']) and (row['housevotebin'] == row['housepassbin']):
                select_stages = [0, 200, 100, 3, 4]
            elif row['housevotebin'] != row['housepassbin']:
                select_stages = [0, 902]
            else:
                select_stages = [0, 200, 901]
        else:
            select_stages = []

        # Build the 'all_stages' string
        stages = [
            f"{rd_stages[stage][0]} {row[rd_stages[stage][1]]}" 
            for stage in select_stages 
            if pd.notnull(row.get(rd_stages[stage][1]))
        ]

        return '; '.join(stages)

    # Apply the function to the DataFrame without looping
    df_rd['all_stages'] = df_rd.apply(determine_stages, axis=1)

    # Convert RDs to a wide format by rule
    df_rd['rd_seq']=df_rd.sort_values(['ridentifier','introdate']).groupby('ridentifier').cumcount()+1
    df_wide = df_rd[['ridentifier','rd','rd_seq','all_stages','rd_link']].set_index(['ridentifier','rd_seq']).unstack()
    df_wide.columns = df_wide.columns.map(lambda x: '{}_{}'.format(x[0], x[1]))
    df_wide = df_wide.reset_index()

    # Get the latest RD date (to be used as data updated date if README date is not available)
    last_rd_date=df_rd[[rd_stages[k][1] for k in rd_stages]].astype('datetime64[ns]').max().max()

    # Define latest RD stage variable
    df_rd['rd_status_no']=df_rd[['chamberpass_atleast1','chamberpass_2']].sum(axis=1)
    df_rd.loc[df_rd['becamelawbin']==1,'rd_status_no']=3
    df_rd.loc[df_rd['vetobin']==1,'rd_status_no']=4
    df_rd.loc[((df_rd['chambervote_atleast1']==1) & (df_rd['chamberpass_atleast1']==0)) | 
              ((df_rd['chambervote_2']==1) & (df_rd['chamberpass_2']==0)),
              'rd_status_no']=9

    # Replace latest RD stage values
    df_rd['rd_status'] = [rd_stages[k][0] for k in df_rd['rd_status_no']]

    # Combine latest RD stage and RD date
    df_rd['rd_status'] = df_rd.apply(
        lambda row: f"{row['rd_status']} ({row[rd_stages[row['rd_status_no']][1]]})", 
        axis=1)

    # Group by rule
    df_rd=df_rd.sort_values(['ridentifier','rd_status_no','introdate'],ascending=False).groupby('ridentifier').\
            agg({'introducedbin':'max','rd_status':'first','rd':lambda x: ';\n'.join(x)}).reset_index()

    # Merge with the wide dataset (individual RD columns)
    df_rd=df_rd.merge(df_wide,on='ridentifier',how='left')

    # Rename aggregate columns
    df_rd.rename(columns={'ridentifier':'citation','introducedbin':'CRA_Target',
                          'rd_status':'Latest_CRA_Stage','rd': 'RD_No'},inplace=True)

    # Rename and reorder individual RD columns
    # Identify max RD index from columns
    indices = set()
    for col in df_rd.columns:
        match = re.search(r'_(\d+)$', col)
        if match:
            indices.add(int(match.group(1)))
    max_index = max(indices) if indices else 0

    # Create renaming mapping dynamically
    rename_mapping={}
    for i in range(1, max_index + 1):
        rename_mapping.update({
            f'rd_{i}': f'RD{i}_No',
            f'all_stages_{i}': f'RD{i}_Stage',
            f'rd_link_{i}': f'RD{i}_Link', 
        })

    # Rename RD columns
    df_rd.rename(columns=rename_mapping, inplace=True)

    # Reorder columns as RD1_No, RD1_Stage, RD1_Link, RD2_No, RD2_Stage, RD2_Link, ...
    new_order = []
    for i in range(1, max_index + 1):
        new_order.extend([f'RD{i}_No', f'RD{i}_Stage', f'RD{i}_Link'])

    # Reorder DataFrame columns
    df_rd = df_rd[['citation','CRA_Target', 'Latest_CRA_Stage', 'RD_No']+[col for col in new_order if col in df_rd.columns]]

    return df_rd, last_rd_date

#%% Function to merge RD data with FR data
def get_rd(df_fr):
    # Read CRA data set
    cra_file_path=f'{dir_path}/../congress_data/cra_data.csv'
    df_rd,_=read_cra_data(cra_file_path)

    # Merge
    df_merged=df_fr.merge(df_rd,on='citation',how='left')

    # Fill null
    df_merged['CRA_Target']=df_merged['CRA_Target'].fillna(0)

    return df_merged

#%% Function to extract a date from a string
def extract_date_after_keyword(text, keyword='data last updated'):
    # Regular expression to match dates in various formats (YYYY-MM-DD, MM/DD/YYYY, etc.)
    date_patterns = [
        r'\d{4}-\d{2}-\d{2}',  # YYYY-MM-DD
        r'\d{2}/\d{2}/\d{4}',  # MM/DD/YYYY
        r'\d{1}/\d{2}/\d{4}',  # M/DD/YYYY
        r'\d{1}/\d{1}/\d{4}',  # M/D/YYYY
        r'\d{2}-\d{2}-\d{4}',  # MM-DD-YYYY
        r'\d{4}/\d{2}/\d{2}',  # YYYY/MM/DD
        r'\d{2} \w{3,9} \d{4}',  # DD Month YYYY
        r'\w{3,9} \d{2}, \d{4}',  # Month DD, YYYY
    ]

    # Combine patterns into a single regex
    date_regex = '|'.join(date_patterns)

    # Search for the keyword followed by any amount of whitespace and then a date
    pattern = rf'{re.escape(keyword)}\s*(?:[:\-]?\s*)({date_regex})'
    match = re.search(pattern, text, re.IGNORECASE)
    
    if match:
        return parser.parse(match.group(1)).strftime("%Y-%m-%d")      # Return the matched date
    return None  # Return None if no match is found

#%% Function to get last modified date of the RD dataset from README
def get_cra_updated_date(cra_file_path=f'{dir_path}/../congress_data/cra_data.csv'):
    with open(f'{dir_path}/../congress_data/README.md', 'r', encoding='utf-8') as file:
            text = file.read()
    cra_last_updated = extract_date_after_keyword(text)

    if cra_last_updated:
        return cra_last_updated
    else:
        _,last_rd_date=read_cra_data(cra_file_path)
        return last_rd_date.strftime("%Y-%m-%d")
    
#%% Create objects to import in app
CRA_LAST_UPDATED=get_cra_updated_date()