import pandas as pd import subprocess import datetime from dateutil import parser import re import os #%% Identify directory path dir_path=os.path.dirname(os.path.realpath(__file__)) #%% Define RD stages and corresponding date columns rd_stages={0:('RD Introduced','introdate'), 1:('RD Passed One Chamber','chamber1passdate'), 2:('RD Passed Two Chambers','chamber2passdate'), 3:('RD Became Law','becamelawdate'), 4:('RD Vetoed by President','vetodate'), 9:('RD Failed','faildate'), 100:('RD Passed Senate','senatevotedate'), 200:('RD Passed House','housevotedate'), 901:('RD Failed Senate','senatevotedate'), 902:('RD Failed House','housevotedate')} #%% Function to read and clean CRA RD dataset def read_cra_data(file_path): # Import CRA data set df_rd=pd.read_csv(file_path) # Refine to 119th congress df_rd=df_rd[df_rd['congress']==119].reset_index(drop=True) # Remove entries with no FR citation df_rd=df_rd[(df_rd['ridentifier']!='.') & (df_rd['ridentifier'].notnull())].reset_index(drop=True) # Covert date format for d in ['introdate', 'housevotedate', 'senatevotedate', 'becamelawdate', 'vetodate', 'voteovervetodate']: df_rd.loc[df_rd[d]=='.',d]=None df_rd[d]=pd.to_datetime(df_rd[d]).dt.strftime('%Y-%m-%d') # Get revised stage dates df_rd.loc[df_rd['chamberpass_atleast1']==1,'chamber1passdate']=df_rd[['housevotedate','senatevotedate']].\ astype('datetime64[ns]').max(axis=1).dt.strftime('%Y-%m-%d') df_rd.loc[df_rd['chamberpass_2']==1,'chamber2passdate']=df_rd[['housevotedate','senatevotedate']].\ astype('datetime64[ns]').max(axis=1).dt.strftime('%Y-%m-%d') df_rd.loc[((df_rd['chambervote_atleast1']==1) & (df_rd['chamberpass_atleast1']==0)) | ((df_rd['chambervote_2']==1) & (df_rd['chamberpass_2']==0)), 'faildate']=df_rd[['housevotedate','senatevotedate']].astype('datetime64[ns]').max(axis=1).dt.strftime('%Y-%m-%d') # Combine detailed stages & dates for downloadable dataset def determine_stages(row): """Determine RD stages based on chamber and voting conditions.""" if row['intro_chamber'] == 'Senate': if (row['senatevotebin'] == row['senatepassbin']) and (row['housevotebin'] == row['housepassbin']): select_stages = [0, 100, 200, 3, 4] elif row['senatevotebin'] != row['senatepassbin']: select_stages = [0, 901] else: select_stages = [0, 100, 902] elif row['intro_chamber'] == 'House': if (row['senatevotebin'] == row['senatepassbin']) and (row['housevotebin'] == row['housepassbin']): select_stages = [0, 200, 100, 3, 4] elif row['housevotebin'] != row['housepassbin']: select_stages = [0, 902] else: select_stages = [0, 200, 901] else: select_stages = [] # Build the 'all_stages' string stages = [ f"{rd_stages[stage][0]} {row[rd_stages[stage][1]]}" for stage in select_stages if pd.notnull(row.get(rd_stages[stage][1])) ] return '; '.join(stages) # Apply the function to the DataFrame without looping df_rd['all_stages'] = df_rd.apply(determine_stages, axis=1) # Convert RDs to a wide format by rule df_rd['rd_seq']=df_rd.sort_values(['ridentifier','introdate']).groupby('ridentifier').cumcount()+1 df_wide = df_rd[['ridentifier','rd','rd_seq','all_stages','rd_link']].set_index(['ridentifier','rd_seq']).unstack() df_wide.columns = df_wide.columns.map(lambda x: '{}_{}'.format(x[0], x[1])) df_wide = df_wide.reset_index() # Get the latest RD date (to be used as data updated date if README date is not available) last_rd_date=df_rd[[rd_stages[k][1] for k in rd_stages]].astype('datetime64[ns]').max().max() # Define latest RD stage variable df_rd['rd_status_no']=df_rd[['chamberpass_atleast1','chamberpass_2']].sum(axis=1) df_rd.loc[df_rd['becamelawbin']==1,'rd_status_no']=3 df_rd.loc[df_rd['vetobin']==1,'rd_status_no']=4 df_rd.loc[((df_rd['chambervote_atleast1']==1) & (df_rd['chamberpass_atleast1']==0)) | ((df_rd['chambervote_2']==1) & (df_rd['chamberpass_2']==0)), 'rd_status_no']=9 # Replace latest RD stage values df_rd['rd_status'] = [rd_stages[k][0] for k in df_rd['rd_status_no']] # Combine latest RD stage and RD date df_rd['rd_status'] = df_rd.apply( lambda row: f"{row['rd_status']} ({row[rd_stages[row['rd_status_no']][1]]})", axis=1) # Group by rule df_rd=df_rd.sort_values(['ridentifier','rd_status_no','introdate'],ascending=False).groupby('ridentifier').\ agg({'introducedbin':'max','rd_status':'first','rd':lambda x: ';\n'.join(x)}).reset_index() # Merge with the wide dataset (individual RD columns) df_rd=df_rd.merge(df_wide,on='ridentifier',how='left') # Rename aggregate columns df_rd.rename(columns={'ridentifier':'citation','introducedbin':'CRA_Target', 'rd_status':'Latest_CRA_Stage','rd': 'RD_No'},inplace=True) # Rename and reorder individual RD columns # Identify max RD index from columns indices = set() for col in df_rd.columns: match = re.search(r'_(\d+)$', col) if match: indices.add(int(match.group(1))) max_index = max(indices) if indices else 0 # Create renaming mapping dynamically rename_mapping={} for i in range(1, max_index + 1): rename_mapping.update({ f'rd_{i}': f'RD{i}_No', f'all_stages_{i}': f'RD{i}_Stage', f'rd_link_{i}': f'RD{i}_Link', }) # Rename RD columns df_rd.rename(columns=rename_mapping, inplace=True) # Reorder columns as RD1_No, RD1_Stage, RD1_Link, RD2_No, RD2_Stage, RD2_Link, ... new_order = [] for i in range(1, max_index + 1): new_order.extend([f'RD{i}_No', f'RD{i}_Stage', f'RD{i}_Link']) # Reorder DataFrame columns df_rd = df_rd[['citation','CRA_Target', 'Latest_CRA_Stage', 'RD_No']+[col for col in new_order if col in df_rd.columns]] return df_rd, last_rd_date #%% Function to merge RD data with FR data def get_rd(df_fr): # Read CRA data set cra_file_path=f'{dir_path}/../congress_data/cra_data.csv' df_rd,_=read_cra_data(cra_file_path) # Merge df_merged=df_fr.merge(df_rd,on='citation',how='left') # Fill null df_merged['CRA_Target']=df_merged['CRA_Target'].fillna(0) return df_merged #%% Function to extract a date from a string def extract_date_after_keyword(text, keyword='data last updated'): # Regular expression to match dates in various formats (YYYY-MM-DD, MM/DD/YYYY, etc.) date_patterns = [ r'\d{4}-\d{2}-\d{2}', # YYYY-MM-DD r'\d{2}/\d{2}/\d{4}', # MM/DD/YYYY r'\d{1}/\d{2}/\d{4}', # M/DD/YYYY r'\d{1}/\d{1}/\d{4}', # M/D/YYYY r'\d{2}-\d{2}-\d{4}', # MM-DD-YYYY r'\d{4}/\d{2}/\d{2}', # YYYY/MM/DD r'\d{2} \w{3,9} \d{4}', # DD Month YYYY r'\w{3,9} \d{2}, \d{4}', # Month DD, YYYY ] # Combine patterns into a single regex date_regex = '|'.join(date_patterns) # Search for the keyword followed by any amount of whitespace and then a date pattern = rf'{re.escape(keyword)}\s*(?:[:\-]?\s*)({date_regex})' match = re.search(pattern, text, re.IGNORECASE) if match: return parser.parse(match.group(1)).strftime("%Y-%m-%d") # Return the matched date return None # Return None if no match is found #%% Function to get last modified date of the RD dataset from README def get_cra_updated_date(cra_file_path=f'{dir_path}/../congress_data/cra_data.csv'): with open(f'{dir_path}/../congress_data/README.md', 'r', encoding='utf-8') as file: text = file.read() cra_last_updated = extract_date_after_keyword(text) if cra_last_updated: return cra_last_updated else: _,last_rd_date=read_cra_data(cra_file_path) return last_rd_date.strftime("%Y-%m-%d") #%% Create objects to import in app CRA_LAST_UPDATED=get_cra_updated_date()