cra-window-rules / modules /get_resolutions.py
zhoudanxie's picture
Add an alternative date format for CRA data updated date
ce7f31f
import pandas as pd
import subprocess
import datetime
from dateutil import parser
import re
import os
#%% Identify directory path
dir_path=os.path.dirname(os.path.realpath(__file__))
#%% Define RD stages and corresponding date columns
rd_stages={0:('RD Introduced','introdate'),
1:('RD Passed One Chamber','chamber1passdate'),
2:('RD Passed Two Chambers','chamber2passdate'),
3:('RD Became Law','becamelawdate'),
4:('RD Vetoed by President','vetodate'),
9:('RD Failed','faildate'),
100:('RD Passed Senate','senatevotedate'),
200:('RD Passed House','housevotedate'),
901:('RD Failed Senate','senatevotedate'),
902:('RD Failed House','housevotedate')}
#%% Function to read and clean CRA RD dataset
def read_cra_data(file_path):
# Import CRA data set
df_rd=pd.read_csv(file_path)
# Refine to 119th congress
df_rd=df_rd[df_rd['congress']==119].reset_index(drop=True)
# Remove entries with no FR citation
df_rd=df_rd[(df_rd['ridentifier']!='.') & (df_rd['ridentifier'].notnull())].reset_index(drop=True)
# Covert date format
for d in ['introdate', 'housevotedate', 'senatevotedate', 'becamelawdate', 'vetodate', 'voteovervetodate']:
df_rd.loc[df_rd[d]=='.',d]=None
df_rd[d]=pd.to_datetime(df_rd[d]).dt.strftime('%Y-%m-%d')
# Get revised stage dates
df_rd.loc[df_rd['chamberpass_atleast1']==1,'chamber1passdate']=df_rd[['housevotedate','senatevotedate']].\
astype('datetime64[ns]').max(axis=1).dt.strftime('%Y-%m-%d')
df_rd.loc[df_rd['chamberpass_2']==1,'chamber2passdate']=df_rd[['housevotedate','senatevotedate']].\
astype('datetime64[ns]').max(axis=1).dt.strftime('%Y-%m-%d')
df_rd.loc[((df_rd['chambervote_atleast1']==1) & (df_rd['chamberpass_atleast1']==0)) |
((df_rd['chambervote_2']==1) & (df_rd['chamberpass_2']==0)),
'faildate']=df_rd[['housevotedate','senatevotedate']].astype('datetime64[ns]').max(axis=1).dt.strftime('%Y-%m-%d')
# Combine detailed stages & dates for downloadable dataset
def determine_stages(row):
"""Determine RD stages based on chamber and voting conditions."""
if row['intro_chamber'] == 'Senate':
if (row['senatevotebin'] == row['senatepassbin']) and (row['housevotebin'] == row['housepassbin']):
select_stages = [0, 100, 200, 3, 4]
elif row['senatevotebin'] != row['senatepassbin']:
select_stages = [0, 901]
else:
select_stages = [0, 100, 902]
elif row['intro_chamber'] == 'House':
if (row['senatevotebin'] == row['senatepassbin']) and (row['housevotebin'] == row['housepassbin']):
select_stages = [0, 200, 100, 3, 4]
elif row['housevotebin'] != row['housepassbin']:
select_stages = [0, 902]
else:
select_stages = [0, 200, 901]
else:
select_stages = []
# Build the 'all_stages' string
stages = [
f"{rd_stages[stage][0]} {row[rd_stages[stage][1]]}"
for stage in select_stages
if pd.notnull(row.get(rd_stages[stage][1]))
]
return '; '.join(stages)
# Apply the function to the DataFrame without looping
df_rd['all_stages'] = df_rd.apply(determine_stages, axis=1)
# Convert RDs to a wide format by rule
df_rd['rd_seq']=df_rd.sort_values(['ridentifier','introdate']).groupby('ridentifier').cumcount()+1
df_wide = df_rd[['ridentifier','rd','rd_seq','all_stages','rd_link']].set_index(['ridentifier','rd_seq']).unstack()
df_wide.columns = df_wide.columns.map(lambda x: '{}_{}'.format(x[0], x[1]))
df_wide = df_wide.reset_index()
# Get the latest RD date (to be used as data updated date if README date is not available)
last_rd_date=df_rd[[rd_stages[k][1] for k in rd_stages]].astype('datetime64[ns]').max().max()
# Define latest RD stage variable
df_rd['rd_status_no']=df_rd[['chamberpass_atleast1','chamberpass_2']].sum(axis=1)
df_rd.loc[df_rd['becamelawbin']==1,'rd_status_no']=3
df_rd.loc[df_rd['vetobin']==1,'rd_status_no']=4
df_rd.loc[((df_rd['chambervote_atleast1']==1) & (df_rd['chamberpass_atleast1']==0)) |
((df_rd['chambervote_2']==1) & (df_rd['chamberpass_2']==0)),
'rd_status_no']=9
# Replace latest RD stage values
df_rd['rd_status'] = [rd_stages[k][0] for k in df_rd['rd_status_no']]
# Combine latest RD stage and RD date
df_rd['rd_status'] = df_rd.apply(
lambda row: f"{row['rd_status']} ({row[rd_stages[row['rd_status_no']][1]]})",
axis=1)
# Group by rule
df_rd=df_rd.sort_values(['ridentifier','rd_status_no','introdate'],ascending=False).groupby('ridentifier').\
agg({'introducedbin':'max','rd_status':'first','rd':lambda x: ';\n'.join(x)}).reset_index()
# Merge with the wide dataset (individual RD columns)
df_rd=df_rd.merge(df_wide,on='ridentifier',how='left')
# Rename aggregate columns
df_rd.rename(columns={'ridentifier':'citation','introducedbin':'CRA_Target',
'rd_status':'Latest_CRA_Stage','rd': 'RD_No'},inplace=True)
# Rename and reorder individual RD columns
# Identify max RD index from columns
indices = set()
for col in df_rd.columns:
match = re.search(r'_(\d+)$', col)
if match:
indices.add(int(match.group(1)))
max_index = max(indices) if indices else 0
# Create renaming mapping dynamically
rename_mapping={}
for i in range(1, max_index + 1):
rename_mapping.update({
f'rd_{i}': f'RD{i}_No',
f'all_stages_{i}': f'RD{i}_Stage',
f'rd_link_{i}': f'RD{i}_Link',
})
# Rename RD columns
df_rd.rename(columns=rename_mapping, inplace=True)
# Reorder columns as RD1_No, RD1_Stage, RD1_Link, RD2_No, RD2_Stage, RD2_Link, ...
new_order = []
for i in range(1, max_index + 1):
new_order.extend([f'RD{i}_No', f'RD{i}_Stage', f'RD{i}_Link'])
# Reorder DataFrame columns
df_rd = df_rd[['citation','CRA_Target', 'Latest_CRA_Stage', 'RD_No']+[col for col in new_order if col in df_rd.columns]]
return df_rd, last_rd_date
#%% Function to merge RD data with FR data
def get_rd(df_fr):
# Read CRA data set
cra_file_path=f'{dir_path}/../congress_data/cra_data.csv'
df_rd,_=read_cra_data(cra_file_path)
# Merge
df_merged=df_fr.merge(df_rd,on='citation',how='left')
# Fill null
df_merged['CRA_Target']=df_merged['CRA_Target'].fillna(0)
return df_merged
#%% Function to extract a date from a string
def extract_date_after_keyword(text, keyword='data last updated'):
# Regular expression to match dates in various formats (YYYY-MM-DD, MM/DD/YYYY, etc.)
date_patterns = [
r'\d{4}-\d{2}-\d{2}', # YYYY-MM-DD
r'\d{2}/\d{2}/\d{4}', # MM/DD/YYYY
r'\d{1}/\d{2}/\d{4}', # M/DD/YYYY
r'\d{1}/\d{1}/\d{4}', # M/D/YYYY
r'\d{2}-\d{2}-\d{4}', # MM-DD-YYYY
r'\d{4}/\d{2}/\d{2}', # YYYY/MM/DD
r'\d{2} \w{3,9} \d{4}', # DD Month YYYY
r'\w{3,9} \d{2}, \d{4}', # Month DD, YYYY
]
# Combine patterns into a single regex
date_regex = '|'.join(date_patterns)
# Search for the keyword followed by any amount of whitespace and then a date
pattern = rf'{re.escape(keyword)}\s*(?:[:\-]?\s*)({date_regex})'
match = re.search(pattern, text, re.IGNORECASE)
if match:
return parser.parse(match.group(1)).strftime("%Y-%m-%d") # Return the matched date
return None # Return None if no match is found
#%% Function to get last modified date of the RD dataset from README
def get_cra_updated_date(cra_file_path=f'{dir_path}/../congress_data/cra_data.csv'):
with open(f'{dir_path}/../congress_data/README.md', 'r', encoding='utf-8') as file:
text = file.read()
cra_last_updated = extract_date_after_keyword(text)
if cra_last_updated:
return cra_last_updated
else:
_,last_rd_date=read_cra_data(cra_file_path)
return last_rd_date.strftime("%Y-%m-%d")
#%% Create objects to import in app
CRA_LAST_UPDATED=get_cra_updated_date()