|
import pandas as pd |
|
import subprocess |
|
import datetime |
|
from dateutil import parser |
|
import re |
|
import os |
|
|
|
|
|
dir_path=os.path.dirname(os.path.realpath(__file__)) |
|
|
|
|
|
rd_stages={0:('RD Introduced','introdate'), |
|
1:('RD Passed One Chamber','chamber1passdate'), |
|
2:('RD Passed Two Chambers','chamber2passdate'), |
|
3:('RD Became Law','becamelawdate'), |
|
4:('RD Vetoed by President','vetodate'), |
|
9:('RD Failed','faildate'), |
|
100:('RD Passed Senate','senatevotedate'), |
|
200:('RD Passed House','housevotedate'), |
|
901:('RD Failed Senate','senatevotedate'), |
|
902:('RD Failed House','housevotedate')} |
|
|
|
|
|
def read_cra_data(file_path): |
|
|
|
|
|
df_rd=pd.read_csv(file_path) |
|
|
|
|
|
df_rd=df_rd[df_rd['congress']==119].reset_index(drop=True) |
|
|
|
|
|
df_rd=df_rd[(df_rd['ridentifier']!='.') & (df_rd['ridentifier'].notnull())].reset_index(drop=True) |
|
|
|
|
|
for d in ['introdate', 'housevotedate', 'senatevotedate', 'becamelawdate', 'vetodate', 'voteovervetodate']: |
|
df_rd.loc[df_rd[d]=='.',d]=None |
|
df_rd[d]=pd.to_datetime(df_rd[d]).dt.strftime('%Y-%m-%d') |
|
|
|
|
|
df_rd.loc[df_rd['chamberpass_atleast1']==1,'chamber1passdate']=df_rd[['housevotedate','senatevotedate']].\ |
|
astype('datetime64[ns]').max(axis=1).dt.strftime('%Y-%m-%d') |
|
df_rd.loc[df_rd['chamberpass_2']==1,'chamber2passdate']=df_rd[['housevotedate','senatevotedate']].\ |
|
astype('datetime64[ns]').max(axis=1).dt.strftime('%Y-%m-%d') |
|
df_rd.loc[((df_rd['chambervote_atleast1']==1) & (df_rd['chamberpass_atleast1']==0)) | |
|
((df_rd['chambervote_2']==1) & (df_rd['chamberpass_2']==0)), |
|
'faildate']=df_rd[['housevotedate','senatevotedate']].astype('datetime64[ns]').max(axis=1).dt.strftime('%Y-%m-%d') |
|
|
|
|
|
def determine_stages(row): |
|
"""Determine RD stages based on chamber and voting conditions.""" |
|
if row['intro_chamber'] == 'Senate': |
|
if (row['senatevotebin'] == row['senatepassbin']) and (row['housevotebin'] == row['housepassbin']): |
|
select_stages = [0, 100, 200, 3, 4] |
|
elif row['senatevotebin'] != row['senatepassbin']: |
|
select_stages = [0, 901] |
|
else: |
|
select_stages = [0, 100, 902] |
|
elif row['intro_chamber'] == 'House': |
|
if (row['senatevotebin'] == row['senatepassbin']) and (row['housevotebin'] == row['housepassbin']): |
|
select_stages = [0, 200, 100, 3, 4] |
|
elif row['housevotebin'] != row['housepassbin']: |
|
select_stages = [0, 902] |
|
else: |
|
select_stages = [0, 200, 901] |
|
else: |
|
select_stages = [] |
|
|
|
|
|
stages = [ |
|
f"{rd_stages[stage][0]} {row[rd_stages[stage][1]]}" |
|
for stage in select_stages |
|
if pd.notnull(row.get(rd_stages[stage][1])) |
|
] |
|
|
|
return '; '.join(stages) |
|
|
|
|
|
df_rd['all_stages'] = df_rd.apply(determine_stages, axis=1) |
|
|
|
|
|
df_rd['rd_seq']=df_rd.sort_values(['ridentifier','introdate']).groupby('ridentifier').cumcount()+1 |
|
df_wide = df_rd[['ridentifier','rd','rd_seq','all_stages','rd_link']].set_index(['ridentifier','rd_seq']).unstack() |
|
df_wide.columns = df_wide.columns.map(lambda x: '{}_{}'.format(x[0], x[1])) |
|
df_wide = df_wide.reset_index() |
|
|
|
|
|
last_rd_date=df_rd[[rd_stages[k][1] for k in rd_stages]].astype('datetime64[ns]').max().max() |
|
|
|
|
|
df_rd['rd_status_no']=df_rd[['chamberpass_atleast1','chamberpass_2']].sum(axis=1) |
|
df_rd.loc[df_rd['becamelawbin']==1,'rd_status_no']=3 |
|
df_rd.loc[df_rd['vetobin']==1,'rd_status_no']=4 |
|
df_rd.loc[((df_rd['chambervote_atleast1']==1) & (df_rd['chamberpass_atleast1']==0)) | |
|
((df_rd['chambervote_2']==1) & (df_rd['chamberpass_2']==0)), |
|
'rd_status_no']=9 |
|
|
|
|
|
df_rd['rd_status'] = [rd_stages[k][0] for k in df_rd['rd_status_no']] |
|
|
|
|
|
df_rd['rd_status'] = df_rd.apply( |
|
lambda row: f"{row['rd_status']} ({row[rd_stages[row['rd_status_no']][1]]})", |
|
axis=1) |
|
|
|
|
|
df_rd=df_rd.sort_values(['ridentifier','rd_status_no','introdate'],ascending=False).groupby('ridentifier').\ |
|
agg({'introducedbin':'max','rd_status':'first','rd':lambda x: ';\n'.join(x)}).reset_index() |
|
|
|
|
|
df_rd=df_rd.merge(df_wide,on='ridentifier',how='left') |
|
|
|
|
|
df_rd.rename(columns={'ridentifier':'citation','introducedbin':'CRA_Target', |
|
'rd_status':'Latest_CRA_Stage','rd': 'RD_No'},inplace=True) |
|
|
|
|
|
|
|
indices = set() |
|
for col in df_rd.columns: |
|
match = re.search(r'_(\d+)$', col) |
|
if match: |
|
indices.add(int(match.group(1))) |
|
max_index = max(indices) if indices else 0 |
|
|
|
|
|
rename_mapping={} |
|
for i in range(1, max_index + 1): |
|
rename_mapping.update({ |
|
f'rd_{i}': f'RD{i}_No', |
|
f'all_stages_{i}': f'RD{i}_Stage', |
|
f'rd_link_{i}': f'RD{i}_Link', |
|
}) |
|
|
|
|
|
df_rd.rename(columns=rename_mapping, inplace=True) |
|
|
|
|
|
new_order = [] |
|
for i in range(1, max_index + 1): |
|
new_order.extend([f'RD{i}_No', f'RD{i}_Stage', f'RD{i}_Link']) |
|
|
|
|
|
df_rd = df_rd[['citation','CRA_Target', 'Latest_CRA_Stage', 'RD_No']+[col for col in new_order if col in df_rd.columns]] |
|
|
|
return df_rd, last_rd_date |
|
|
|
|
|
def get_rd(df_fr): |
|
|
|
cra_file_path=f'{dir_path}/../congress_data/cra_data.csv' |
|
df_rd,_=read_cra_data(cra_file_path) |
|
|
|
|
|
df_merged=df_fr.merge(df_rd,on='citation',how='left') |
|
|
|
|
|
df_merged['CRA_Target']=df_merged['CRA_Target'].fillna(0) |
|
|
|
return df_merged |
|
|
|
|
|
def extract_date_after_keyword(text, keyword='data last updated'): |
|
|
|
date_patterns = [ |
|
r'\d{4}-\d{2}-\d{2}', |
|
r'\d{2}/\d{2}/\d{4}', |
|
r'\d{1}/\d{2}/\d{4}', |
|
r'\d{1}/\d{1}/\d{4}', |
|
r'\d{2}-\d{2}-\d{4}', |
|
r'\d{4}/\d{2}/\d{2}', |
|
r'\d{2} \w{3,9} \d{4}', |
|
r'\w{3,9} \d{2}, \d{4}', |
|
] |
|
|
|
|
|
date_regex = '|'.join(date_patterns) |
|
|
|
|
|
pattern = rf'{re.escape(keyword)}\s*(?:[:\-]?\s*)({date_regex})' |
|
match = re.search(pattern, text, re.IGNORECASE) |
|
|
|
if match: |
|
return parser.parse(match.group(1)).strftime("%Y-%m-%d") |
|
return None |
|
|
|
|
|
def get_cra_updated_date(cra_file_path=f'{dir_path}/../congress_data/cra_data.csv'): |
|
with open(f'{dir_path}/../congress_data/README.md', 'r', encoding='utf-8') as file: |
|
text = file.read() |
|
cra_last_updated = extract_date_after_keyword(text) |
|
|
|
if cra_last_updated: |
|
return cra_last_updated |
|
else: |
|
_,last_rd_date=read_cra_data(cra_file_path) |
|
return last_rd_date.strftime("%Y-%m-%d") |
|
|
|
|
|
CRA_LAST_UPDATED=get_cra_updated_date() |