File size: 8,320 Bytes
761e2b2 64cb2a0 4211da7 764bf24 761e2b2 3d723f7 c7e3855 64cb2a0 761e2b2 3d723f7 c7e3855 3d723f7 c7e3855 3d723f7 c7e3855 3d723f7 c7e3855 761e2b2 3d723f7 64cb2a0 3d723f7 761e2b2 3d723f7 761e2b2 c7e3855 3d723f7 c7e3855 761e2b2 c7e3855 761e2b2 3d723f7 c7e3855 761e2b2 64cb2a0 761e2b2 764bf24 64cb2a0 761e2b2 64cb2a0 c7e3855 761e2b2 4211da7 ce7f31f 4211da7 764bf24 4211da7 64cb2a0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 |
import pandas as pd
import subprocess
import datetime
from dateutil import parser
import re
import os
#%% Identify directory path
dir_path=os.path.dirname(os.path.realpath(__file__))
#%% Define RD stages and corresponding date columns
rd_stages={0:('RD Introduced','introdate'),
1:('RD Passed One Chamber','chamber1passdate'),
2:('RD Passed Two Chambers','chamber2passdate'),
3:('RD Became Law','becamelawdate'),
4:('RD Vetoed by President','vetodate'),
9:('RD Failed','faildate'),
100:('RD Passed Senate','senatevotedate'),
200:('RD Passed House','housevotedate'),
901:('RD Failed Senate','senatevotedate'),
902:('RD Failed House','housevotedate')}
#%% Function to read and clean CRA RD dataset
def read_cra_data(file_path):
# Import CRA data set
df_rd=pd.read_csv(file_path)
# Refine to 119th congress
df_rd=df_rd[df_rd['congress']==119].reset_index(drop=True)
# Remove entries with no FR citation
df_rd=df_rd[(df_rd['ridentifier']!='.') & (df_rd['ridentifier'].notnull())].reset_index(drop=True)
# Covert date format
for d in ['introdate', 'housevotedate', 'senatevotedate', 'becamelawdate', 'vetodate', 'voteovervetodate']:
df_rd.loc[df_rd[d]=='.',d]=None
df_rd[d]=pd.to_datetime(df_rd[d]).dt.strftime('%Y-%m-%d')
# Get revised stage dates
df_rd.loc[df_rd['chamberpass_atleast1']==1,'chamber1passdate']=df_rd[['housevotedate','senatevotedate']].\
astype('datetime64[ns]').max(axis=1).dt.strftime('%Y-%m-%d')
df_rd.loc[df_rd['chamberpass_2']==1,'chamber2passdate']=df_rd[['housevotedate','senatevotedate']].\
astype('datetime64[ns]').max(axis=1).dt.strftime('%Y-%m-%d')
df_rd.loc[((df_rd['chambervote_atleast1']==1) & (df_rd['chamberpass_atleast1']==0)) |
((df_rd['chambervote_2']==1) & (df_rd['chamberpass_2']==0)),
'faildate']=df_rd[['housevotedate','senatevotedate']].astype('datetime64[ns]').max(axis=1).dt.strftime('%Y-%m-%d')
# Combine detailed stages & dates for downloadable dataset
def determine_stages(row):
"""Determine RD stages based on chamber and voting conditions."""
if row['intro_chamber'] == 'Senate':
if (row['senatevotebin'] == row['senatepassbin']) and (row['housevotebin'] == row['housepassbin']):
select_stages = [0, 100, 200, 3, 4]
elif row['senatevotebin'] != row['senatepassbin']:
select_stages = [0, 901]
else:
select_stages = [0, 100, 902]
elif row['intro_chamber'] == 'House':
if (row['senatevotebin'] == row['senatepassbin']) and (row['housevotebin'] == row['housepassbin']):
select_stages = [0, 200, 100, 3, 4]
elif row['housevotebin'] != row['housepassbin']:
select_stages = [0, 902]
else:
select_stages = [0, 200, 901]
else:
select_stages = []
# Build the 'all_stages' string
stages = [
f"{rd_stages[stage][0]} {row[rd_stages[stage][1]]}"
for stage in select_stages
if pd.notnull(row.get(rd_stages[stage][1]))
]
return '; '.join(stages)
# Apply the function to the DataFrame without looping
df_rd['all_stages'] = df_rd.apply(determine_stages, axis=1)
# Convert RDs to a wide format by rule
df_rd['rd_seq']=df_rd.sort_values(['ridentifier','introdate']).groupby('ridentifier').cumcount()+1
df_wide = df_rd[['ridentifier','rd','rd_seq','all_stages','rd_link']].set_index(['ridentifier','rd_seq']).unstack()
df_wide.columns = df_wide.columns.map(lambda x: '{}_{}'.format(x[0], x[1]))
df_wide = df_wide.reset_index()
# Get the latest RD date (to be used as data updated date if README date is not available)
last_rd_date=df_rd[[rd_stages[k][1] for k in rd_stages]].astype('datetime64[ns]').max().max()
# Define latest RD stage variable
df_rd['rd_status_no']=df_rd[['chamberpass_atleast1','chamberpass_2']].sum(axis=1)
df_rd.loc[df_rd['becamelawbin']==1,'rd_status_no']=3
df_rd.loc[df_rd['vetobin']==1,'rd_status_no']=4
df_rd.loc[((df_rd['chambervote_atleast1']==1) & (df_rd['chamberpass_atleast1']==0)) |
((df_rd['chambervote_2']==1) & (df_rd['chamberpass_2']==0)),
'rd_status_no']=9
# Replace latest RD stage values
df_rd['rd_status'] = [rd_stages[k][0] for k in df_rd['rd_status_no']]
# Combine latest RD stage and RD date
df_rd['rd_status'] = df_rd.apply(
lambda row: f"{row['rd_status']} ({row[rd_stages[row['rd_status_no']][1]]})",
axis=1)
# Group by rule
df_rd=df_rd.sort_values(['ridentifier','rd_status_no','introdate'],ascending=False).groupby('ridentifier').\
agg({'introducedbin':'max','rd_status':'first','rd':lambda x: ';\n'.join(x)}).reset_index()
# Merge with the wide dataset (individual RD columns)
df_rd=df_rd.merge(df_wide,on='ridentifier',how='left')
# Rename aggregate columns
df_rd.rename(columns={'ridentifier':'citation','introducedbin':'CRA_Target',
'rd_status':'Latest_CRA_Stage','rd': 'RD_No'},inplace=True)
# Rename and reorder individual RD columns
# Identify max RD index from columns
indices = set()
for col in df_rd.columns:
match = re.search(r'_(\d+)$', col)
if match:
indices.add(int(match.group(1)))
max_index = max(indices) if indices else 0
# Create renaming mapping dynamically
rename_mapping={}
for i in range(1, max_index + 1):
rename_mapping.update({
f'rd_{i}': f'RD{i}_No',
f'all_stages_{i}': f'RD{i}_Stage',
f'rd_link_{i}': f'RD{i}_Link',
})
# Rename RD columns
df_rd.rename(columns=rename_mapping, inplace=True)
# Reorder columns as RD1_No, RD1_Stage, RD1_Link, RD2_No, RD2_Stage, RD2_Link, ...
new_order = []
for i in range(1, max_index + 1):
new_order.extend([f'RD{i}_No', f'RD{i}_Stage', f'RD{i}_Link'])
# Reorder DataFrame columns
df_rd = df_rd[['citation','CRA_Target', 'Latest_CRA_Stage', 'RD_No']+[col for col in new_order if col in df_rd.columns]]
return df_rd, last_rd_date
#%% Function to merge RD data with FR data
def get_rd(df_fr):
# Read CRA data set
cra_file_path=f'{dir_path}/../congress_data/cra_data.csv'
df_rd,_=read_cra_data(cra_file_path)
# Merge
df_merged=df_fr.merge(df_rd,on='citation',how='left')
# Fill null
df_merged['CRA_Target']=df_merged['CRA_Target'].fillna(0)
return df_merged
#%% Function to extract a date from a string
def extract_date_after_keyword(text, keyword='data last updated'):
# Regular expression to match dates in various formats (YYYY-MM-DD, MM/DD/YYYY, etc.)
date_patterns = [
r'\d{4}-\d{2}-\d{2}', # YYYY-MM-DD
r'\d{2}/\d{2}/\d{4}', # MM/DD/YYYY
r'\d{1}/\d{2}/\d{4}', # M/DD/YYYY
r'\d{1}/\d{1}/\d{4}', # M/D/YYYY
r'\d{2}-\d{2}-\d{4}', # MM-DD-YYYY
r'\d{4}/\d{2}/\d{2}', # YYYY/MM/DD
r'\d{2} \w{3,9} \d{4}', # DD Month YYYY
r'\w{3,9} \d{2}, \d{4}', # Month DD, YYYY
]
# Combine patterns into a single regex
date_regex = '|'.join(date_patterns)
# Search for the keyword followed by any amount of whitespace and then a date
pattern = rf'{re.escape(keyword)}\s*(?:[:\-]?\s*)({date_regex})'
match = re.search(pattern, text, re.IGNORECASE)
if match:
return parser.parse(match.group(1)).strftime("%Y-%m-%d") # Return the matched date
return None # Return None if no match is found
#%% Function to get last modified date of the RD dataset from README
def get_cra_updated_date(cra_file_path=f'{dir_path}/../congress_data/cra_data.csv'):
with open(f'{dir_path}/../congress_data/README.md', 'r', encoding='utf-8') as file:
text = file.read()
cra_last_updated = extract_date_after_keyword(text)
if cra_last_updated:
return cra_last_updated
else:
_,last_rd_date=read_cra_data(cra_file_path)
return last_rd_date.strftime("%Y-%m-%d")
#%% Create objects to import in app
CRA_LAST_UPDATED=get_cra_updated_date() |