File size: 8,320 Bytes
761e2b2
64cb2a0
 
4211da7
 
764bf24
 
 
 
761e2b2
3d723f7
 
 
 
 
 
 
 
 
 
 
c7e3855
64cb2a0
761e2b2
 
 
 
 
3d723f7
 
 
 
 
 
c7e3855
3d723f7
 
c7e3855
 
3d723f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c7e3855
3d723f7
c7e3855
 
 
 
761e2b2
3d723f7
 
64cb2a0
3d723f7
 
 
 
 
 
 
761e2b2
3d723f7
 
761e2b2
c7e3855
 
3d723f7
c7e3855
761e2b2
 
c7e3855
 
761e2b2
3d723f7
c7e3855
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
761e2b2
64cb2a0
 
 
 
761e2b2
764bf24
64cb2a0
761e2b2
 
 
 
64cb2a0
c7e3855
761e2b2
 
 
4211da7
 
 
 
 
 
 
ce7f31f
4211da7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
764bf24
 
4211da7
 
64cb2a0
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import pandas as pd
import subprocess
import datetime
from dateutil import parser
import re
import os

#%% Identify directory path
dir_path=os.path.dirname(os.path.realpath(__file__))

#%% Define RD stages and corresponding date columns
rd_stages={0:('RD Introduced','introdate'),
           1:('RD Passed One Chamber','chamber1passdate'),
           2:('RD Passed Two Chambers','chamber2passdate'),
           3:('RD Became Law','becamelawdate'),
           4:('RD Vetoed by President','vetodate'),
           9:('RD Failed','faildate'),
           100:('RD Passed Senate','senatevotedate'),
           200:('RD Passed House','housevotedate'),
           901:('RD Failed Senate','senatevotedate'),
           902:('RD Failed House','housevotedate')}

#%% Function to read and clean CRA RD dataset
def read_cra_data(file_path):
    
    # Import CRA data set
    df_rd=pd.read_csv(file_path)

    # Refine to 119th congress
    df_rd=df_rd[df_rd['congress']==119].reset_index(drop=True)

    # Remove entries with no FR citation
    df_rd=df_rd[(df_rd['ridentifier']!='.') & (df_rd['ridentifier'].notnull())].reset_index(drop=True)

    # Covert date format
    for d in ['introdate', 'housevotedate', 'senatevotedate', 'becamelawdate', 'vetodate', 'voteovervetodate']:
         df_rd.loc[df_rd[d]=='.',d]=None
         df_rd[d]=pd.to_datetime(df_rd[d]).dt.strftime('%Y-%m-%d')

    # Get revised stage dates
    df_rd.loc[df_rd['chamberpass_atleast1']==1,'chamber1passdate']=df_rd[['housevotedate','senatevotedate']].\
            astype('datetime64[ns]').max(axis=1).dt.strftime('%Y-%m-%d')
    df_rd.loc[df_rd['chamberpass_2']==1,'chamber2passdate']=df_rd[['housevotedate','senatevotedate']].\
            astype('datetime64[ns]').max(axis=1).dt.strftime('%Y-%m-%d')
    df_rd.loc[((df_rd['chambervote_atleast1']==1) & (df_rd['chamberpass_atleast1']==0)) | 
              ((df_rd['chambervote_2']==1) & (df_rd['chamberpass_2']==0)),
              'faildate']=df_rd[['housevotedate','senatevotedate']].astype('datetime64[ns]').max(axis=1).dt.strftime('%Y-%m-%d')

    # Combine detailed stages & dates for downloadable dataset
    def determine_stages(row):
        """Determine RD stages based on chamber and voting conditions."""
        if row['intro_chamber'] == 'Senate':
            if (row['senatevotebin'] == row['senatepassbin']) and (row['housevotebin'] == row['housepassbin']):
                select_stages = [0, 100, 200, 3, 4]
            elif row['senatevotebin'] != row['senatepassbin']:
                select_stages = [0, 901]
            else:
                select_stages = [0, 100, 902]
        elif row['intro_chamber'] == 'House':
            if (row['senatevotebin'] == row['senatepassbin']) and (row['housevotebin'] == row['housepassbin']):
                select_stages = [0, 200, 100, 3, 4]
            elif row['housevotebin'] != row['housepassbin']:
                select_stages = [0, 902]
            else:
                select_stages = [0, 200, 901]
        else:
            select_stages = []

        # Build the 'all_stages' string
        stages = [
            f"{rd_stages[stage][0]} {row[rd_stages[stage][1]]}" 
            for stage in select_stages 
            if pd.notnull(row.get(rd_stages[stage][1]))
        ]

        return '; '.join(stages)

    # Apply the function to the DataFrame without looping
    df_rd['all_stages'] = df_rd.apply(determine_stages, axis=1)

    # Convert RDs to a wide format by rule
    df_rd['rd_seq']=df_rd.sort_values(['ridentifier','introdate']).groupby('ridentifier').cumcount()+1
    df_wide = df_rd[['ridentifier','rd','rd_seq','all_stages','rd_link']].set_index(['ridentifier','rd_seq']).unstack()
    df_wide.columns = df_wide.columns.map(lambda x: '{}_{}'.format(x[0], x[1]))
    df_wide = df_wide.reset_index()

    # Get the latest RD date (to be used as data updated date if README date is not available)
    last_rd_date=df_rd[[rd_stages[k][1] for k in rd_stages]].astype('datetime64[ns]').max().max()

    # Define latest RD stage variable
    df_rd['rd_status_no']=df_rd[['chamberpass_atleast1','chamberpass_2']].sum(axis=1)
    df_rd.loc[df_rd['becamelawbin']==1,'rd_status_no']=3
    df_rd.loc[df_rd['vetobin']==1,'rd_status_no']=4
    df_rd.loc[((df_rd['chambervote_atleast1']==1) & (df_rd['chamberpass_atleast1']==0)) | 
              ((df_rd['chambervote_2']==1) & (df_rd['chamberpass_2']==0)),
              'rd_status_no']=9

    # Replace latest RD stage values
    df_rd['rd_status'] = [rd_stages[k][0] for k in df_rd['rd_status_no']]

    # Combine latest RD stage and RD date
    df_rd['rd_status'] = df_rd.apply(
        lambda row: f"{row['rd_status']} ({row[rd_stages[row['rd_status_no']][1]]})", 
        axis=1)

    # Group by rule
    df_rd=df_rd.sort_values(['ridentifier','rd_status_no','introdate'],ascending=False).groupby('ridentifier').\
            agg({'introducedbin':'max','rd_status':'first','rd':lambda x: ';\n'.join(x)}).reset_index()

    # Merge with the wide dataset (individual RD columns)
    df_rd=df_rd.merge(df_wide,on='ridentifier',how='left')

    # Rename aggregate columns
    df_rd.rename(columns={'ridentifier':'citation','introducedbin':'CRA_Target',
                          'rd_status':'Latest_CRA_Stage','rd': 'RD_No'},inplace=True)

    # Rename and reorder individual RD columns
    # Identify max RD index from columns
    indices = set()
    for col in df_rd.columns:
        match = re.search(r'_(\d+)$', col)
        if match:
            indices.add(int(match.group(1)))
    max_index = max(indices) if indices else 0

    # Create renaming mapping dynamically
    rename_mapping={}
    for i in range(1, max_index + 1):
        rename_mapping.update({
            f'rd_{i}': f'RD{i}_No',
            f'all_stages_{i}': f'RD{i}_Stage',
            f'rd_link_{i}': f'RD{i}_Link', 
        })

    # Rename RD columns
    df_rd.rename(columns=rename_mapping, inplace=True)

    # Reorder columns as RD1_No, RD1_Stage, RD1_Link, RD2_No, RD2_Stage, RD2_Link, ...
    new_order = []
    for i in range(1, max_index + 1):
        new_order.extend([f'RD{i}_No', f'RD{i}_Stage', f'RD{i}_Link'])

    # Reorder DataFrame columns
    df_rd = df_rd[['citation','CRA_Target', 'Latest_CRA_Stage', 'RD_No']+[col for col in new_order if col in df_rd.columns]]

    return df_rd, last_rd_date

#%% Function to merge RD data with FR data
def get_rd(df_fr):
    # Read CRA data set
    cra_file_path=f'{dir_path}/../congress_data/cra_data.csv'
    df_rd,_=read_cra_data(cra_file_path)

    # Merge
    df_merged=df_fr.merge(df_rd,on='citation',how='left')

    # Fill null
    df_merged['CRA_Target']=df_merged['CRA_Target'].fillna(0)

    return df_merged

#%% Function to extract a date from a string
def extract_date_after_keyword(text, keyword='data last updated'):
    # Regular expression to match dates in various formats (YYYY-MM-DD, MM/DD/YYYY, etc.)
    date_patterns = [
        r'\d{4}-\d{2}-\d{2}',  # YYYY-MM-DD
        r'\d{2}/\d{2}/\d{4}',  # MM/DD/YYYY
        r'\d{1}/\d{2}/\d{4}',  # M/DD/YYYY
        r'\d{1}/\d{1}/\d{4}',  # M/D/YYYY
        r'\d{2}-\d{2}-\d{4}',  # MM-DD-YYYY
        r'\d{4}/\d{2}/\d{2}',  # YYYY/MM/DD
        r'\d{2} \w{3,9} \d{4}',  # DD Month YYYY
        r'\w{3,9} \d{2}, \d{4}',  # Month DD, YYYY
    ]

    # Combine patterns into a single regex
    date_regex = '|'.join(date_patterns)

    # Search for the keyword followed by any amount of whitespace and then a date
    pattern = rf'{re.escape(keyword)}\s*(?:[:\-]?\s*)({date_regex})'
    match = re.search(pattern, text, re.IGNORECASE)
    
    if match:
        return parser.parse(match.group(1)).strftime("%Y-%m-%d")      # Return the matched date
    return None  # Return None if no match is found

#%% Function to get last modified date of the RD dataset from README
def get_cra_updated_date(cra_file_path=f'{dir_path}/../congress_data/cra_data.csv'):
    with open(f'{dir_path}/../congress_data/README.md', 'r', encoding='utf-8') as file:
            text = file.read()
    cra_last_updated = extract_date_after_keyword(text)

    if cra_last_updated:
        return cra_last_updated
    else:
        _,last_rd_date=read_cra_data(cra_file_path)
        return last_rd_date.strftime("%Y-%m-%d")
    
#%% Create objects to import in app
CRA_LAST_UPDATED=get_cra_updated_date()