Mark Febrizio committed on
Commit bb93e21
0 Parent(s)
Files changed (11)
  1. .gitattributes +35 -0
  2. .gitignore +167 -0
  3. Dockerfile +13 -0
  4. LICENSE +21 -0
  5. README.md +20 -0
  6. app.py +190 -0
  7. get_rules_in_window.py +319 -0
  8. requirements.txt +0 -0
  9. search_columns.py +86 -0
  10. settings.json +6 -0
  11. significant.py +133 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data/rules_2024_2025.csv filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,167 @@
+ # ----- Project Specific ----- #
+
+ app_planning.txt
+
+
+ # ----- Python ----- #
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
Dockerfile ADDED
@@ -0,0 +1,13 @@
+ FROM python:3.10
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ COPY . .
+
+ EXPOSE 7860
+
+ CMD ["shiny", "run", "app.py", "--host", "0.0.0.0", "--port", "7860"]
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 GW Regulatory Studies Center
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,20 @@
+ ---
+ title: Cra Window Rules
+ emoji: 🌍
+ colorFrom: yellow
+ colorTo: indigo
+ sdk: docker
+ pinned: false
+ license: mit
+ ---
+
+ This is a templated Space for [Shiny for Python](https://shiny.rstudio.com/py/).
+
+
+ To get started with a new app, do the following:
+
+ 1) Install Shiny with `pip install shiny`
+ 2) Create a new app with `shiny create .`
+ 3) Then run the app with `shiny run --reload`
+
+ To learn more about this framework, please see the [Documentation](https://shiny.rstudio.com/py/docs/overview.html).
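
For orientation, the scaffold that steps 1–3 produce is a small Shiny Express app. A minimal sketch (illustrative contents, not the exact output of `shiny create .`), using the same `shiny.express` API that `app.py` below is built on:

```python
# minimal Shiny Express app; save as app.py and launch with `shiny run --reload`
from shiny.express import input, render, ui

# a slider input registered under the id "n"
ui.input_slider("n", "Multiplier", min=1, max=100, value=20)

@render.text
def doubled():
    # re-renders reactively whenever the slider value changes
    return f"n * 2 is {input.n() * 2}"
```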
app.py ADDED
@@ -0,0 +1,190 @@
+ import asyncio
+ from datetime import datetime, date, time
+
+ from faicons import icon_svg
+
+ from get_rules_in_window import (
+     DF,
+     LAST_UPDATED,
+     START_DATE,
+     GET_SIGNIFICANT,
+     METADATA,
+     groupby_agency,
+     groupby_ym,
+     plot_agency,
+     plot_month,
+ )
+
+ from shiny import reactive
+ from shiny.express import input, render, ui
+
+ FOOTER = f"""
+ -----
+
+ Developed by the [GW Regulatory Studies Center](https://go.gwu.edu/regstudies). See our page on the [Congressional Review Act](https://regulatorystudies.columbian.gwu.edu/congressional-review-act) for more information.
+ """
+
+ ui.page_opts(
+     title="Rules in the Congressional Review Act (CRA) Window", #fillable=True,
+ )
+
+ with ui.sidebar(title="Settings"):
+     ui.input_date("start_date", "Start of window", value=START_DATE, min=START_DATE, max=date.today())
+
+     ui.input_switch("switch", "Show significant rules in plots", False)
+     #ui.input_checkbox_group(
+     #    "significant",
+     #    "EO 12866 Significance",
+     #    ["Section 3(f)(1)", "Other"],
+     #)
+
+ with ui.layout_column_wrap():
+     with ui.value_box(showcase=icon_svg("book")):
+         "All final rules"
+
+         @render.text
+         def count_rules():
+             return f"{filtered_df()['document_number'].count()}"
+
+     with ui.value_box(showcase=icon_svg("book")):
+         "Other Significant rules"
+
+         @render.text
+         def count_other_significant():
+             output = "Not available"
+             if GET_SIGNIFICANT:
+                 output = f"{filtered_df()['other_significant'].sum()}"
+             return output
+
+     with ui.value_box(showcase=icon_svg("book")):
+         "Section 3(f)(1) Significant rules"
+
+         @render.text
+         def count_3f1_significant():
+             output = "Not available"
+             if GET_SIGNIFICANT:
+                 output = f"{filtered_df()['3f1_significant'].sum()}"
+             return output
+
+ with ui.navset_card_underline(title=""):
+
+     with ui.nav_panel("Rules in detail"):
+         @render.data_frame
+         def table_rule_detail():
+             df = filtered_df()
+             #print(df.columns)
+             #df.loc[:, "date"] = df.apply(lambda x: f"{x['publication_year']}-{x['publication_month']}-{x['publication_day']}", axis=1)
+             df.loc[:, "date"] = df.apply(lambda x: f"{x['publication_date'].date()}", axis=1)
+             char = " "
+             df.loc[:, "title"] = df["title"].apply(lambda x: f"{char.join(x.split(char)[:9])}...")
+             df.loc[:, "agencies"] = df["parent_slug"].apply(lambda x: "; ".join(x))
+             cols = [
+                 "date",
+                 "title",
+                 "agencies",
+                 "3f1_significant",
+                 "other_significant",
+             ]
+             return render.DataTable(df.loc[:, [c for c in cols if c in df.columns]])
+
+     with ui.nav_panel("By month"):
+
+         with ui.layout_columns():
+
+             @render.plot
+             def plot_by_month():
+                 grouped = grouped_df_month()
+                 return plot_month(
+                     grouped
+                 )
+
+             @render.data_frame
+             def table_by_month():
+                 grouped = grouped_df_month()
+                 cols = [
+                     "publication_year",
+                     "publication_month",
+                     "rules",
+                     "3f1_significant",
+                     "other_significant",
+                 ]
+                 return render.DataTable(grouped.loc[:, [c for c in cols if c in grouped.columns]])
+
+     with ui.nav_panel("By agency"):
+
+         with ui.layout_columns():
+
+             @render.plot
+             def plot_by_agency():
+                 grouped = grouped_df_agency()
+                 return plot_agency(
+                     grouped.head(10),
+                 )
+
+             @render.data_frame
+             def table_by_agency():
+                 grouped = grouped_df_agency()
+                 cols = [
+                     "agency",
+                     "acronym",
+                     "rules",
+                     "3f1_significant",
+                     "other_significant",
+                 ]
+                 return render.DataTable(grouped.loc[:, [c for c in cols if c in grouped.columns]])
+
+ with ui.accordion(open=False):
+
+     with ui.accordion_panel("Download Data"):
+
+         @render.download(
+             label="Download data as CSV",
+             filename=f"rules_in_cra_window_accessed_{date.today()}.csv",
+         )
+         async def download():
+             await asyncio.sleep(0.25)
+             yield filtered_df().to_csv(index=False)
+
+ with ui.accordion(open=False):
+
+     with ui.accordion_panel("Notes"):
+
+         ui.markdown(
+             f"""
+             Rule data retrieved from the [Federal Register API](https://www.federalregister.gov/developers/documentation/api/v1).
+
+             Executive Order 12866 significance data last updated **{LAST_UPDATED}**.
+             """
+         )
+
+ ui.markdown(
+     FOOTER
+ )
+
+ #ui.tags.footer()
+
+
+ # ----- REACTIVE CALCULATIONS ----- #
+
+
+ @reactive.calc
+ def filtered_df():
+     filt_df = DF
+     #filt_df = df[df["species"].isin(input.species())]
+     try:
+         filt_df = filt_df.loc[filt_df["publication_date"] >= input.start_date()]
+     except TypeError:
+         filt_df = filt_df.loc[filt_df["publication_date"] >= datetime.combine(input.start_date(), time(0, 0))]
+     return filt_df
+
+ @reactive.calc
+ def grouped_df_month():
+     filt_df = filtered_df()
+     grouped = groupby_ym(filt_df, significant=GET_SIGNIFICANT)
+     return grouped
+
+ @reactive.calc
+ def grouped_df_agency():
+     filt_df = filtered_df()
+     grouped = groupby_agency(filt_df, metadata=METADATA, significant=GET_SIGNIFICANT)
+     return grouped
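
A note on `filtered_df()` above: depending on the pandas version and the dtype `publication_date` ends up with, comparing the column against the `datetime.date` returned by `input.start_date()` can raise `TypeError`; the `except` branch recovers by promoting the date to a midnight `datetime`. A standalone sketch of that fallback, with made-up data:

```python
from datetime import date, datetime, time

import pandas as pd

# toy frame with a datetime64 column, standing in for the app's DF
df = pd.DataFrame({"publication_date": pd.to_datetime(["2024-03-04", "2024-06-01"])})
start = date(2024, 4, 1)  # what ui.input_date() hands back

try:
    filt = df.loc[df["publication_date"] >= start]
except TypeError:
    # promote the date to a datetime at midnight so the comparison is well-typed
    filt = df.loc[df["publication_date"] >= datetime.combine(start, time(0, 0))]

print(filt)  # only the 2024-06-01 row remains
```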
get_rules_in_window.py ADDED
@@ -0,0 +1,319 @@
+ from datetime import date
+ from pathlib import Path
+
+ from fr_toolbelt.api_requests import get_documents_by_date
+ from fr_toolbelt.preprocessing import process_documents, AgencyMetadata
+ from numpy import array
+ from pandas import DataFrame, to_datetime
+ from plotnine import (
+     ggplot,
+     aes,
+     geom_col,
+     labs,
+     coord_flip,
+     scale_x_discrete,
+     theme_light,
+ )
+
+ try:
+     from search_columns import search_columns, SearchError
+     from significant import get_significant_info
+ except ModuleNotFoundError:
+     from .search_columns import search_columns, SearchError
+     from .significant import get_significant_info
+
+
+ METADATA, _ = AgencyMetadata().get_agency_metadata()
+ START_DATE = "2024-03-01"
+ GET_SIGNIFICANT = True if date.fromisoformat(START_DATE) >= date(2023, 4, 6) else False
+
+
+ class DataAvailabilityError(Exception):
+     pass
+
+
+ def get_date_range(start_date: str):
+     start_year = date.fromisoformat(start_date).year
+     end_year = start_year + 1
+     date_range = {
+         "start": start_date,
+         "end": f"{end_year}-01-31",
+         "transition_year": end_year,
+     }
+     return date_range
+
+
+ def get_rules(date_range: dict) -> list[dict]:
+     results, _ = get_documents_by_date(
+         start_date=date_range.get("start"),
+         end_date=date_range.get("end"),
+         document_types=("RULE", )
+     )
+     return results
+
+
+ def format_documents(documents: list[dict]):
+     """Format Federal Register documents, parsing publication dates into year, month, and day columns.
+
+     Args:
+         documents (list[dict]): List of documents.
+
+     Returns:
+         DataFrame: Pandas DataFrame with formatted data.
+     """
+     # process agency info in documents
+     documents = process_documents(
+         documents,
+         which=("agencies", "presidents"),
+         return_values_as_str=False
+     )
+
+     # create dataframe
+     df = DataFrame(documents)
+
+     # convert publication date to datetime format
+     df.loc[:, "publication_dt"] = to_datetime(df["publication_date"])
+     df.loc[:, "publication_date"] = df.apply(lambda x: x["publication_dt"].date(), axis=1)
+     df.loc[:, "publication_year"] = df.apply(lambda x: x["publication_dt"].year, axis=1)
+     df.loc[:, "publication_month"] = df.apply(lambda x: x["publication_dt"].month, axis=1)
+     df.loc[:, "publication_day"] = df.apply(lambda x: x["publication_dt"].day, axis=1)
+
+     # return dataframe
+     return df
+
+
+ def filter_new_admin_rules(
+         df: DataFrame,
+         transition_year: int,
+         date_col: str = "publication_date",
+     ):
+
+     admin_transitions = {
+         2001: "george-w-bush",
+         2009: "barack-obama",
+         2017: "donald-trump",
+         2021: "joe-biden",
+     }
+
+     bool_date = array(df[date_col] >= date(transition_year, 1, 20))
+     bool_prez = array(df["president_id"] == admin_transitions.get(transition_year))
+     bool_ = bool_date & bool_prez
+     return df.loc[~bool_]
+
+
+ def filter_corrections(df: DataFrame):
+     """Filter out corrections from Federal Register documents.
+     Identifies corrections using the `correction_of` field and regex searches of `document_number`, `title`, and `action` fields.
+
+     Args:
+         df (DataFrame): Federal Register data.
+
+     Returns:
+         tuple: DataFrame with corrections removed, DataFrame of corrections
+     """
+     # get original column names
+     cols = df.columns.tolist()
+
+     # filter out corrections
+     # 1. Using correction fields
+     bool_na = array(df["correction_of"].isna())
+
+     # 2. Searching other fields
+     search_1 = search_columns(df, [r"^[crxz][\d]{1,2}-(?:[\w]{2,4}-)?[\d]+"], ["document_number"],
+                               return_column="indicator1")
+     search_2 = search_columns(df, [r"(?:;\scorrection\b)|(?:\bcorrecting\samend[\w]+\b)"], ["title", "action"],
+                               return_column="indicator2")
+     bool_search = array(search_1["indicator1"] == 1) | array(search_2["indicator2"] == 1)
+
+     # separate corrections from non-corrections
+     df_no_corrections = df.loc[(bool_na & ~bool_search), cols]  # remove flagged documents
+     df_corrections = df.loc[(~bool_na | bool_search), cols]
+
+     # return filtered results
+     if len(df) == len(df_no_corrections) + len(df_corrections):
+         return df_no_corrections, df_corrections
+     else:
+         raise SearchError(f"{len(df)} != {len(df_no_corrections)} + {len(df_corrections)}")
+
+
+ def get_significant_rules(df, start_date):
+     process_columns = ("significant", "3f1_significant", )
+     if date.fromisoformat(start_date) < date(2023, 4, 6):
+         raise DataAvailabilityError("This program does not calculate significant rule counts prior to Executive Order 14094 of April 6, 2023.")
+     else:
+         document_numbers = df.loc[:, "document_number"].to_list()
+         df, last_updated = get_significant_info(df, start_date, document_numbers)
+         for col in process_columns:
+             bool_na = df[col].isna()
+             df.loc[bool_na, col] = "0"
+             df.loc[:, col] = df[col].replace(".", "0").astype("int64")
+         bool_3f1 = df["3f1_significant"] == 1
+         bool_sig = df["significant"] == 1
+         df.loc[:, "3f1_significant"] = 0
+         df.loc[bool_3f1, "3f1_significant"] = 1
+         df.loc[:, "other_significant"] = 0
+         df.loc[(bool_sig & ~bool_3f1), "other_significant"] = 1
+     return df, last_updated
+
+
+ def get_agency_metadata_values(
+         df: DataFrame,
+         agency_column: str,
+         metadata: dict,
+         metadata_value: str,
+     ):
+     if metadata_value == "acronym":
+         metadata_value = "short_name"
+     return df.loc[:, agency_column].apply(
+         lambda x: metadata.get(x, {}).get(metadata_value)
+     )
+
+
+ def groupby_agency(
+         df: DataFrame,
+         group_col: str = "parent_slug",
+         value_col: str = "document_number",
+         aggfunc: str = "count",
+         significant: bool = True,
+         metadata: dict | None = None,
+         metadata_value: str = "acronym",
+     ):
+     aggfunc_dict = {value_col: aggfunc, }
+     if significant:
+         aggfunc_dict.update({
+             "3f1_significant": "sum",
+             "other_significant": "sum",
+         })
+     df_ex = df.explode(group_col, ignore_index=True)
+     grouped = df_ex.groupby(
+         by=group_col
+     ).agg(
+         aggfunc_dict
+     ).reset_index()
+     grouped = grouped.sort_values(value_col, ascending=False).rename(
+         columns={
+             group_col: "agency",
+             value_col: "rules",
+         }, errors="ignore"
+     )
+     if metadata is not None:
+         grouped.loc[:, metadata_value] = get_agency_metadata_values(
+             grouped,
+             agency_column="agency",
+             metadata=metadata,
+             metadata_value=metadata_value
+         )
+         cols = ["agency", metadata_value, "rules", "3f1_significant", "other_significant"]
+         grouped = grouped.loc[:, [c for c in cols if c in grouped.columns]]
+     return grouped
+
+
+ def groupby_ym(
+         df: DataFrame,
+         group_col: tuple | list = ("publication_year", "publication_month", ),
+         value_col: str = "document_number",
+         aggfunc: str = "count",
+         significant: bool = True
+     ):
+     aggfunc_dict = {value_col: aggfunc, }
+     if significant:
+         aggfunc_dict.update({
+             "3f1_significant": "sum",
+             "other_significant": "sum",
+         })
+     grouped = df.groupby(
+         by=list(group_col)
+     ).agg(
+         aggfunc_dict
+     ).reset_index()
+     grouped = grouped.rename(columns={
+         value_col: "rules",
+     }, errors="ignore")
+     return grouped
+
+
+ def save_csv(path: Path, df_all: DataFrame, df_agency: DataFrame, df_ym: DataFrame, transition_year: int):
+     files = (
+         f"rules_{transition_year - 1}_{transition_year}.csv",
+         f"rules_by_agency_{transition_year - 1}_{transition_year}.csv",
+         f"rules_by_month_{transition_year - 1}_{transition_year}.csv"
+     )
+     dataframes = (df_all, df_agency, df_ym)
+     for data, file in zip(dataframes, files):
+         data.to_csv(path / file, index=False)
+
+
+ def plot_agency(df, group_col = "acronym", value_col = "rules"):
+
+     order_list = df.loc[:, group_col].to_list()[::-1]
+
+     plot = (
+         ggplot(
+             df,
+             aes(x=group_col, y=value_col),
+         )
+         + geom_col()
+         + coord_flip()
+         + scale_x_discrete(limits=order_list)
+         + labs(y="", x="", title="Number of Rules Published by Agency")
+         + theme_light()
+     )
+     return plot
+
+
+ def plot_month(df, group_cols = ("publication_year", "publication_month"), value_col = "rules"):
+
+     df.loc[:, "ym"] = df[group_cols[0]].astype(str) + "-" + df[group_cols[1]].astype(str).str.pad(2, fillchar="0")
+     order_list = df.loc[:, "ym"].to_list()
+
+     plot = (
+         ggplot(
+             df,
+             aes(x="ym", y=value_col),
+         )
+         + geom_col()
+         + scale_x_discrete(limits=order_list)
+         + labs(y="", x="", title="Number of Rules Published by Month")
+         + theme_light()
+     )
+     return plot
+
+
+ def get_rules_in_window(start_date: str, get_significant: bool = True):
+     date_range = get_date_range(start_date)
+     transition_year = date_range.get("transition_year")
+     results = get_rules(date_range)
+     df = format_documents(results)
+     df, _ = filter_corrections(df)
+     df = filter_new_admin_rules(df, transition_year)
+     if get_significant:
+         df, last_updated = get_significant_rules(df, start_date)
+     else:
+         last_updated = date.today()
+     return df, last_updated
+
+
+ DF, LAST_UPDATED = get_rules_in_window(START_DATE, get_significant=GET_SIGNIFICANT)
+
+
+ def main(start_date, save_data: bool = True, path: Path | None = None, metadata: dict | None = None, significant: bool = True):
+     if date.fromisoformat(start_date) < date(2023, 4, 6):
+         significant = False
+     date_range = get_date_range(start_date)
+     transition_year = date_range.get("transition_year")
+     df, _ = get_rules_in_window(start_date, get_significant=significant)
+
+     df_agency = groupby_agency(df, metadata=metadata, significant=significant)
+     df_ym = groupby_ym(df, significant=significant)
+
+     if save_data:
+         if path is None:
+             path = Path(__file__).parent
+         save_csv(path, df, df_agency, df_ym, transition_year)
+
+     return df, df_agency, df_ym
+
+
+ if __name__ == "__main__":
+
+     pass
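
The `__main__` block is a stub, but the module can be driven directly. A usage sketch, assuming `fr_toolbelt` is installed and the Federal Register API is reachable (note that importing the module already triggers the module-level `DF` pull for `START_DATE`):

```python
from pathlib import Path

from get_rules_in_window import METADATA, main

# pull final rules for the CRA window opening 2024-03-01, aggregate by
# agency and by year-month, and write three CSVs to a chosen directory
df, df_agency, df_ym = main(
    "2024-03-01",
    save_data=True,
    path=Path("data"),  # hypothetical output directory; defaults to the script's folder
    metadata=METADATA,
    significant=True,
)
print(f"{len(df)} rules in window; top agency: {df_agency.iloc[0]['agency']}")
```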
requirements.txt ADDED
Binary file (2.01 kB).
search_columns.py ADDED
@@ -0,0 +1,86 @@
+ import itertools
+ import re
+
+ from numpy import array
+ from pandas import DataFrame
+
+
+ class SearchError(Exception):
+     """Search returned misaligned results."""
+     pass
+
+
+ # Defining a function to search for string patterns within dataframe columns
+ def search_columns(df: DataFrame,
+                    patterns: list,
+                    columns: list,
+                    return_as: str = "indicator_column",
+                    return_column: str = "indicator",
+                    re_flags = re.I | re.X):
+     """Search dataframe columns for string patterns.
+
+     Args:
+         df (DataFrame): Input data in format of pandas dataframe.
+         patterns (list): List of string patterns to input, compatible with regex.
+         columns (list): List of column names to search for input patterns.
+         return_as (str, optional): Return a DataFrame with indicator column ("indicator_column") or filtered by the search terms ("filtered_df"). Defaults to "indicator_column".
+         re_flags (optional): Regex flags to use. Defaults to re.I | re.X.
+
+     Raises:
+         TypeError: Raises exception when `patterns` or `columns` parameters are not lists.
+         ValueError: Raises exception when `patterns` or `columns` parameters have incorrect length.
+         ValueError: Raises exception when `return_as` parameter receives an incorrect value.
+
+     Returns:
+         DataFrame: DataFrame with "indicator" column or filtered by search terms.
+     """
+     # create list object for appending boolean arrays
+     bool_list = []
+
+     # ensure that input patterns and columns are formatted as lists
+     if not (isinstance(patterns, list) and isinstance(columns, list)):
+         raise TypeError('Inputs for "patterns" and "columns" keywords must be lists.')
+
+     if len(patterns) == len(columns):
+         # create list of inputs in format [(pattern1, column1),(pattern2, column2), ...]
+         inputs = list(zip(patterns, columns))
+
+         # loop over list of inputs
+         for i in inputs:
+             searchre = df[i[1]].str.contains(i[0], regex=True, case=False, flags=re_flags)
+             searchbool = array([True if n is True else False for n in searchre])
+             bool_list.append(searchbool)
+
+     elif (len(patterns) == 1) and (len(patterns) != len(columns)):
+         # create list of inputs in format [(pattern, column1),(pattern, column2), ...]
+         inputs = list(itertools.product(patterns, columns))
+
+         # loop over list of inputs
+         for i in inputs:
+             searchre = df[i[1]].str.contains(i[0], regex=True, case=False, flags=re_flags)
+             searchbool = array([True if n is True else False for n in searchre])
+             bool_list.append(searchbool)
+
+     else:  # eg, patterns formatted as a list of len(n>1) but does not match len(columns)
+         raise ValueError("Lengths of inputs are incorrect. Lengths of 'patterns' and 'columns' can either match or a single pattern can map to multiple columns.")
+
+     # combine each "searchbool" array elementwise
+     # we want a positive match for any column to evaluate as True
+     # equivalent to (bool_list[0] | bool_list[1] | bool_list[2] | ... | bool_list[n-1])
+     filter_bool = array(bool_list).any(axis=0)
+
+     if return_as == "indicator_column":
+         dfResults = df.copy(deep=True)
+         dfResults.loc[:, return_column] = 0
+         dfResults.loc[filter_bool, return_column] = 1
+         #print(f"Count {return_column}: {sum(dfResults[return_column].values)}")
+         return dfResults
+
+     elif return_as == "filtered_df":
+         # filter results
+         dfResults = df.loc[filter_bool, :].copy(deep=True)
+         #print(f"Count {return_column}: {len(dfResults)}")
+         return dfResults
+
+     else:
+         raise ValueError("Incorrect input for 'return_as' parameter.")
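
A self-contained example of how `search_columns` is used upstream in `filter_corrections`: a single pattern fanned out across two columns (the `itertools.product` branch), with toy data:

```python
import pandas as pd

from search_columns import search_columns

# toy records: one correction-style document, one ordinary final rule
df = pd.DataFrame({
    "title": ["Air Quality Standards; Correction", "Energy Conservation Program"],
    "action": ["Final rule; correction", "Final rule"],
})

# one pattern searched in both columns; any hit sets the indicator to 1
flagged = search_columns(
    df,
    [r"\bcorrection\b"],
    ["title", "action"],
    return_column="is_correction",
)
print(flagged["is_correction"].tolist())  # [1, 0]
```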
settings.json ADDED
@@ -0,0 +1,6 @@
+ {
+     "[python]": {
+         "editor.defaultFormatter": "ms-python.black-formatter"
+     },
+     "python.formatting.provider": "none"
+ }
significant.py ADDED
@@ -0,0 +1,133 @@
+ # gather details on rule significance from FR tracking document
+ # see: https://github.com/regulatorystudies/Reg-Stats/blob/main/data/fr_tracking/fr_tracking.csv
+
+ from datetime import date
+ import polars as pl
+ from pandas import (
+     DataFrame as pd_DataFrame,
+     read_csv as pd_read_csv,
+     to_datetime,
+ )
+
+
+ def read_csv_data(
+         start_date: date | str,
+         retrieve_columns: list | tuple = (
+             "publication_date",
+             "document_number",
+             "significant",
+             "econ_significant",
+             "3(f)(1) significant",
+             "Major"
+         ),
+         url: str = r"https://raw.githubusercontent.com/regulatorystudies/Reg-Stats/main/data/fr_tracking/fr_tracking.csv"
+     ):
+     # handle dates formatted as str
+     if isinstance(start_date, str):
+         start_date = date.fromisoformat(start_date)
+
+     # drop econ_significant column for dates on or after EO 14094
+     if start_date >= date.fromisoformat("2023-04-06"):
+         cols = [col for col in retrieve_columns if col != "econ_significant"]
+     else:
+         cols = list(retrieve_columns)
+
+     # read csv; try different encoding if raises error
+     try:
+         df_pd = pd_read_csv(url, usecols=cols)
+     except UnicodeDecodeError:
+         df_pd = pd_read_csv(url, usecols=cols, encoding="latin")
+
+     df_pd.loc[:, "publication_dt"] = to_datetime(df_pd["publication_date"], format="mixed", dayfirst=False, yearfirst=False)
+     max_date = max(df_pd.loc[:, "publication_dt"].to_list()).date()
+     #print(max_date)
+     cols.remove("publication_date")
+     df = pl.from_pandas(df_pd.loc[:, cols])
+
+     if df.shape[1] == len(cols):
+         # rename columns if they exist
+         rename_cols = {"3(f)(1) significant": "3f1_significant", "Major": "major"}
+         if all(True if rename in cols else False for rename in rename_cols.keys()):
+             df = df.rename(rename_cols)
+             cols = [rename_cols.get(col, col) for col in cols]
+
+         return df, cols, max_date
+     else:
+         return None, cols, max_date
+
+
+ def clean_data(df: pl.DataFrame,
+                document_numbers: list,
+                clean_columns: list | tuple,
+                #format_not_available_values: str = ".",
+                return_optimized_plan = False
+                ):
+
+     # start a lazy query
+     lf = (
+         df.lazy()
+         # strip whitespace
+         .with_columns(pl.col("document_number").str.strip_chars())
+         # only keep document_numbers from input
+         .filter(pl.col("document_number").is_in(document_numbers))
+         # temporarily format "not available" data (input as dots)
+         #.with_columns(pl.col(c for c in clean_columns if c != "document_number").str.replace_all(".", f"{format_not_available_values}", literal=True))
+         # cast to nullable int dtype
+         #.with_columns(pl.col(c for c in clean_columns if c != "document_number").cast(pl.Int64, strict=False))
+     )
+
+     # return optimized query plan instead of df
+     if return_optimized_plan:
+         return lf.explain(optimized=True)
+
+     # call collect to return df
+     return lf.collect()
+
+
+ def merge_with_api_results(pd_df: pd_DataFrame,
+                            pl_df: pl.DataFrame
+                            ):
+
+     main_df = pl.from_pandas(pd_df)
+     df = main_df.join(pl_df, on="document_number", how="left", validate="1:1")
+     return df.to_pandas()
+
+
+ def get_significant_info(input_df, start_date, document_numbers):
+
+     pl_df, clean_cols, max_date = read_csv_data(start_date)
+     if pl_df is None:
+         print("Failed to integrate significance tracking data with retrieved documents.")
+         return input_df, None  # return a 2-tuple so callers can still unpack
+     pl_df = clean_data(pl_df, document_numbers, clean_cols)
+     pd_df = merge_with_api_results(input_df, pl_df)
+     return pd_df, max_date
+
+
+ if __name__ == "__main__":
+
+     date_a = "2023-04-05"
+     date_b = "2023-04-06"
+     numbers = [
+         "2021-01303",
+         '2023-28006',
+         '2024-00149',
+         '2024-00089',
+         '2023-28828',
+         '2024-00300',
+         '2024-00045',
+         '2024-00192',
+         '2024-00228',
+         '2024-00187'
+     ]
+
+     # test for dates before EO 14094; read_csv_data returns a 3-tuple
+     df_a, clean_cols, max_date_a = read_csv_data(date_a)
+     df_a = clean_data(df_a, numbers, clean_cols)
+
+     # test for dates after EO 14094
+     df_b, clean_cols, max_date_b = read_csv_data(date_b)
+     df_b = clean_data(df_b, numbers, clean_cols)
+
+     #df_b.rename({"test": "test1"})
+     #print(df_a.shape, df_b.shape)
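
A usage sketch for the module's entry point, `get_significant_info`: it downloads the Reg-Stats `fr_tracking.csv`, trims it to the supplied document numbers via the polars lazy query, and left-joins the significance flags onto the pandas frame. The input frame below is illustrative; network access to GitHub is assumed:

```python
import pandas as pd

from significant import get_significant_info

# minimal frame standing in for the Federal Register API results
rules = pd.DataFrame({"document_number": ["2024-00149", "2024-00300"]})

merged, last_updated = get_significant_info(
    rules,
    "2024-03-01",                       # start date on/after EO 14094
    rules["document_number"].tolist(),  # numbers to keep from the tracking data
)
print(last_updated)             # most recent publication_date in the tracking file
print(merged.columns.tolist())  # adds significant / 3f1_significant / major flags
```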