Mark Febrizio committed
Commit bb93e21
try 2
Files changed:
- .gitattributes +35 -0
- .gitignore +167 -0
- Dockerfile +13 -0
- LICENSE +21 -0
- README.md +20 -0
- app.py +190 -0
- get_rules_in_window.py +319 -0
- requirements.txt +0 -0
- search_columns.py +86 -0
- settings.json +6 -0
- significant.py +133 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
data/rules_2024_2025.csv filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,167 @@
# ----- Project Specific ----- #

app_planning.txt


# ----- Python ----- #

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
Dockerfile
ADDED
@@ -0,0 +1,13 @@
FROM python:3.10

WORKDIR /code

COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

COPY . .

EXPOSE 7860

CMD ["shiny", "run", "app.py", "--host", "0.0.0.0", "--port", "7860"]
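To try the container locally (Hugging Face Spaces builds and serves the image automatically), the usual workflow would be something like `docker build -t cra-window .` followed by `docker run -p 7860:7860 cra-window`, then opening http://localhost:7860 in a browser; the tag `cra-window` is just an illustrative name.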
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 GW Regulatory Studies Center

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
ADDED
@@ -0,0 +1,20 @@
---
title: Cra Window Rules
emoji: 🌍
colorFrom: yellow
colorTo: indigo
sdk: docker
pinned: false
license: mit
---

This is a templated Space for [Shiny for Python](https://shiny.rstudio.com/py/).

To get started with a new app, do the following:

1) Install Shiny with `pip install shiny`
2) Create a new app with `shiny create .`
3) Then run the app with `shiny run --reload`

To learn more about this framework, please see the [Documentation](https://shiny.rstudio.com/py/docs/overview.html).
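For reference, a Shiny express app can be as small as the sketch below. This is a generic illustration of the framework, not the app in this commit, and assumes a recent `shiny` release with the express API:

from shiny.express import input, render, ui

ui.input_slider("n", "N", min=0, max=100, value=20)

@render.text
def txt():
    # re-renders automatically whenever the slider input changes
    return f"n*2 is {input.n() * 2}"

Saved as app.py, it runs with `shiny run --reload` from the same directory.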
app.py
ADDED
@@ -0,0 +1,190 @@
import asyncio
from datetime import datetime, date, time

from faicons import icon_svg

from get_rules_in_window import (
    DF,
    LAST_UPDATED,
    START_DATE,
    GET_SIGNIFICANT,
    METADATA,
    groupby_agency,
    groupby_ym,
    plot_agency,
    plot_month,
)

from shiny import reactive
from shiny.express import input, render, ui

FOOTER = f"""
-----

Developed by the [GW Regulatory Studies Center](https://go.gwu.edu/regstudies). See our page on the [Congressional Review Act](https://regulatorystudies.columbian.gwu.edu/congressional-review-act) for more information.
"""

ui.page_opts(
    title="Rules in the Congressional Review Act (CRA) Window",  #fillable=True,
)

with ui.sidebar(title="Settings"):
    ui.input_date("start_date", "Start of window", value=START_DATE, min=START_DATE, max=date.today())

    ui.input_switch("switch", "Show significant rules in plots", False)
    #ui.input_checkbox_group(
    #    "significant",
    #    "EO 12866 Significance",
    #    ["Section 3(f)(1)", "Other"],
    #)

with ui.layout_column_wrap():
    with ui.value_box(showcase=icon_svg("book")):
        "All final rules"

        @render.text
        def count_rules():
            return f"{filtered_df()['document_number'].count()}"

    with ui.value_box(showcase=icon_svg("book")):
        "Other Significant rules"

        @render.text
        def count_other_significant():
            output = "Not available"
            if GET_SIGNIFICANT:
                output = f"{filtered_df()['other_significant'].sum()}"
            return output

    with ui.value_box(showcase=icon_svg("book")):
        "Section 3(f)(1) Significant rules"

        @render.text
        def count_3f1_significant():
            output = "Not available"
            if GET_SIGNIFICANT:
                output = f"{filtered_df()['3f1_significant'].sum()}"
            return output

with ui.navset_card_underline(title=""):

    with ui.nav_panel("Rules in detail"):
        @render.data_frame
        def table_rule_detail():
            df = filtered_df()
            #print(df.columns)
            #df.loc[:, "date"] = df.apply(lambda x: f"{x['publication_year']}-{x['publication_month']}-{x['publication_day']}", axis=1)
            df.loc[:, "date"] = df.apply(lambda x: f"{x['publication_date'].date()}", axis=1)
            char = " "
            df.loc[:, "title"] = df["title"].apply(lambda x: f"{char.join(x.split(char)[:9])}...")
            df.loc[:, "agencies"] = df["parent_slug"].apply(lambda x: "; ".join(x))
            cols = [
                "date",
                "title",
                "agencies",
                "3f1_significant",
                "other_significant",
            ]
            return render.DataTable(df.loc[:, [c for c in cols if c in df.columns]])

    with ui.nav_panel("By month"):

        with ui.layout_columns():

            @render.plot
            def plot_by_month():
                grouped = grouped_df_month()
                return plot_month(
                    grouped
                )

            @render.data_frame
            def table_by_month():
                grouped = grouped_df_month()
                cols = [
                    "publication_year",
                    "publication_month",
                    "rules",
                    "3f1_significant",
                    "other_significant",
                ]
                return render.DataTable(grouped.loc[:, [c for c in cols if c in grouped.columns]])

    with ui.nav_panel("By agency"):

        with ui.layout_columns():

            @render.plot
            def plot_by_agency():
                grouped = grouped_df_agency()
                return plot_agency(
                    grouped.head(10),
                )

            @render.data_frame
            def table_by_agency():
                grouped = grouped_df_agency()
                cols = [
                    "agency",
                    "acronym",
                    "rules",
                    "3f1_significant",
                    "other_significant",
                ]
                return render.DataTable(grouped.loc[:, [c for c in cols if c in grouped.columns]])

with ui.accordion(open=False):

    with ui.accordion_panel("Download Data"):

        @render.download(
            label="Download data as CSV",
            filename=f"rules_in_cra_window_accessed_{date.today()}.csv",
        )
        async def download():
            await asyncio.sleep(0.25)
            yield filtered_df().to_csv(index=False)

with ui.accordion(open=False):

    with ui.accordion_panel("Notes"):

        ui.markdown(
            f"""
Rule data retrieved from the [Federal Register API](https://www.federalregister.gov/developers/documentation/api/v1).

Executive Order 12866 significance data last updated **{LAST_UPDATED}**.
"""
        )

ui.markdown(
    FOOTER
)

#ui.tags.footer()


# ----- REACTIVE CALCULATIONS ----- #


@reactive.calc
def filtered_df():
    filt_df = DF
    #filt_df = df[df["species"].isin(input.species())]
    try:
        filt_df = filt_df.loc[filt_df["publication_date"] >= input.start_date()]
    except TypeError:
        filt_df = filt_df.loc[filt_df["publication_date"] >= datetime.combine(input.start_date(), time(0, 0))]
    return filt_df

@reactive.calc
def grouped_df_month():
    filt_df = filtered_df()
    grouped = groupby_ym(filt_df, significant=GET_SIGNIFICANT)
    return grouped

@reactive.calc
def grouped_df_agency():
    filt_df = filtered_df()
    grouped = groupby_agency(filt_df, metadata=METADATA, significant=GET_SIGNIFICANT)
    return grouped
get_rules_in_window.py
ADDED
@@ -0,0 +1,319 @@
from datetime import date
from pathlib import Path

from fr_toolbelt.api_requests import get_documents_by_date
from fr_toolbelt.preprocessing import process_documents, AgencyMetadata
from numpy import array
from pandas import DataFrame, to_datetime
from plotnine import (
    ggplot,
    aes,
    geom_col,
    labs,
    coord_flip,
    scale_x_discrete,
    theme_light,
)

try:
    from search_columns import search_columns, SearchError
    from significant import get_significant_info
except ModuleNotFoundError:
    from .search_columns import search_columns, SearchError
    from .significant import get_significant_info


METADATA, _ = AgencyMetadata().get_agency_metadata()
START_DATE = "2024-03-01"
GET_SIGNIFICANT = date.fromisoformat(START_DATE) >= date(2023, 4, 6)


class DataAvailabilityError(Exception):
    pass


def get_date_range(start_date: str):
    start_year = date.fromisoformat(start_date).year
    end_year = start_year + 1
    date_range = {
        "start": start_date,
        "end": f"{end_year}-01-31",
        "transition_year": end_year,
    }
    return date_range


def get_rules(date_range: dict) -> list[dict]:
    results, _ = get_documents_by_date(
        start_date=date_range.get("start"),
        end_date=date_range.get("end"),
        document_types=("RULE", )
    )
    return results


def format_documents(documents: list[dict]):
    """Format Federal Register documents to generate count by presidential year.

    Args:
        documents (list[dict]): List of documents.

    Returns:
        DataFrame: Pandas DataFrame with formatted data.
    """
    # process agency info in documents
    documents = process_documents(
        documents,
        which=("agencies", "presidents"),
        return_values_as_str=False
    )

    # create dataframe
    df = DataFrame(documents)

    # convert publication date to datetime format
    df.loc[:, "publication_dt"] = to_datetime(df["publication_date"])
    df.loc[:, "publication_date"] = df.apply(lambda x: x["publication_dt"].date(), axis=1)
    df.loc[:, "publication_year"] = df.apply(lambda x: x["publication_dt"].year, axis=1)
    df.loc[:, "publication_month"] = df.apply(lambda x: x["publication_dt"].month, axis=1)
    df.loc[:, "publication_day"] = df.apply(lambda x: x["publication_dt"].day, axis=1)

    # return dataframe
    return df


def filter_new_admin_rules(
    df: DataFrame,
    transition_year: int,
    date_col: str = "publication_date",
):

    admin_transitions = {
        2001: "george-w-bush",
        2009: "barack-obama",
        2017: "donald-trump",
        2021: "joe-biden",
    }

    bool_date = array(df[date_col] >= date(transition_year, 1, 20))
    bool_prez = array(df["president_id"] == admin_transitions.get(transition_year))
    bool_ = bool_date & bool_prez
    return df.loc[~bool_]


def filter_corrections(df: DataFrame):
    """Filter out corrections from Federal Register documents.
    Identifies corrections using the `correction_of` field and regex searches of the `document_number`, `title`, and `action` fields.

    Args:
        df (DataFrame): Federal Register data.

    Returns:
        tuple: DataFrame with corrections removed, DataFrame of corrections
    """
    # get original column names
    cols = df.columns.tolist()

    # filter out corrections
    # 1. Using correction fields
    bool_na = array(df["correction_of"].isna())

    # 2. Searching other fields
    search_1 = search_columns(df, [r"^[crxz][\d]{1,2}-(?:[\w]{2,4}-)?[\d]+"], ["document_number"],
                              return_column="indicator1")
    search_2 = search_columns(df, [r"(?:;\scorrection\b)|(?:\bcorrecting\samend[\w]+\b)"], ["title", "action"],
                              return_column="indicator2")
    bool_search = array(search_1["indicator1"] == 1) | array(search_2["indicator2"] == 1)

    # separate corrections from non-corrections
    df_no_corrections = df.loc[(bool_na & ~bool_search), cols]  # remove flagged documents
    df_corrections = df.loc[(~bool_na | bool_search), cols]

    # return filtered results
    if len(df) == len(df_no_corrections) + len(df_corrections):
        return df_no_corrections, df_corrections
    else:
        raise SearchError(f"{len(df)} != {len(df_no_corrections)} + {len(df_corrections)}")


def get_significant_rules(df, start_date):
    process_columns = ("significant", "3f1_significant", )
    if date.fromisoformat(start_date) < date(2023, 4, 6):
        raise DataAvailabilityError("This program does not calculate significant rule counts prior to Executive Order 14094 of April 6, 2023.")
    else:
        document_numbers = df.loc[:, "document_number"].to_list()
        df, last_updated = get_significant_info(df, start_date, document_numbers)
        for col in process_columns:
            bool_na = df[col].isna()
            df.loc[bool_na, col] = "0"
            df.loc[:, col] = df[col].replace(".", "0").astype("int64")
        bool_3f1 = df["3f1_significant"] == 1
        bool_sig = df["significant"] == 1
        df.loc[:, "3f1_significant"] = 0
        df.loc[bool_3f1, "3f1_significant"] = 1
        df.loc[:, "other_significant"] = 0
        df.loc[(bool_sig & ~bool_3f1), "other_significant"] = 1
    return df, last_updated


def get_agency_metadata_values(
    df: DataFrame,
    agency_column: str,
    metadata: dict,
    metadata_value: str,
):
    if metadata_value == "acronym":
        metadata_value = "short_name"
    return df.loc[:, agency_column].apply(
        lambda x: metadata.get(x, {}).get(metadata_value)
    )


def groupby_agency(
    df: DataFrame,
    group_col: str = "parent_slug",
    value_col: str = "document_number",
    aggfunc: str = "count",
    significant: bool = True,
    metadata: dict | None = None,
    metadata_value: str = "acronym",
):
    aggfunc_dict = {value_col: aggfunc, }
    if significant:
        aggfunc_dict.update({
            "3f1_significant": "sum",
            "other_significant": "sum",
        })
    df_ex = df.explode(group_col, ignore_index=True)
    grouped = df_ex.groupby(
        by=group_col
    ).agg(
        aggfunc_dict
    ).reset_index()
    grouped = grouped.sort_values(value_col, ascending=False).rename(
        columns={
            group_col: "agency",
            value_col: "rules",
        }, errors="ignore"
    )
    if metadata is not None:
        grouped.loc[:, metadata_value] = get_agency_metadata_values(
            grouped,
            agency_column="agency",
            metadata=metadata,
            metadata_value=metadata_value
        )
    cols = ["agency", metadata_value, "rules", "3f1_significant", "other_significant"]
    grouped = grouped.loc[:, [c for c in cols if c in grouped.columns]]
    return grouped


def groupby_ym(
    df: DataFrame,
    group_col: tuple | list = ("publication_year", "publication_month", ),
    value_col: str = "document_number",
    aggfunc: str = "count",
    significant: bool = True
):
    aggfunc_dict = {value_col: aggfunc, }
    if significant:
        aggfunc_dict.update({
            "3f1_significant": "sum",
            "other_significant": "sum",
        })
    grouped = df.groupby(
        by=list(group_col)
    ).agg(
        aggfunc_dict
    ).reset_index()
    grouped = grouped.rename(columns={
        value_col: "rules",
    }, errors="ignore")
    return grouped


def save_csv(path: Path, df_all: DataFrame, df_agency: DataFrame, df_ym: DataFrame, transition_year: int):
    files = (
        f"rules_{transition_year - 1}_{transition_year}.csv",
        f"rules_by_agency_{transition_year - 1}_{transition_year}.csv",
        f"rules_by_month_{transition_year - 1}_{transition_year}.csv"
    )
    dataframes = (df_all, df_agency, df_ym)
    for data, file in zip(dataframes, files):
        data.to_csv(path / file, index=False)


def plot_agency(df, group_col = "acronym", value_col = "rules"):

    order_list = df.loc[:, group_col].to_list()[::-1]

    plot = (
        ggplot(
            df,
            aes(x=group_col, y=value_col),
        )
        + geom_col()
        + coord_flip()
        + scale_x_discrete(limits=order_list)
        + labs(y="", x="", title="Number of Rules Published by Agency")
        + theme_light()
    )
    return plot


def plot_month(df, group_cols = ("publication_year", "publication_month"), value_col = "rules"):

    df.loc[:, "ym"] = df[group_cols[0]].astype(str) + "-" + df[group_cols[1]].astype(str).str.pad(2, fillchar="0")
    order_list = df.loc[:, "ym"].to_list()

    plot = (
        ggplot(
            df,
            aes(x="ym", y=value_col),
        )
        + geom_col()
        + scale_x_discrete(limits=order_list)
        + labs(y="", x="", title="Number of Rules Published by Month")
        + theme_light()
    )
    return plot


def get_rules_in_window(start_date: str, get_significant: bool = True):
    date_range = get_date_range(start_date)
    transition_year = date_range.get("transition_year")
    results = get_rules(date_range)
    df = format_documents(results)
    df, _ = filter_corrections(df)
    df = filter_new_admin_rules(df, transition_year)
    if get_significant:
        df, last_updated = get_significant_rules(df, start_date)
    else:
        last_updated = date.today()
    return df, last_updated


DF, LAST_UPDATED = get_rules_in_window(START_DATE, get_significant=GET_SIGNIFICANT)


def main(start_date, save_data: bool = True, path: Path | None = None, metadata: dict | None = None, significant: bool = True):
    if date.fromisoformat(start_date) < date(2023, 4, 6):
        significant = False
    date_range = get_date_range(start_date)
    transition_year = date_range.get("transition_year")
    df, _ = get_rules_in_window(start_date, get_significant=significant)

    df_agency = groupby_agency(df, metadata=metadata, significant=significant)
    df_ym = groupby_ym(df, significant=significant)

    if save_data:
        if path is None:
            path = Path(__file__).parent
        save_csv(path, df, df_agency, df_ym, transition_year)

    return df, df_agency, df_ym


if __name__ == "__main__":

    pass
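A hypothetical driver for the `main()` entry point above (the committed `__main__` block is a no-op). The start date and output directory here are illustrative assumptions, and importing the module triggers live Federal Register API requests:

from pathlib import Path

from get_rules_in_window import main, METADATA

# pull final rules in the CRA window, attach significance data, and save CSVs
df_all, df_agency, df_ym = main(
    "2024-03-01",                 # ISO-format start of the lookback window
    save_data=True,               # writes three CSVs via save_csv()
    path=Path(__file__).parent,   # output directory (must already exist)
    metadata=METADATA,            # agency metadata used for acronym lookups
    significant=True,             # forced to False for starts before 2023-04-06
)
print(df_agency.head())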
requirements.txt
ADDED
Binary file (2.01 kB)
search_columns.py
ADDED
@@ -0,0 +1,86 @@
import itertools
import re

from numpy import array
from pandas import DataFrame


class SearchError(Exception):
    """Search returned misaligned results."""
    pass


# Defining a function to search for string patterns within dataframe columns
def search_columns(df: DataFrame,
                   patterns: list,
                   columns: list,
                   return_as: str = "indicator_column",
                   return_column: str = "indicator",
                   re_flags = re.I | re.X):
    """Search columns for string patterns within dataframe columns.

    Args:
        df (DataFrame): Input data in format of pandas dataframe.
        patterns (list): List of string patterns to input, compatible with regex.
        columns (list): List of column names to search for input patterns.
        return_as (str, optional): Return a DataFrame with indicator column ("indicator_column") or filtered by the search terms ("filtered_df"). Defaults to "indicator_column".
        return_column (str, optional): Name of the indicator column added to the results. Defaults to "indicator".
        re_flags (optional): Regex flags to use. Defaults to re.I | re.X.

    Raises:
        TypeError: Raises exception when `patterns` or `columns` parameters are not lists.
        ValueError: Raises exception when `patterns` or `columns` parameters have incorrect length.
        ValueError: Raises exception when `return_as` parameter receives an incorrect value.

    Returns:
        DataFrame: DataFrame with "indicator" column or filtered by search terms.
    """
    # create list object for appending boolean arrays
    bool_list = []

    # ensure that input patterns and columns are formatted as lists
    if not (isinstance(patterns, list) and isinstance(columns, list)):
        raise TypeError('Inputs for "patterns" and "columns" keywords must be lists.')

    if len(patterns) == len(columns):
        # create list of inputs in format [(pattern1, column1), (pattern2, column2), ...]
        inputs = list(zip(patterns, columns))

        # loop over list of inputs
        for i in inputs:
            searchre = df[i[1]].str.contains(i[0], regex=True, case=False, flags=re_flags)
            searchbool = array([True if n is True else False for n in searchre])
            bool_list.append(searchbool)

    elif (len(patterns) == 1) and (len(patterns) != len(columns)):
        # create list of inputs in format [(pattern, column1), (pattern, column2), ...]
        inputs = list(itertools.product(patterns, columns))

        # loop over list of inputs
        for i in inputs:
            searchre = df[i[1]].str.contains(i[0], regex=True, case=False, flags=re_flags)
            searchbool = array([True if n is True else False for n in searchre])
            bool_list.append(searchbool)

    else:  # e.g., patterns formatted as a list of len(n>1) but does not match len(columns)
        raise ValueError("Lengths of inputs are incorrect. Lengths of 'patterns' and 'columns' can either match, or a single pattern can map to multiple columns.")

    # combine each "searchbool" array elementwise
    # we want a positive match for any column to evaluate as True
    # equivalent to (bool_list[0] | bool_list[1] | bool_list[2] | ... | bool_list[n-1])
    filter_bool = array(bool_list).any(axis=0)

    if return_as == "indicator_column":
        dfResults = df.copy(deep=True)
        dfResults.loc[:, return_column] = 0
        dfResults.loc[filter_bool, return_column] = 1
        #print(f"Count {return_column}: {sum(dfResults[return_column].values)}")
        return dfResults

    elif return_as == "filtered_df":
        # filter results
        dfResults = df.loc[filter_bool, :].copy(deep=True)
        #print(f"Count {return_column}: {len(dfResults)}")
        return dfResults

    else:
        raise ValueError("Incorrect input for 'return_as' parameter.")
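A minimal usage sketch of `search_columns()` with made-up rows, mirroring the correction-flagging pattern used in `get_rules_in_window.py`; a single pattern fans out across both columns:

from pandas import DataFrame

from search_columns import search_columns

df = DataFrame({
    "title": ["Air Quality Standards; Correction", "Energy Conservation Program"],
    "action": ["Final rule; correcting amendment.", "Final rule."],
})
flagged = search_columns(
    df,
    [r"(?:;\scorrection\b)|(?:\bcorrecting\samend[\w]+\b)"],  # one pattern...
    ["title", "action"],                                      # ...searched in each column
    return_column="is_correction",
)
print(flagged["is_correction"].tolist())  # [1, 0]: only the first row matches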
settings.json
ADDED
@@ -0,0 +1,6 @@
{
    "[python]": {
        "editor.defaultFormatter": "ms-python.black-formatter"
    },
    "python.formatting.provider": "none"
}
significant.py
ADDED
@@ -0,0 +1,133 @@
# gather details on rule significance from FR tracking document
# see: https://github.com/regulatorystudies/Reg-Stats/blob/main/data/fr_tracking/fr_tracking.csv

from datetime import date
import polars as pl
from pandas import (
    DataFrame as pd_DataFrame,
    read_csv as pd_read_csv,
    to_datetime,
)


def read_csv_data(
    start_date: date | str,
    retrieve_columns: list | tuple = (
        "publication_date",
        "document_number",
        "significant",
        "econ_significant",
        "3(f)(1) significant",
        "Major"
    ),
    url: str = r"https://raw.githubusercontent.com/regulatorystudies/Reg-Stats/main/data/fr_tracking/fr_tracking.csv"
):
    # handle dates formatted as str
    if isinstance(start_date, str):
        start_date = date.fromisoformat(start_date)

    # drop econ_significant column for dates on or after EO 14094
    if start_date >= date.fromisoformat("2023-04-06"):
        cols = [col for col in retrieve_columns if col != "econ_significant"]
    else:
        cols = list(retrieve_columns)

    # read csv; try different encoding if raises error
    try:
        df_pd = pd_read_csv(url, usecols=cols)
    except UnicodeDecodeError:
        df_pd = pd_read_csv(url, usecols=cols, encoding="latin")

    df_pd.loc[:, "publication_dt"] = to_datetime(df_pd["publication_date"], format="mixed", dayfirst=False, yearfirst=False)
    max_date = max(df_pd.loc[:, "publication_dt"].to_list()).date()
    #print(max_date)
    cols.remove("publication_date")
    df = pl.from_pandas(df_pd.loc[:, cols])

    if df.shape[1] == len(cols):
        # rename columns if they exist
        rename_cols = {"3(f)(1) significant": "3f1_significant", "Major": "major"}
        if all(True if rename in cols else False for rename in rename_cols.keys()):
            df = df.rename(rename_cols)
            cols = [rename_cols.get(col, col) for col in cols]

        return df, cols, max_date
    else:
        return None, cols, max_date


def clean_data(df: pl.DataFrame,
               document_numbers: list,
               clean_columns: list | tuple,
               #format_not_available_values: str = ".",
               return_optimized_plan = False
               ):

    # start a lazy query
    lf = (
        df.lazy()
        # strip whitespace
        .with_columns(pl.col("document_number").str.strip_chars())
        # only keep document_numbers from input
        .filter(pl.col("document_number").is_in(document_numbers))
        # temporarily format "not available" data (input as dots)
        #.with_columns(pl.col(c for c in clean_columns if c != "document_number").str.replace_all(".", f"{format_not_available_values}", literal=True))
        # cast to nullable int dtype
        #.with_columns(pl.col(c for c in clean_columns if c != "document_number").cast(pl.Int64, strict=False))
    )

    # return optimized query plan instead of df
    if return_optimized_plan:
        return lf.explain(optimized=True)

    # call collect to return df
    return lf.collect()


def merge_with_api_results(pd_df: pd_DataFrame,
                           pl_df: pl.DataFrame
                           ):

    main_df = pl.from_pandas(pd_df)
    df = main_df.join(pl_df, on="document_number", how="left", validate="1:1")
    return df.to_pandas()


def get_significant_info(input_df, start_date, document_numbers):

    pl_df, clean_cols, max_date = read_csv_data(start_date)
    if pl_df is None:
        print("Failed to integrate significance tracking data with retrieved documents.")
        # return max_date alongside the unmodified input so callers can still unpack two values
        return input_df, max_date
    pl_df = clean_data(pl_df, document_numbers, clean_cols)
    pd_df = merge_with_api_results(input_df, pl_df)
    return pd_df, max_date


if __name__ == "__main__":

    date_a = "2023-04-05"
    date_b = "2023-04-06"
    numbers = [
        "2021-01303",
        "2023-28006",
        "2024-00149",
        "2024-00089",
        "2023-28828",
        "2024-00300",
        "2024-00045",
        "2024-00192",
        "2024-00228",
        "2024-00187",
    ]

    # test for dates before EO 14094 (read_csv_data returns a 3-tuple)
    df_a, clean_cols, _ = read_csv_data(date_a)
    df_a = clean_data(df_a, numbers, clean_cols)

    # test for dates after EO 14094
    df_b, clean_cols, _ = read_csv_data(date_b)
    df_b = clean_data(df_b, numbers, clean_cols)

    #df_b.rename({"test": "test1"})
    #print(df_a.shape, df_b.shape)