Mark Febrizio committed
Commit bb93e21
try 2
Files changed:
- .gitattributes +35 -0
- .gitignore +167 -0
- Dockerfile +13 -0
- LICENSE +21 -0
- README.md +20 -0
- app.py +190 -0
- get_rules_in_window.py +319 -0
- requirements.txt +0 -0
- search_columns.py +86 -0
- settings.json +6 -0
- significant.py +133 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
data/rules_2024_2025.csv filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,167 @@
# ----- Project Specific ----- #

app_planning.txt


# ----- Python ----- #

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
Dockerfile
ADDED
@@ -0,0 +1,13 @@
FROM python:3.10

WORKDIR /code

COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

COPY . .

EXPOSE 7860

CMD ["shiny", "run", "app.py", "--host", "0.0.0.0", "--port", "7860"]
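To try the container locally (Hugging Face Spaces builds and serves the image automatically), the usual workflow would be something like `docker build -t cra-window .` followed by `docker run -p 7860:7860 cra-window`, then opening http://localhost:7860 in a browser; the tag `cra-window` is just an illustrative name.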
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 GW Regulatory Studies Center

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
ADDED
@@ -0,0 +1,20 @@
---
title: Cra Window Rules
emoji: 🌍
colorFrom: yellow
colorTo: indigo
sdk: docker
pinned: false
license: mit
---

This is a templated Space for [Shiny for Python](https://shiny.rstudio.com/py/).

To get started with a new app, do the following:

1) Install Shiny with `pip install shiny`
2) Create a new app with `shiny create .`
3) Then run the app with `shiny run --reload`

To learn more about this framework, please see the [Documentation](https://shiny.rstudio.com/py/docs/overview.html).
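For reference, a Shiny express app can be as small as the sketch below. This is a generic illustration of the framework, not the app in this commit, and assumes a recent `shiny` release with the express API:

from shiny.express import input, render, ui

ui.input_slider("n", "N", min=0, max=100, value=20)

@render.text
def txt():
    # re-renders automatically whenever the slider input changes
    return f"n*2 is {input.n() * 2}"

Saved as app.py, it runs with `shiny run --reload` from the same directory.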
app.py
ADDED
@@ -0,0 +1,190 @@
import asyncio
from datetime import datetime, date, time

from faicons import icon_svg

from get_rules_in_window import (
    DF,
    LAST_UPDATED,
    START_DATE,
    GET_SIGNIFICANT,
    METADATA,
    groupby_agency,
    groupby_ym,
    plot_agency,
    plot_month,
)

from shiny import reactive
from shiny.express import input, render, ui

FOOTER = f"""
-----

Developed by the [GW Regulatory Studies Center](https://go.gwu.edu/regstudies). See our page on the [Congressional Review Act](https://regulatorystudies.columbian.gwu.edu/congressional-review-act) for more information.
"""

ui.page_opts(
    title="Rules in the Congressional Review Act (CRA) Window",  #fillable=True,
)

with ui.sidebar(title="Settings"):
    ui.input_date("start_date", "Start of window", value=START_DATE, min=START_DATE, max=date.today())

    ui.input_switch("switch", "Show significant rules in plots", False)
    #ui.input_checkbox_group(
    #    "significant",
    #    "EO 12866 Significance",
    #    ["Section 3(f)(1)", "Other"],
    #)

with ui.layout_column_wrap():
    with ui.value_box(showcase=icon_svg("book")):
        "All final rules"

        @render.text
        def count_rules():
            return f"{filtered_df()['document_number'].count()}"

    with ui.value_box(showcase=icon_svg("book")):
        "Other Significant rules"

        @render.text
        def count_other_significant():
            output = "Not available"
            if GET_SIGNIFICANT:
                output = f"{filtered_df()['other_significant'].sum()}"
            return output

    with ui.value_box(showcase=icon_svg("book")):
        "Section 3(f)(1) Significant rules"

        @render.text
        def count_3f1_significant():
            output = "Not available"
            if GET_SIGNIFICANT:
                output = f"{filtered_df()['3f1_significant'].sum()}"
            return output

with ui.navset_card_underline(title=""):

    with ui.nav_panel("Rules in detail"):
        @render.data_frame
        def table_rule_detail():
            df = filtered_df()
            #print(df.columns)
            #df.loc[:, "date"] = df.apply(lambda x: f"{x['publication_year']}-{x['publication_month']}-{x['publication_day']}", axis=1)
            df.loc[:, "date"] = df.apply(lambda x: f"{x['publication_date'].date()}", axis=1)
            char = " "
            df.loc[:, "title"] = df["title"].apply(lambda x: f"{char.join(x.split(char)[:9])}...")
            df.loc[:, "agencies"] = df["parent_slug"].apply(lambda x: "; ".join(x))
            cols = [
                "date",
                "title",
                "agencies",
                "3f1_significant",
                "other_significant",
            ]
            return render.DataTable(df.loc[:, [c for c in cols if c in df.columns]])

    with ui.nav_panel("By month"):

        with ui.layout_columns():

            @render.plot
            def plot_by_month():
                grouped = grouped_df_month()
                return plot_month(
                    grouped
                )

            @render.data_frame
            def table_by_month():
                grouped = grouped_df_month()
                cols = [
                    "publication_year",
                    "publication_month",
                    "rules",
                    "3f1_significant",
                    "other_significant",
                ]
                return render.DataTable(grouped.loc[:, [c for c in cols if c in grouped.columns]])

    with ui.nav_panel("By agency"):

        with ui.layout_columns():

            @render.plot
            def plot_by_agency():
                grouped = grouped_df_agency()
                return plot_agency(
                    grouped.head(10),
                )

            @render.data_frame
            def table_by_agency():
                grouped = grouped_df_agency()
                cols = [
                    "agency",
                    "acronym",
                    "rules",
                    "3f1_significant",
                    "other_significant",
                ]
                return render.DataTable(grouped.loc[:, [c for c in cols if c in grouped.columns]])

with ui.accordion(open=False):

    with ui.accordion_panel("Download Data"):

        @render.download(
            label="Download data as CSV",
            filename=f"rules_in_cra_window_accessed_{date.today()}.csv",
        )
        async def download():
            await asyncio.sleep(0.25)
            yield filtered_df().to_csv(index=False)

with ui.accordion(open=False):

    with ui.accordion_panel("Notes"):

        ui.markdown(
            f"""
Rule data retrieved from the [Federal Register API](https://www.federalregister.gov/developers/documentation/api/v1).

Executive Order 12866 significance data last updated **{LAST_UPDATED}**.
"""
        )

ui.markdown(
    FOOTER
)

#ui.tags.footer()


# ----- REACTIVE CALCULATIONS ----- #


@reactive.calc
def filtered_df():
    filt_df = DF
    #filt_df = df[df["species"].isin(input.species())]
    try:
        filt_df = filt_df.loc[filt_df["publication_date"] >= input.start_date()]
    except TypeError:
        filt_df = filt_df.loc[filt_df["publication_date"] >= datetime.combine(input.start_date(), time(0, 0))]
    return filt_df

@reactive.calc
def grouped_df_month():
    filt_df = filtered_df()
    grouped = groupby_ym(filt_df, significant=GET_SIGNIFICANT)
    return grouped

@reactive.calc
def grouped_df_agency():
    filt_df = filtered_df()
    grouped = groupby_agency(filt_df, metadata=METADATA, significant=GET_SIGNIFICANT)
    return grouped
get_rules_in_window.py
ADDED
@@ -0,0 +1,319 @@
from datetime import date
from pathlib import Path

from fr_toolbelt.api_requests import get_documents_by_date
from fr_toolbelt.preprocessing import process_documents, AgencyMetadata
from numpy import array
from pandas import DataFrame, to_datetime
from plotnine import (
    ggplot,
    aes,
    geom_col,
    labs,
    coord_flip,
    scale_x_discrete,
    theme_light,
)

try:
    from search_columns import search_columns, SearchError
    from significant import get_significant_info
except ModuleNotFoundError:
    from .search_columns import search_columns, SearchError
    from .significant import get_significant_info


METADATA, _ = AgencyMetadata().get_agency_metadata()
START_DATE = "2024-03-01"
GET_SIGNIFICANT = date.fromisoformat(START_DATE) >= date(2023, 4, 6)


class DataAvailabilityError(Exception):
    pass


def get_date_range(start_date: str):
    start_year = date.fromisoformat(start_date).year
    end_year = start_year + 1
    date_range = {
        "start": start_date,
        "end": f"{end_year}-01-31",
        "transition_year": end_year,
    }
    return date_range


def get_rules(date_range: dict) -> list[dict]:
    results, _ = get_documents_by_date(
        start_date=date_range.get("start"),
        end_date=date_range.get("end"),
        document_types=("RULE", )
    )
    return results


def format_documents(documents: list[dict]):
    """Format Federal Register documents to generate count by presidential year.

    Args:
        documents (list[dict]): List of documents.

    Returns:
        DataFrame: Pandas DataFrame with formatted data.
    """
    # process agency info in documents
    documents = process_documents(
        documents,
        which=("agencies", "presidents"),
        return_values_as_str=False
    )

    # create dataframe
    df = DataFrame(documents)

    # convert publication date to datetime format
    df.loc[:, "publication_dt"] = to_datetime(df["publication_date"])
    df.loc[:, "publication_date"] = df.apply(lambda x: x["publication_dt"].date(), axis=1)
    df.loc[:, "publication_year"] = df.apply(lambda x: x["publication_dt"].year, axis=1)
    df.loc[:, "publication_month"] = df.apply(lambda x: x["publication_dt"].month, axis=1)
    df.loc[:, "publication_day"] = df.apply(lambda x: x["publication_dt"].day, axis=1)

    # return dataframe
    return df


def filter_new_admin_rules(
    df: DataFrame,
    transition_year: int,
    date_col: str = "publication_date",
):

    admin_transitions = {
        2001: "george-w-bush",
        2009: "barack-obama",
        2017: "donald-trump",
        2021: "joe-biden",
    }

    bool_date = array(df[date_col] >= date(transition_year, 1, 20))
    bool_prez = array(df["president_id"] == admin_transitions.get(transition_year))
    bool_ = bool_date & bool_prez
    return df.loc[~bool_]


def filter_corrections(df: DataFrame):
    """Filter out corrections from Federal Register documents.
    Identifies corrections using the `correction_of` field and regex searches of the `document_number`, `title`, and `action` fields.

    Args:
        df (DataFrame): Federal Register data.

    Returns:
        tuple: DataFrame with corrections removed, DataFrame of corrections
    """
    # get original column names
    cols = df.columns.tolist()

    # filter out corrections
    # 1. Using correction fields
    bool_na = array(df["correction_of"].isna())

    # 2. Searching other fields
    search_1 = search_columns(df, [r"^[crxz][\d]{1,2}-(?:[\w]{2,4}-)?[\d]+"], ["document_number"],
                              return_column="indicator1")
    search_2 = search_columns(df, [r"(?:;\scorrection\b)|(?:\bcorrecting\samend[\w]+\b)"], ["title", "action"],
                              return_column="indicator2")
    bool_search = array(search_1["indicator1"] == 1) | array(search_2["indicator2"] == 1)

    # separate corrections from non-corrections
    df_no_corrections = df.loc[(bool_na & ~bool_search), cols]  # remove flagged documents
    df_corrections = df.loc[(~bool_na | bool_search), cols]

    # return filtered results
    if len(df) == len(df_no_corrections) + len(df_corrections):
        return df_no_corrections, df_corrections
    else:
        raise SearchError(f"{len(df)} != {len(df_no_corrections)} + {len(df_corrections)}")


def get_significant_rules(df, start_date):
    process_columns = ("significant", "3f1_significant", )
    if date.fromisoformat(start_date) < date(2023, 4, 6):
        raise DataAvailabilityError("This program does not calculate significant rule counts prior to Executive Order 14094 of April 6, 2023.")
    else:
        document_numbers = df.loc[:, "document_number"].to_list()
        df, last_updated = get_significant_info(df, start_date, document_numbers)
        for col in process_columns:
            bool_na = df[col].isna()
            df.loc[bool_na, col] = "0"
            df.loc[:, col] = df[col].replace(".", "0").astype("int64")
        bool_3f1 = df["3f1_significant"] == 1
        bool_sig = df["significant"] == 1
        df.loc[:, "3f1_significant"] = 0
        df.loc[bool_3f1, "3f1_significant"] = 1
        df.loc[:, "other_significant"] = 0
        df.loc[(bool_sig & ~bool_3f1), "other_significant"] = 1
    return df, last_updated


def get_agency_metadata_values(
    df: DataFrame,
    agency_column: str,
    metadata: dict,
    metadata_value: str,
):
    if metadata_value == "acronym":
        metadata_value = "short_name"
    return df.loc[:, agency_column].apply(
        lambda x: metadata.get(x, {}).get(metadata_value)
    )


def groupby_agency(
    df: DataFrame,
    group_col: str = "parent_slug",
    value_col: str = "document_number",
    aggfunc: str = "count",
    significant: bool = True,
    metadata: dict | None = None,
    metadata_value: str = "acronym",
):
    aggfunc_dict = {value_col: aggfunc, }
    if significant:
        aggfunc_dict.update({
            "3f1_significant": "sum",
            "other_significant": "sum",
        })
    df_ex = df.explode(group_col, ignore_index=True)
    grouped = df_ex.groupby(
        by=group_col
    ).agg(
        aggfunc_dict
    ).reset_index()
    grouped = grouped.sort_values(value_col, ascending=False).rename(
        columns={
            group_col: "agency",
            value_col: "rules",
        }, errors="ignore"
    )
    if metadata is not None:
        grouped.loc[:, metadata_value] = get_agency_metadata_values(
            grouped,
            agency_column="agency",
            metadata=metadata,
            metadata_value=metadata_value
        )
    cols = ["agency", metadata_value, "rules", "3f1_significant", "other_significant"]
    grouped = grouped.loc[:, [c for c in cols if c in grouped.columns]]
    return grouped


def groupby_ym(
    df: DataFrame,
    group_col: tuple | list = ("publication_year", "publication_month", ),
    value_col: str = "document_number",
    aggfunc: str = "count",
    significant: bool = True
):
    aggfunc_dict = {value_col: aggfunc, }
    if significant:
        aggfunc_dict.update({
            "3f1_significant": "sum",
            "other_significant": "sum",
        })
    grouped = df.groupby(
        by=list(group_col)
    ).agg(
        aggfunc_dict
    ).reset_index()
    grouped = grouped.rename(columns={
        value_col: "rules",
    }, errors="ignore")
    return grouped


def save_csv(path: Path, df_all: DataFrame, df_agency: DataFrame, df_ym: DataFrame, transition_year: int):
    files = (
        f"rules_{transition_year - 1}_{transition_year}.csv",
        f"rules_by_agency_{transition_year - 1}_{transition_year}.csv",
        f"rules_by_month_{transition_year - 1}_{transition_year}.csv"
    )
    dataframes = (df_all, df_agency, df_ym)
    for data, file in zip(dataframes, files):
        data.to_csv(path / file, index=False)


def plot_agency(df, group_col = "acronym", value_col = "rules"):

    order_list = df.loc[:, group_col].to_list()[::-1]

    plot = (
        ggplot(
            df,
            aes(x=group_col, y=value_col),
        )
        + geom_col()
        + coord_flip()
        + scale_x_discrete(limits=order_list)
        + labs(y="", x="", title="Number of Rules Published by Agency")
        + theme_light()
    )
    return plot


def plot_month(df, group_cols = ("publication_year", "publication_month"), value_col = "rules"):

    df.loc[:, "ym"] = df[group_cols[0]].astype(str) + "-" + df[group_cols[1]].astype(str).str.pad(2, fillchar="0")
    order_list = df.loc[:, "ym"].to_list()

    plot = (
        ggplot(
            df,
            aes(x="ym", y=value_col),
        )
        + geom_col()
        + scale_x_discrete(limits=order_list)
        + labs(y="", x="", title="Number of Rules Published by Month")
        + theme_light()
    )
    return plot


def get_rules_in_window(start_date: str, get_significant: bool = True):
    date_range = get_date_range(start_date)
    transition_year = date_range.get("transition_year")
    results = get_rules(date_range)
    df = format_documents(results)
    df, _ = filter_corrections(df)
    df = filter_new_admin_rules(df, transition_year)
    if get_significant:
        df, last_updated = get_significant_rules(df, start_date)
    else:
        last_updated = date.today()
    return df, last_updated


DF, LAST_UPDATED = get_rules_in_window(START_DATE, get_significant=GET_SIGNIFICANT)


def main(start_date, save_data: bool = True, path: Path | None = None, metadata: dict | None = None, significant: bool = True):
    if date.fromisoformat(start_date) < date(2023, 4, 6):
        significant = False
    date_range = get_date_range(start_date)
    transition_year = date_range.get("transition_year")
    df, _ = get_rules_in_window(start_date, get_significant=significant)

    df_agency = groupby_agency(df, metadata=metadata, significant=significant)
    df_ym = groupby_ym(df, significant=significant)

    if save_data:
        if path is None:
            path = Path(__file__).parent
        save_csv(path, df, df_agency, df_ym, transition_year)

    return df, df_agency, df_ym


if __name__ == "__main__":

    pass
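A hypothetical driver for the `main()` entry point above (the committed `__main__` block is a no-op). The start date and output directory here are illustrative assumptions, and importing the module triggers live Federal Register API requests:

from pathlib import Path

from get_rules_in_window import main, METADATA

# pull final rules in the CRA window, attach significance data, and save CSVs
df_all, df_agency, df_ym = main(
    "2024-03-01",                 # ISO-format start of the lookback window
    save_data=True,               # writes three CSVs via save_csv()
    path=Path(__file__).parent,   # output directory (must already exist)
    metadata=METADATA,            # agency metadata used for acronym lookups
    significant=True,             # forced to False for starts before 2023-04-06
)
print(df_agency.head())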
requirements.txt
ADDED
Binary file (2.01 kB)
search_columns.py
ADDED
@@ -0,0 +1,86 @@
import itertools
import re

from numpy import array
from pandas import DataFrame


class SearchError(Exception):
    """Search returned misaligned results."""
    pass


# Defining a function to search for string patterns within dataframe columns
def search_columns(df: DataFrame,
                   patterns: list,
                   columns: list,
                   return_as: str = "indicator_column",
                   return_column: str = "indicator",
                   re_flags = re.I | re.X):
    """Search columns for string patterns within dataframe columns.

    Args:
        df (DataFrame): Input data in format of pandas dataframe.
        patterns (list): List of string patterns to input, compatible with regex.
        columns (list): List of column names to search for input patterns.
        return_as (str, optional): Return a DataFrame with indicator column ("indicator_column") or filtered by the search terms ("filtered_df"). Defaults to "indicator_column".
        return_column (str, optional): Name of the indicator column added to the results. Defaults to "indicator".
        re_flags (optional): Regex flags to use. Defaults to re.I | re.X.

    Raises:
        TypeError: Raises exception when `patterns` or `columns` parameters are not lists.
        ValueError: Raises exception when `patterns` or `columns` parameters have incorrect length.
        ValueError: Raises exception when `return_as` parameter receives an incorrect value.

    Returns:
        DataFrame: DataFrame with "indicator" column or filtered by search terms.
    """
    # create list object for appending boolean arrays
    bool_list = []

    # ensure that input patterns and columns are formatted as lists
    if not (isinstance(patterns, list) and isinstance(columns, list)):
        raise TypeError('Inputs for "patterns" and "columns" keywords must be lists.')

    if len(patterns) == len(columns):
        # create list of inputs in format [(pattern1, column1), (pattern2, column2), ...]
        inputs = list(zip(patterns, columns))

        # loop over list of inputs
        for i in inputs:
            searchre = df[i[1]].str.contains(i[0], regex=True, case=False, flags=re_flags)
            searchbool = array([True if n is True else False for n in searchre])
            bool_list.append(searchbool)

    elif (len(patterns) == 1) and (len(patterns) != len(columns)):
        # create list of inputs in format [(pattern, column1), (pattern, column2), ...]
        inputs = list(itertools.product(patterns, columns))

        # loop over list of inputs
        for i in inputs:
            searchre = df[i[1]].str.contains(i[0], regex=True, case=False, flags=re_flags)
            searchbool = array([True if n is True else False for n in searchre])
            bool_list.append(searchbool)

    else:  # e.g., patterns formatted as a list of len(n>1) but does not match len(columns)
        raise ValueError("Lengths of inputs are incorrect. Lengths of 'patterns' and 'columns' can either match, or a single pattern can map to multiple columns.")

    # combine each "searchbool" array elementwise
    # we want a positive match for any column to evaluate as True
    # equivalent to (bool_list[0] | bool_list[1] | bool_list[2] | ... | bool_list[n-1])
    filter_bool = array(bool_list).any(axis=0)

    if return_as == "indicator_column":
        dfResults = df.copy(deep=True)
        dfResults.loc[:, return_column] = 0
        dfResults.loc[filter_bool, return_column] = 1
        #print(f"Count {return_column}: {sum(dfResults[return_column].values)}")
        return dfResults

    elif return_as == "filtered_df":
        # filter results
        dfResults = df.loc[filter_bool, :].copy(deep=True)
        #print(f"Count {return_column}: {len(dfResults)}")
        return dfResults

    else:
        raise ValueError("Incorrect input for 'return_as' parameter.")
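A minimal usage sketch of `search_columns()` with made-up rows, mirroring the correction-flagging pattern used in `get_rules_in_window.py`; a single pattern fans out across both columns:

from pandas import DataFrame

from search_columns import search_columns

df = DataFrame({
    "title": ["Air Quality Standards; Correction", "Energy Conservation Program"],
    "action": ["Final rule; correcting amendment.", "Final rule."],
})
flagged = search_columns(
    df,
    [r"(?:;\scorrection\b)|(?:\bcorrecting\samend[\w]+\b)"],  # one pattern...
    ["title", "action"],                                      # ...searched in each column
    return_column="is_correction",
)
print(flagged["is_correction"].tolist())  # [1, 0]: only the first row matches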
settings.json
ADDED
@@ -0,0 +1,6 @@
{
    "[python]": {
        "editor.defaultFormatter": "ms-python.black-formatter"
    },
    "python.formatting.provider": "none"
}
significant.py
ADDED
@@ -0,0 +1,133 @@
# gather details on rule significance from FR tracking document
# see: https://github.com/regulatorystudies/Reg-Stats/blob/main/data/fr_tracking/fr_tracking.csv

from datetime import date
import polars as pl
from pandas import (
    DataFrame as pd_DataFrame,
    read_csv as pd_read_csv,
    to_datetime,
)


def read_csv_data(
    start_date: date | str,
    retrieve_columns: list | tuple = (
        "publication_date",
        "document_number",
        "significant",
        "econ_significant",
        "3(f)(1) significant",
        "Major"
    ),
    url: str = r"https://raw.githubusercontent.com/regulatorystudies/Reg-Stats/main/data/fr_tracking/fr_tracking.csv"
):
    # handle dates formatted as str
    if isinstance(start_date, str):
        start_date = date.fromisoformat(start_date)

    # drop econ_significant column for dates on or after EO 14094
    if start_date >= date.fromisoformat("2023-04-06"):
        cols = [col for col in retrieve_columns if col != "econ_significant"]
    else:
        cols = list(retrieve_columns)

    # read csv; try different encoding if raises error
    try:
        df_pd = pd_read_csv(url, usecols=cols)
    except UnicodeDecodeError:
        df_pd = pd_read_csv(url, usecols=cols, encoding="latin")

    df_pd.loc[:, "publication_dt"] = to_datetime(df_pd["publication_date"], format="mixed", dayfirst=False, yearfirst=False)
    max_date = max(df_pd.loc[:, "publication_dt"].to_list()).date()
    #print(max_date)
    cols.remove("publication_date")
    df = pl.from_pandas(df_pd.loc[:, cols])

    if df.shape[1] == len(cols):
        # rename columns if they exist
        rename_cols = {"3(f)(1) significant": "3f1_significant", "Major": "major"}
        if all(True if rename in cols else False for rename in rename_cols.keys()):
            df = df.rename(rename_cols)
            cols = [rename_cols.get(col, col) for col in cols]

        return df, cols, max_date
    else:
        return None, cols, max_date


def clean_data(df: pl.DataFrame,
               document_numbers: list,
               clean_columns: list | tuple,
               #format_not_available_values: str = ".",
               return_optimized_plan = False
               ):

    # start a lazy query
    lf = (
        df.lazy()
        # strip whitespace
        .with_columns(pl.col("document_number").str.strip_chars())
        # only keep document_numbers from input
        .filter(pl.col("document_number").is_in(document_numbers))
        # temporarily format "not available" data (input as dots)
        #.with_columns(pl.col(c for c in clean_columns if c != "document_number").str.replace_all(".", f"{format_not_available_values}", literal=True))
        # cast to nullable int dtype
        #.with_columns(pl.col(c for c in clean_columns if c != "document_number").cast(pl.Int64, strict=False))
    )

    # return optimized query plan instead of df
    if return_optimized_plan:
        return lf.explain(optimized=True)

    # call collect to return df
    return lf.collect()


def merge_with_api_results(pd_df: pd_DataFrame,
                           pl_df: pl.DataFrame
                           ):

    main_df = pl.from_pandas(pd_df)
    df = main_df.join(pl_df, on="document_number", how="left", validate="1:1")
    return df.to_pandas()


def get_significant_info(input_df, start_date, document_numbers):

    pl_df, clean_cols, max_date = read_csv_data(start_date)
    if pl_df is None:
        print("Failed to integrate significance tracking data with retrieved documents.")
        # return max_date alongside the unmodified input so callers can still unpack two values
        return input_df, max_date
    pl_df = clean_data(pl_df, document_numbers, clean_cols)
    pd_df = merge_with_api_results(input_df, pl_df)
    return pd_df, max_date


if __name__ == "__main__":

    date_a = "2023-04-05"
    date_b = "2023-04-06"
    numbers = [
        "2021-01303",
        "2023-28006",
        "2024-00149",
        "2024-00089",
        "2023-28828",
        "2024-00300",
        "2024-00045",
        "2024-00192",
        "2024-00228",
        "2024-00187",
    ]

    # test for dates before EO 14094 (read_csv_data returns a 3-tuple)
    df_a, clean_cols, _ = read_csv_data(date_a)
    df_a = clean_data(df_a, numbers, clean_cols)

    # test for dates after EO 14094
    df_b, clean_cols, _ = read_csv_data(date_b)
    df_b = clean_data(df_b, numbers, clean_cols)

    #df_b.rename({"test": "test1"})
    #print(df_a.shape, df_b.shape)