davidna22 committed on
Commit
dad00c5
1 Parent(s): 9cb35ba

Upload folder using huggingface_hub

.README ADDED
@@ -0,0 +1,31 @@
# Steps to run

## Pip install requirements

```bash
pip install -r requirements.txt
```

## Follow the Analysis.ipynb notebook for notebook-format results

### Recommended: view the Gradio application for the fullest experience

## Option 1: Run the Gradio app locally

```bash
python /path/to/app.py
```

## Option 2: Access the web application at [https://dna-casestudy.com/](https://dna-casestudy.com/)

## Option 3: Build the Docker container

```bash
cd /path/to/folder
docker build -t my-case-study .
docker run -p 7860:7860 my-case-study
```

## If run locally, the app is served at [http://localhost:7860](http://localhost:7860)
Analysis.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Dockerfile ADDED
@@ -0,0 +1,10 @@
FROM python:3.10.2

WORKDIR /app

COPY . .
RUN python -m pip install -U pip
RUN pip install -r /app/requirements.txt

EXPOSE 7860
CMD ["python", "app.py"]
README.md CHANGED
@@ -1,12 +1,6 @@
 ---
-title: Dna Casestudy
-emoji: 🐢
-colorFrom: green
-colorTo: yellow
-sdk: gradio
-sdk_version: 4.28.3
+title: dna-casestudy
 app_file: app.py
-pinned: false
+sdk: gradio
+sdk_version: 4.27.0
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/utils.cpython-310.pyc ADDED
Binary file (26.8 kB)
 
app.py ADDED
@@ -0,0 +1,1255 @@
import sys
import inspect
import math
import pandas as pd
import numpy as np
import polars as pl
import seaborn as sns
import matplotlib
import utils
from matplotlib import pyplot as plt
import sklearn
import gradio as gr
from IPython.display import display, HTML
import plotly.figure_factory as ff
from sklearn.impute import SimpleImputer
from utils import create_seasons
from bs4 import BeautifulSoup
from bertopic import BERTopic
import html
import xgboost as xgb
from xgboost import plot_importance
from sklearn.metrics import r2_score, mean_absolute_percentage_error
from utils import find_variable_data, build_temporal_features, create_datetime, map_vals
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import folium
import gc
import json
from utils import MyNaiveImputer

matplotlib.use('agg')

# JS hook that forces Gradio's dark theme via the __theme query parameter
dark_mode = """
function refresh() {
    const url = new URL(window.location);

    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.href = url.href;
    }
}
"""

# Imputation Variables
wd_full_local = pd.read_csv("data/weather_aggregated_2010-2018.csv", index_col=0)
wd_full_local = wd_full_local.reset_index()
wd_full_local["Datetime"] = pd.to_datetime(wd_full_local["Datetime"], format="%Y-%m-%d")
wd_full_local = build_temporal_features(wd_full_local, "Datetime")
impute_cols = ['MeanTemp', 'MinTemp', 'MaxTemp', 'DewPoint',
               'Percipitation', 'WindSpeed', 'MaxSustainedWind',
               'Gust', 'Rain', 'SnowDepth', 'SnowIce']

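# Editor's note (assumption): MyNaiveImputer is defined in utils.py, which is not
# shown in this commit view. Per the app text it imputes each variable by
# aggregating historical values over Day Of Year (1-366), covering the 49 days
# missing from 2018 plus 7 future days (hence time_steps=49+7 below). A minimal
# sketch of that idea:
#   day_of_year = wd_full_local["Datetime"].dt.dayofyear
#   doy_means = wd_full_local.groupby(day_of_year)["MeanTemp"].mean()
#   filled = doy_means.loc[missing_days_of_year]   # hypothetical lookup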
my_imputer = MyNaiveImputer(wd_full_local, time_steps=49+7)
imputers = {
    "Mean": my_imputer.impute_all(impute_cols, strategy="mean"),
    "Median": my_imputer.impute_all(impute_cols, strategy="median"),
    "Max": my_imputer.impute_all(impute_cols, strategy="max"),
    "Min": my_imputer.impute_all(impute_cols, strategy="min")
}

# Merged Data Variables
data_merged = pd.read_csv("data/data_merged_full.csv", index_col=0)
data_merged = create_datetime(data_merged, "Datetime", format="%Y-%m-%d")
data_merged["Day Of Week"] = data_merged["Datetime"].dt.day_name()
data_merged["Year String"] = data_merged["Year"].astype(str)
data_merged["Month String"] = data_merged["Datetime"].dt.month_name()
data_merged["Rain Bool"] = data_merged["Rain"].astype(bool)
data_merged["SnowIce Bool"] = data_merged["SnowIce"].astype(bool)
data_merged = data_merged.set_index("Datetime")
weather_full_df = data_merged.loc[data_merged["Year"] <= 2018].copy()
data_merged_eda = data_merged.loc[(data_merged["Year"] <= 2018) & (data_merged["Year"] >= 2016)]

# Feature Preprocessing
data_preprocess = data_merged.loc[(data_merged["Year"] >= 2016)].copy()
data_preprocess["Gust_lin"] = data_preprocess["Gust"].interpolate(method="linear")
data_preprocess["Gust_spline3"] = data_preprocess["Gust"].interpolate(method="spline", order=3)
data_preprocess["Gust_spline5"] = data_preprocess["Gust"].interpolate(method="spline", order=5)
data_preprocess["Gust_quad"] = data_preprocess["Gust"].interpolate(method="quadratic")
data_preprocess["Gust"] = data_preprocess["Gust"].interpolate(method="linear")
data_preprocess["DewPoint_old"] = data_preprocess["DewPoint"]
data_preprocess["DewPoint_diff7d"] = data_preprocess["DewPoint"] - data_preprocess["DewPoint"].shift(7)
data_preprocess["DewPoint"] = data_preprocess["DewPoint_diff7d"]
data_preprocess["MinTemp_old"] = data_preprocess["MinTemp"]
data_preprocess["MinTemp_log"] = data_preprocess["MinTemp"].apply(np.log1p)
data_preprocess["MinTemp_log_diff7d"] = data_preprocess["MinTemp_log"] - data_preprocess["MinTemp_log"].shift(7)
data_preprocess["MinTemp"] = data_preprocess["MinTemp_log_diff7d"]

# Final Preprocessed Variables
data_final = pd.read_csv("data/data_final.csv")
data_final = create_datetime(data_final, "Datetime", format="%Y-%m-%d")
data_final = data_final.set_index("Datetime")
test = data_final[-7:]
dataset = data_final[:-7]
split_point = int(len(data_final[:-7]) * 0.75)
train, val = dataset[:split_point], dataset[split_point:]
X_train, y_train = train.drop(columns="Target"), train["Target"]
X_val, y_val = val.drop(columns="Target"), val["Target"]
X_test, y_test = test.drop(columns="Target"), test["Target"]
forecast_model = xgb.XGBRegressor()
forecast_model.load_model("models/final_model.json")

# Current Predictions (metrics hardcoded from the final training run)
r2_train = 0.8691238468740025
mape_train = 0.04889510400934162
r2_val = 0.6072642783665692
mape_val = 0.6072642783665692

# Initial Variables
reports = {
    "weather_2011-2018": BeautifulSoup(open("reports/weather_data_ts.html"), "html.parser"),
    "weather_2016-2018": BeautifulSoup(open("reports/weather_data_after2016_ts.html"), "html.parser"),
    "service_full": BeautifulSoup(open("reports/311_data_1.html"), "html.parser")
}

iframe_dp_weather, _ = find_variable_data(reports["weather_2011-2018"], "MeanTemp")
iframe_dp_service, _ = find_variable_data(reports["service_full"], "Created Date")

# Code Variables to show in app
load_code = """
# Load Weather Data in pandas
# No need for polars because the data is sufficiently small
weather_data = pd.read_csv("data/weather_NY_2010_2018Nov.csv")

# Load Service data in polars for speed optimization
# Loading directly with polars leads to errors:
# load in pandas, then convert to polars
service_data_pd = pd.read_csv("data/311-2016-2018.csv")
assert service_data_pd["Unique Key"].nunique() == len(service_data_pd)
# This casting is done just because of some errors when loading pl from pandas
service_data_pd["Incident Zip"] = service_data_pd["Incident Zip"].astype("string")
service_data_pd["BBL"] = service_data_pd["BBL"].astype("string")
service_data = pl.DataFrame(service_data_pd)

# Clear some RAM
del service_data_pd
gc.collect()"""


map_code = """
lat_min = service_data["Latitude"].min()
lat_max = service_data["Latitude"].max()
long_min = service_data["Longitude"].min()
long_max = service_data["Longitude"].max()

mincon_lat = weather_data["Latitude"] >= lat_min
maxcon_lat = weather_data["Latitude"] <= lat_max
mincon_long = weather_data["Longitude"] >= long_min
maxcon_long = weather_data["Longitude"] <= long_max
wd_localized = weather_data.loc[mincon_lat & maxcon_lat & mincon_long & maxcon_long]
"""

Closed_Ticket_Code = """
# Fill nulls and typos with the mean time diff (13 days)
service_data = service_data.with_columns(
    Closed_Date_New = pl.when(pl.col("Created Date") - pl.col("Closed Date") > pl.duration(days=1))
        .then(pl.col("Created Date") + pl.duration(days=mean_diff))
        .otherwise(pl.col("Closed Date")).fill_null(pl.col("Created Date") + pl.duration(days=mean_diff))
)

# Check for no null values
assert service_data["Closed_Date_New"].is_null().sum() == 0

# Pairwise GroupBy and Filter
closed_tickets = (
    service_data.group_by(["Closed_Date_New", "Created Date"])
    # Count only tickets where Created Date <= Closed Date
    .agg((pl.when(pl.col("Created Date") <= pl.col("Closed_Date_New")).then(1).otherwise(0)).sum().alias("count"))
    # Sort by the new column Closed_Date_New
    .sort("Closed_Date_New")
    # Keep only Closed Dates inside the time window
    .filter((pl.col("Closed_Date_New").dt.year() >= 2016) & (pl.col("Closed_Date_New").dt.year() < 2019))
    # Final GroupBy on Closed Date after filtering
    .group_by("Closed_Date_New").agg(pl.col("count").sum().alias("num_closed_tickets"))
)

ct_df = closed_tickets.with_columns(
    pl.col("num_closed_tickets")
)
"""

topic_model = BERTopic.load("models/BERTopic")


def plot_imputations(var, data, imputers=imputers):
    plt.close('all')
    fig = plt.figure(figsize=(15, 5))
    plt.plot(data["Datetime"][-800:], data[var][-800:], label="Actual")
    plt.title(f"{var} Imputation")
    for method in imputers:
        plt.plot(imputers[method]["Datetime"], imputers[method][var], label=method)

    plt.legend()

    return gr.update(value=fig)


def plot_timeseries(data, var, data_name="My", all_vars=[], height=800, width=600):
    plt.close('all')
    if var == "":
        return gr.update()

    from utils import plot_timeseries
    fig = plot_timeseries(data, var, data_name, all_vars, height, width)

    return gr.update(value=fig)


def plot_bivariate(data, x, y, subset=None, trendline=True):
    plt.close('all')
    map_var = {
        "Year": "Year String",
        "Season": "Season",
        "Month": "Month String",
        "Day Of Week": "Day Of Week",
        "Weekend": "is_weekend",
        "Holiday": "is_holiday",
        "Rain": "Rain Bool",
        "SnowIce": "SnowIce Bool",
        "None": None,
        "": None,
    }
    subset = map_var[subset]

    from utils import plot_bivariate
    fig = plot_bivariate(data, x, y, subset, trendline)

    return gr.update(value=fig)


def plot_seasonality(data, x, y, show_box=True, show_outliers=False):
    plt.close('all')
    map_var = {
        "Year": "Year String",
        "Season": "Season",
        "Month": "Month String",
        "Day Of Week": "Day Of Week",
        "Weekend": "is_weekend",
        "Holiday": "is_holiday",
        "Rain": "Rain Bool",
        "SnowIce": "SnowIce Bool",
        "None": None,
    }
    x = map_var[x]

    from utils import plot_seasonality
    fig = plot_seasonality(data, x, y, show_box, show_outliers)

    return gr.update(value=fig)


def plot_correlations(data, covar, target="Target", lags=[0, 1, 2, 3, 4, 5, 6, 7, 8, 13, 14, 15, 21], method="pearson"):
    plt.close('all')
    from utils import plot_correlations
    fig = plot_correlations(data, covar, target, lags, method)

    return gr.update(value=fig)


def plot_autocorr(data, var, apply=None):
    plt.close('all')
    from utils import plot_acf, plot_pacf
    time_series = data.loc[:, var].to_frame().copy()
    if apply:
        time_series[var] = time_series[var].apply(apply)
    fig, ax = plt.subplots(2, 1, figsize=(12, 8))
    _ = plot_acf(time_series[var], lags=30, ax=ax[0])
    _ = plot_pacf(time_series[var], lags=30, method="ols-adjusted", ax=ax[1])
    _ = plt.suptitle(f"{var}", y=0.95)

    return gr.update(value=fig)


def plot_all_correlations(data, data_name="weather", method="pearson"):
    plt.close('all')
    from utils import plot_all_correlations
    fig = plot_all_correlations(data, data_name, method)

    return fig


def run_report(report_base, variable_name, report_category="full"):
    report_name = report_base + "_" + report_category
    iframe, _ = find_variable_data(reports[report_name], variable_name)
    return gr.update(value=iframe)


def test_stationary(data, var):
    from utils import test_stationary
    df = test_stationary(data, var)

    return df


def plot_interpolation(data):
    plt.close('all')
    from utils import plot_gust_interpolation
    fig = plot_gust_interpolation(data)

    return fig


def plot_model_feature_importance():
    plt.close('all')
    from utils import plot_final_feature_importance
    fig = plot_final_feature_importance(forecast_model)

    return fig


def plot_final_predictions():
    plt.close('all')
    from utils import predict_recurse
    next_7_day_prediction = predict_recurse(dataset, test, forecast_model)
    fig, _ = plt.subplots(figsize=(15, 5))
    data_final.loc[data_final.index[-7:], "Target"] = next_7_day_prediction
    ax = data_final.loc[data_final.index[-96:-6], "Target"].plot(label="Real", title="311 Service Volume: 7 Day Prediction")
    data_final.loc[data_final.index[-7:], "Target"].plot(label="Forecast", ax=ax)
    ax.legend()

    curr_fig = plt.gcf()
    plt.close()

    return curr_fig


def plot_train_split():
    plt.close('all')
    from utils import plot_train_split
    fig = plot_train_split(train, val)

    return fig


def plot_val_predicitons():
    # NOTE: relies on a preds_val variable that is not defined in this file
    data = val.copy()
    data["Prediction"] = preds_val

    from utils import plot_predictions

    fig = plot_predictions(train, val, preds_val)

    return fig


curr_theme = gr.themes.Default(
    text_size=gr.themes.sizes.text_lg
)

with gr.Blocks(theme=curr_theme, js=dark_mode, css=open("custom.css", "r").read()) as app:
    title = gr.HTML("""<h1 align="center">Point72 Case Study</h1>""")
    with gr.Tabs() as pages:

        with gr.Tab("Overview") as toc_page:
            gr.Markdown("# My Point72 Case Study Results")
            gr.Markdown("""
                * Please follow the tabs sequentially left to right to get the full story of my work
                * There will be many interactive parts where you will be able to test and view different parameters
                * This app may also be built and run locally
                * This app is hosted and served from a cloud server VM Instance
                * Any questions, please email me: davidna22@gmail.com
            """)

        with gr.Tab("Data Preprocessing") as data_preprocessing_page:

            with gr.Tab("Data Loading") as dp_overview:
                gr.HTML("<h1 style=\"text-align: center;\">Loading the Data</h1>")
                gr.Markdown("## Goal: Load the Data as efficiently as possible")
                gr.Markdown("""
                    * Using Pandas alone is **slow and inefficient**.
                    * With small datasets, pandas is great because the API is robust.
                    * With medium datasets, using a library like polars (a Rust-based module with 10x pandas speed) is much faster.
                    * As data gets even larger, multi-processing frameworks like Spark are required.
                    * For this dataset, I use pandas for the weather data and polars for the 311 data. After the aggregation and merge, I revert to pandas for API compatibility.
                """)

                with gr.Accordion("Code", open=False):
                    gr.Code(load_code, language="python")

            with gr.Tab("Location Mapping") as dp_mapping:
                # html.escape lets the saved folium maps be inlined via iframe srcdoc
                src_doc = html.escape(open("figures/map1.html", "r").read())
                iframe1 = f'<iframe width="500px" height="500px" srcdoc="{src_doc}" frameborder="0"></iframe>'
                src_doc = html.escape(open("figures/map2.html", "r").read())
                iframe2 = f'<iframe width="500px" height="500px" srcdoc="{src_doc}" frameborder="0"></iframe>'
                src_doc = html.escape(open("figures/bounded_map.html", "r").read())
                iframe3 = f'<iframe width="500px" height="500px" srcdoc="{src_doc}" frameborder="0"></iframe>'
                src_doc = html.escape(open("figures/final_map.html", "r").read())
                iframe4 = f'<iframe width="500px" height="500px" srcdoc="{src_doc}" frameborder="0"></iframe>'

                gr.HTML("<h1 style=\"text-align: center;\">Location Mapping for Both Datasets</h1>")
                with gr.Row(elem_classes="map-legend"):
                    gr.Markdown("""
                        **Legend:**
                        * <span style=\"color: red\">Red:</span> Weather records
                        * <span style=\"color: #5989ff\">Blue:</span> 311 Service records
                    """, elem_classes="map-legend-text")

                with gr.Row():
                    with gr.Column():
                        gr.HTML("<h1 style=\"text-align: center; margin: 0px;\">Map of New York State</h1>")
                        map1 = gr.HTML(iframe1, elem_classes="map")
                    with gr.Column():
                        gr.HTML("<h1 style=\"text-align: center; margin: 0px;\">Map of New York City</h1>")
                        map2 = gr.HTML(iframe2, elem_classes="map")

                with gr.Row():
                    gr.Markdown("""
                        Juxtaposing these two maps and the approximate distributions of data observations,
                        it's easy to see the problem: the weather dataset covers a larger area than the 311 Service call dataset.
                        Once this problem was diagnosed, the solution was simple. First, find the max coordinate (Lat, Long) bounds
                        of the 311 Service Dataset. Then, filter the weather dataset to only include points that fall within
                        these bounds. This was one of my initial discoveries when analyzing the dataset and crucial to ensure
                        congruity between the two. **Below you can see the bounding box I created and how the new weather data
                        observations fit in this bounding box.**
                    """)

                with gr.Row():
                    with gr.Column():
                        map3 = gr.HTML(iframe3, elem_classes="map")
                    with gr.Column():
                        map4 = gr.HTML(iframe4, elem_classes="map")

                with gr.Accordion("Code", open=False):
                    gr.Code(map_code, language="python")

            with gr.Tab("Variable Pruning") as var_pruning:
                gr.HTML("<h1 style=\"text-align: center;\">How I pruned the datasets</h1>")
                gr.Markdown("## Goal: Remove as many useless features as possible")
                gr.HTML("<h3 style=\"color: darkorange;\">Key Factors for Feature Removal</h3>")
                gr.Markdown("""
                    * Percentage of missing data points
                    * Distribution Imbalance
                    * Irrelevance
                    * Number of distinct categories
                    * Another variable was chosen as replacement <br/><br/>
                    NOTE: Look in the appendix for visualizations of individual variables
                """)
                dropped_var_df = pd.read_excel("data/drop_vars.xlsx")
                gr.Dataframe(
                    dropped_var_df,
                    wrap=True,
                    label="Dropped Variables & Justification (Weather on Bottom)"
                )

            with gr.Tab("Time Aggregation") as time_agg:
                gr.HTML("<h1 style=\"text-align: center;\">Aggregate Data by Date</h1>")
                gr.Markdown("## Goal: Aggregate data by Date")
                gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: 311 Service data is not inherently formatted to provide Created Ticket Counts</h3>")
                gr.HTML("""
                    <ul style="font-size: 18px">
                        <li>Data must be aggregated by day to find ticket counts</li>
                        <li>Covariate features need a special transformation</li>
                        <li>Final Aggregations Mapping</li>
                        <ul style="padding-inline-start: 40px;">
                            <li>Created Date ==> groupby.count ==> Target (Created ticket count)</li>
                            <li>Closed Date ==> Agg* ==> Number of closed tickets (Agg* explained in next tabs)</li>
                            <li>Agency ==> Agg* ==> Number of tickets by Agency (Agg* explained in next tabs)</li>
                            <li>Borough ==> Agg* ==> Number of tickets by Borough (Agg* explained in next tabs)</li>
                            <li>Descriptor ==> Agg* ==> Number of tickets by Descriptor Group/Category (Agg* explained in next tabs)</li>
                        </ul>
                    </ul>""")
                gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: Weather data is not aggregated by day</h3>")
                gr.HTML("""
                    <ul style="font-size: 18px">
                        <li>To merge with 311 Service data, both datasets must be aggregated</li>
                        <li>Additional transformations may be applied only after time aggregation</li>
                        <li>Aggregation function needs to be handled feature by feature</li>
                        <li>Final Aggregation Mapping</li>
                        <ul style="padding-inline-start: 40px;">
                            <li>MaxTemp, MaxSustainedWind ==> groupby.max ==> Variables have an inherent max feature</li>
                            <li>MinTemp ==> groupby.min ==> Variable has an inherent min feature</li>
                            <li>Rain, SnowIce ==> groupby.mean.round ==> Binary variables are first aggregated then rounded back to binary</li>
                            <li>All Other Variables ==> groupby.mean ==> Mean used by default as it is the least lossy pooling method</li>
                        </ul>
                    </ul>""")

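            # Editor's sketch (assumption, not the committed pipeline): the daily
            # aggregation mapping described in the Time Aggregation tab above
            # corresponds roughly to
            #   target = service_data.group_by(pl.col("Created Date").dt.date()).agg(pl.len().alias("Target"))
            # for the 311 ticket counts, and for the weather data something like
            #   weather_daily = weather_data.groupby(weather_data["Datetime"].dt.date).agg(
            #       {"MaxTemp": "max", "MinTemp": "min", "Rain": "mean", "MeanTemp": "mean"})
            # with the binary means rounded back to 0/1 afterwards.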
            with gr.Tab("Weather Data: Imputation") as wd_impute:
                gr.HTML("<h1 style=\"text-align: center;\">Data Imputation</h1>")
                gr.Markdown("## Goal: Impute missing values in Weather Data")
                gr.HTML("<h3 style=\"color: darkorange;\">Issue: Weather data is incomplete, 49 days are missing in 2018</h3>")
                gr.Markdown("#### Proposed Solution: Use a simple imputer to fill these missing days + 7 more days into the \"future\"")
                gr.HTML("""
                    <ul style="font-size: 18px">
                        <li>Use a simple imputer rather than a robust imputation method to reduce model complexity</li>
                        <ul style="padding-inline-start: 40px;">
                            <li>Using a robust imputer = Conducting a multivariate forecast, Very complex & can be slow</li>
                            <li>Using a simple imputer = Low complexity, low latency</li>
                        </ul>
                        <li>Simple imputer applies an aggregate function using Day Of Year (1-366) as the interval</li>
                        <li>4 different Imputation Methods: Mean, Median, Min, Max</li>
                        <li>7 additional days are imputed so the weather data can be used as a future covariate in our model</li>
                        <li>Final Aggregation Mapping</li>
                        <ul style="padding-inline-start: 40px;">
                            <li>WindSpeed, MaxSustainedWind, Gust, SnowDepth => Use Mean => Noisy Variables, Non-Mean/Median methods are too biased, curve best fit with Mean</li>
                            <li>Rain => Use Max => Binary Variables with noise, min/mean/median imputes 0, which does not follow the trend</li>
                            <li>SnowIce => Use Min (impute 0) => Binary variables but mostly 0's, any other imputation is visually inaccurate</li>
                            <li>MeanTemp, MinTemp, MaxTemp, DewPoint, Percipitation => Use Min => Perhaps helping to remove non-stationarity (global warming), Winter is colder now than before, Curve best fits with min</li>
                        </ul>
                    </ul>""")

                gr.Markdown("Use the plots below to see the visual evidence for the reasoning above")
                with gr.Accordion("Show Plots", open=False):
                    impute_data = gr.State(wd_full_local)
                    impute_choices = ["None"]
                    impute_choices.extend(impute_cols)
                    wd_impute_col = gr.Dropdown(
                        choices=impute_choices,
                        value="None",
                        label="Choose a Variable to plot all imputation methods"
                    )

                    wd_impute_plot = gr.Plot()

                    wd_impute_col.change(
                        plot_imputations,
                        [wd_impute_col, impute_data],
                        [wd_impute_plot]
                    )

            with gr.Tab("311: Closed Ticket Counting") as ct_date:
                gr.HTML("<h1 style=\"text-align: center;\">Closed Ticket Feature</h1>")
                gr.Markdown("## The Closed Ticket Feature is built from the Closed Date column, similarly to how Created Date was used to generate the 311 Call Volume target")
                gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Data Errors, Typos, and/or Null values</h3>")
                gr.HTML("""
                    <ul style="font-size: 18px">
                        <li>Number of Null Values: </li>
                        <li>Number of tickets where Closed Date < Created Date: </li>
                        <ul style="padding-inline-start: 40px;">
                            <li>These values were most likely typos/data recording errors</li>
                            <li>For instance, some of these values dated to 1900</li>
                        </ul>
                        <li>SOLUTION: For every data error, impute with the mean difference (recompute Closed Date based off Created)</li>
                        <li>Mean is calculated as the mean time differential between all valid Closed & Created Dates</li>
                        <li>Mean Time Differential: 13 Days</li>
                    </ul>""")
                gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: Data Leakage - Future into Past</h3>")
                gr.HTML("""
                    <ul style="font-size: 18px">
                        <li>Most of the Closed Date values are 13 days ahead relative to Created Date</li>
                        <li>A GroupBy on Closed Date alone will leak some closed ticket counts into future created dates</li>
                        <li>SOLUTION: GroupBy [Closed Date, Created Date] pairwise, filter so Created Date < Closed Date</li>
                    </ul>""")
                with gr.Accordion("Code", open=False):
                    gr.Code(Closed_Ticket_Code, language="python")

            with gr.Tab("311: Categorical Grouping") as cat_groups:
                # keep the loaded model in app state (renamed so the BERTopic class is not shadowed)
                bertopic_state = gr.State(BERTopic.load("models/BERTopic"))
                gr.HTML("<h1 style=\"text-align: center;\">Categorical Features</h1>")
                gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Categorical Features have too many categories</h3>")
                gr.Markdown("#### Create a mapping of categories into groups to reduce the total number (Viewable at the bottom of the page)")
                gr.HTML("""
                    <ul style="font-size: 18px">
                        <li>Borough:</li>
                        <ul style="padding-inline-start: 40px;">
                            <li>Only 9 Categories without grouping</li>
                            <li>Four Categories are either typos or just null => Group all into OTHER</li>
                        </ul>
                        <li>Agency:</li>
                        <ul style="padding-inline-start: 40px;">
                            <li>30 Agencies in total are listed</li>
                            <li>Manual Research to group each Agency by Category of what they typically do</li>
                            <li>30 Agencies down to 7 Agency Groupings, based on frequency and research</li>
                        </ul>
                        <li>Complaint Type: Removed because analysis showed complaints were too related to the agency</li>
                        <ul style="padding-inline-start: 40px;">
                            <li>299 unique pairs out of 271 unique complaints => only ~10% difference in distribution</li>
                        </ul>
                        <li>Descriptor: Over 1000+ unique categories. Only way to realistically group is to use NLP</li>
                        <ul style="padding-inline-start: 40px;">
                            <li>Pretrained a BERTopic model to extract topics from the text</li>
                            <li>BERTopic uses TF-IDF & Transformers to extract topics from text</li>
                            <li>BERTopic reduced 1000 categories into 8 groups</li>
                        </ul>
                    </ul>""")

                gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: How do we aggregate these features by day when there are multiple repeated categories per day?</h3>")
                gr.Markdown("#### One Hot Encode and Sum per category")
                gr.HTML("""
                    <ul style="font-size: 18px">
                        <li>Step 1: One hot encode all the features before aggregation</li>
                        <li>Step 2: GroupBy date and Sum for each encoding</li>
                        <ul style="padding-inline-start: 40px;">
                            <li>Example: A categorical group with 4 categories</li>
                            <li>One Sum column per category representing the frequency of that category per day</li>
                        </ul>
                        <li>Main Downside: Highly correlated with Created Ticket data; aggregation method was essentially the same</li>
                        <ul style="padding-inline-start: 40px;">
                            <li>Summing across the four feature categories in the example above would just equal the ticket count</li>
                        </ul>
                        <li>Solution: Leave some categories out of the final vector to reduce bias (Shown in feature engineering stage)</li>
                    </ul>""")

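                # Editor's sketch (assumption): "one-hot encode then sum per day"
                # as described above is typically
                #   ohe = pd.get_dummies(df["Borough"], prefix="Borough")
                #   daily_counts = ohe.groupby(df["Created Date"].dt.date).sum()
                # so each column holds that category's ticket count for the day.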
                with gr.Accordion("View Feature Groups", open=False):
                    with gr.Accordion("Borough", open=False):
                        gr.JSON(json.loads(open("code/Borough.json", "r").read()))

                    with gr.Accordion("Agency", open=False):
                        gr.JSON(json.loads(open("code/Agency.json", "r").read()))

                    with gr.Accordion("Descriptor", open=False):
                        gr.Dataframe(topic_model.get_topic_info().loc[:, ["Count", "Name", "Representation"]])
                        gr.Plot(topic_model.visualize_barchart(list(range(-1, 6, 1))))

            with gr.Tab("All Code") as code_preprocess:
                gr.Markdown("# View Full Code for building Weather Data")
                with gr.Accordion(open=False):
                    gr.Code(open("code/build_weather.py", "r").read())

                gr.Markdown("# View Full Code for building 311 Service Data")
                with gr.Accordion(open=False):
                    gr.Code(open("code/build_service.py", "r").read())

        with gr.Tab("Exploratory Data Analysis", id="eda_page") as eda_page:
            bivar_data = gr.State(data_merged_eda)
            with gr.Tab("Overview", id="eda_overview") as eda_overview:
                gr.Markdown("# The EDA Section is intended to be a set of interactive visualizations")
                gr.Markdown("The tabs are interactive plots and tables that were used to generate the key insights below.")
                gr.HTML("<h3 style=\"color: darkorange;\">Key Insights</h3>")
                gr.HTML("""
                    <ul style="font-size: 18px">
                        <li>Missing Values:</li>
                        <ul style="padding-inline-start: 40px; font-size: 18px;">
                            <li>Gust, if used, may need interpolation to fill missing values</li>
                        </ul>
                        <li>Stationarity</li>
                        <ul style="padding-inline-start: 40px; font-size: 18px;">
                            <li>Weather variables exhibit various levels of non-stationarity (mostly based on trend but some constant)</li>
                            <ul style="padding-inline-start: 60px; font-size: 18px;">
                                <li>Trends are clear for some like Temperature and DewPoint</li>
                                <li>Possible cause of constant non-stationarity are factors such as global warming</li>
                            </ul>
                            <li>311 Calls may exhibit some forms of weekly non-stationarity</li>
                            <ul style="padding-inline-start: 60px; font-size: 18px;">
                                <li>Potentially weekly and monthly non-stationarity</li>
                                <li>Affected by Holidays and Weekends</li>
                                <li>More robust tests needed</li>
                            </ul>
                            <li>Action Item: Test for stationarity and remove</li>
                        </ul>
                        <li>Bivariate Interactions:</li>
                        <ul style="padding-inline-start: 40px; font-size: 18px;">
                            <li>311 Calls have stronger relationships with certain Agency, Borough and Descriptor categories</li>
                            <li>311 calls exhibit weak overall linear relationships with weather</li>
                            <ul style="padding-inline-start: 60px; font-size: 18px;">
                                <li>Monthly and Seasonal relationship is strongest in winter months</li>
                                <li>Month of January: strongest linear relationship between MinTemp, DewPoint</li>
                            </ul>
                        </ul>
                        <li>Seasonality:</li>
                        <ul style="padding-inline-start: 40px; font-size: 18px;">
                            <li>Weather variables exhibit a strong Yearly and Seasonal seasonality</li>
                            <li>311 Service Variables exhibit Weekly Seasonality</li>
                            <li>311 Variables affected strongly by holidays and weekends (fewer 311 calls on weekends and holidays)</li>
                        </ul>
                        <li>Correlation:</li>
                        <ul style="padding-inline-start: 40px; font-size: 18px;">
                            <li>Heavy Collinearity among weather variables (especially Min, Mean, MaxTemp)</li>
                            <li>Varying degrees of correlation among 311 covariates and 311 volume</li>
                        </ul>
                        <li>Lags & Autocorrelation:</li>
                        <ul style="padding-inline-start: 40px; font-size: 18px;">
                            <li>311 Service Calls have the highest correlation with the 7, 14, 21 day (weekly) lags</li>
                            <li>6 and 8 day lags have the second-strongest relationship; the 8 day lag exhibits some negative correlation</li>
                            <li>The 1 day lag exhibits correlation similar to the 6 and 7 day lags</li>
                        </ul>
                    </ul>""")

            with gr.Tab("Univariate", id="eda_univar") as eda_univar:

                with gr.Tab("Weather Data") as eda_uni_weather:
                    eda_univar_weatherdf = gr.State(weather_full_df)
                    gr.Markdown("# Use the Interactive plot below")
                    eda_uni_weather_name = gr.State("Weather")
                    weather_vars = [
                        "", 'MeanTemp', 'DewPoint', 'Percipitation', 'WindSpeed', 'Gust', 'SnowDepth',
                        'MinTemp', 'MaxTemp', 'MaxSustainedWind'
                    ]
                    select_weather_var = gr.Dropdown(
                        choices=weather_vars,
                        value="",
                        label="Select a Variable to View"
                    )

                    weather_uniplot = gr.Plot()

                    select_weather_var.change(
                        plot_timeseries,
                        inputs=[
                            eda_univar_weatherdf,
                            select_weather_var,
                            eda_uni_weather_name
                        ],
                        outputs=[
                            weather_uniplot
                        ]
                    )

                with gr.Tab("311 Service Data") as eda_uni_service:
                    eda_univar_servicedf = gr.State(data_merged_eda)
                    gr.Markdown("# Use the Interactive plot below")
                    gr.Markdown("**NOTE: Target is the count of 311 service records**")
                    eda_uni_service_name = gr.State("Service")
                    service_vars = [
                        "", 'Target', 'num_closed_tickets',
                        # Agency Group Counts
                        'AG_Buildings', 'AG_Environment & Sanitation', 'AG_Health',
                        'AG_Parks', 'AG_Security', 'AG_Transportation',
                        'AG_Other',
                        # Borough Counts
                        'Borough_BRONX', 'Borough_BROOKLYN', 'Borough_MANHATTAN',
                        'Borough_QUEENS', 'Borough_STATEN ISLAND',
                        'Borough_OTHER',
                        # Descriptor Group Counts
                        'DG_damaged_sign_sidewalk_missing',
                        'DG_english_emergency_spanish_chinese',
                        'DG_exemption_commercial_tax_business',
                        'DG_license_complaint_illegal_violation', 'DG_noise_animal_truck_dead',
                        'DG_odor_food_air_smoke', 'DG_order_property_inspection_condition',
                        'DG_water_basin_litter_missed'
                    ]
                    select_service_var = gr.Dropdown(
                        choices=service_vars,
                        value="",
                        label="Select a Variable to View"
                    )

                    service_uniplot = gr.Plot()

                    select_service_var.change(
                        plot_timeseries,
                        inputs=[
                            eda_univar_servicedf,
                            select_service_var,
                            eda_uni_service_name
                        ],
                        outputs=[
                            service_uniplot
                        ]
                    )

            with gr.Tab("Bivariate", id="eda_bivar") as eda_bivar:
                gr.Markdown("# Use the Interactive plot below")
                gr.Markdown("Use this tab to view relationships between the Target variable (number of tickets created daily) and a Covariate")
                with gr.Column():
                    with gr.Row() as bivar_params:
                        bivar_dist_target = gr.Dropdown(
                            choices=["Target"],
                            value="Target",
                            label="Target Variable (One option)"
                        )

                        all_bivars = ['num_closed_tickets', "Agency", "Borough", "Descriptor"]
                        all_bivars.extend(weather_vars)
                        all_bivars = sorted(all_bivars)
                        all_bivars = all_bivars[1:]  # drop the empty "" entry carried over from weather_vars
                        bivar_dist_cov = gr.Dropdown(
                            choices=all_bivars,
                            value="MeanTemp",
                            label="Select Covariate"
                        )
                        bivar_trendline = gr.Dropdown(
                            choices=[True, False],
                            value=True,
                            label="Graph with OLS Trendline"
                        )

                    with gr.Accordion("Add Seasonality", open=False):
                        bivar_subset = gr.Dropdown(
                            choices=["None", "Year", "Season", "Month", "Day Of Week", "Weekend", "Holiday"],
                            value="None",
                            label="Seasonality Options (Disabled for Agency, Borough and Descriptor)"
                        )

                    bivar_submit = gr.Button("Run")
                    bivar_plot = gr.Plot()
                    bivar_submit.click(
                        plot_bivariate,
                        [bivar_data, bivar_dist_cov, bivar_dist_target, bivar_subset, bivar_trendline],
                        bivar_plot
                    )

            with gr.Tab("Seasonality") as bivar_season:
                gr.Markdown("## Exploring the effect of Seasonality")

                with gr.Row() as bivar_season_params:
                    bivar_season_var = gr.Dropdown(
                        choices=["Target", 'MeanTemp', 'DewPoint',
                                 'Percipitation', 'WindSpeed', 'Gust', 'SnowDepth',
                                 'MinTemp', 'MaxTemp', 'MaxSustainedWind'],
                        value="Target",
                        label="Variable"
                    )

                    bivar_season_cov = gr.Dropdown(
                        choices=["Year", "Season", "Month", "Day Of Week", "Weekend", "Holiday", "Rain", "SnowIce"],
                        value="Year",
                        label="Seasonality"
                    )

                    with gr.Column():
                        season_boxplot = gr.Checkbox(value=True, label="Show Boxplot")
                        season_outlier = gr.Checkbox(value=False, label="Show Outliers")

                bivar_season_btn = gr.Button("Run")

                bivar_season_plot = gr.Plot()

                bivar_season_btn.click(
                    plot_seasonality,
                    [bivar_data, bivar_season_cov, bivar_season_var, season_boxplot, season_outlier],
                    [bivar_season_plot]
                )

            with gr.Tab("Correlation") as corr:

                with gr.Tab("Weather Correlations") as corr_weather:
                    gr.Plot(plot_all_correlations(data_merged_eda, "weather", method="pearson"))

                with gr.Tab("311 Service Correlations") as corr_service:
                    gr.Plot(plot_all_correlations(data_merged_eda, "service", method="pearson"))

                with gr.Tab("Lag Correlations") as corr_dynamic:
                    gr.Markdown("## Use this to dynamically view correlations based on Lag")
                    gr.Markdown("By default, we will analyze lags of [0,1,2,3,4,5,6,7,8,13,14,15,21] days for the chosen variable")
                    gr.Markdown("Scroll Down For AutoCorrelation Graphs")
                    with gr.Row():
                        corr_var_choices = [
                            "None", 'Target', 'num_closed_tickets',
                            # Weather Variables
                            'MeanTemp', 'DewPoint', 'Percipitation',
                            'WindSpeed', 'Gust', 'SnowDepth',
                            'MinTemp', 'MaxTemp', 'MaxSustainedWind',
                            # Agency Group Counts
                            'AG_Buildings', 'AG_Environment & Sanitation', 'AG_Health',
                            'AG_Parks', 'AG_Security', 'AG_Transportation',
                            'AG_Other',
                            # Borough Counts
                            'Borough_BRONX', 'Borough_BROOKLYN', 'Borough_MANHATTAN',
                            'Borough_QUEENS', 'Borough_STATEN ISLAND',
                            'Borough_OTHER',
                            # Descriptor Group Counts
                            'DG_damaged_sign_sidewalk_missing',
                            'DG_english_emergency_spanish_chinese',
                            'DG_exemption_commercial_tax_business',
                            'DG_license_complaint_illegal_violation', 'DG_noise_animal_truck_dead',
                            'DG_odor_food_air_smoke', 'DG_order_property_inspection_condition',
                            'DG_water_basin_litter_missed'
                        ]
                        corr_vars = gr.Dropdown(
                            choices=corr_var_choices,
                            value="Target",
                            label="Variable"
                        )

                    corr_btn = gr.Button("Run")
                    corr_plot = gr.Plot()
                    autocorr_plot = gr.Plot()

                    corr_btn.click(
                        plot_correlations,
                        [bivar_data, corr_vars],
                        [corr_plot]
                    )

                    corr_btn.click(
                        plot_autocorr,
                        [bivar_data, corr_vars],
                        [autocorr_plot]
                    )

        with gr.Tab("Feature Engineering") as feature_engineer_page:

            with gr.Tab("Feature Selection") as feature_select:
                gr.HTML("<h1 style=\"text-align: center;\">Select Features Based on EDA</h1>")
                gr.Markdown("### Below is the logic used in our model feature selection")
                gr.HTML("""
                    <ul style="font-size: 18px">
                        <li>Weather Covariates</li>
                        <ul style="padding-inline-start: 30px; font-size: 18px;">
                            <li>Weather variables exhibit various levels of non-stationarity (mostly based on trend but some constant)</li>
                            <li>MeanTemp, MaxTemp: High collinearity with MinTemp. MinTemp has the highest correlation of the 3 => REMOVE</li>
                            <ul style="padding-inline-start: 50px; font-size: 18px;">
                                <li>Possible Reason: High temps, people stay indoors. A/C doesn't break nowadays. Lower Temps lead to building/tech failure more often</li>
                            </ul>
                            <li>Percipitation: Bivariate plot shows weak relationship, outliers no effect on 311 => REMOVE</li>
                            <li>SnowDepth: High number of missing values, low correlation => REMOVE</li>
                            <li>Rain, SnowIce: Binary, plots (look in Seasonality Tab) show weak relationship, SnowIce heavily imbalanced (99% 0's) => REMOVE</li>
                        </ul>
                        <li>311 Service Covariates:</li>
                        <ul style="padding-inline-start: 30px; font-size: 18px;">
                            <li>LOO (Leave One - or many - Out) Encoding:</li>
                            <ul style="padding-inline-start: 50px; font-size: 18px;">
                                <li>Remove weakest features from our categorical covariates</li>
                                <li>Reduces bias and removes multicollinearity inherent to One-Hot Encoding</li>
                                <li>Candidates For Removal:</li>
                                <ul style="padding-inline-start: 70px; font-size: 18px;">
                                    <li>AG_Health, AG_Other: Lowest Correlation, lowest counts => REMOVE</li>
                                    <li>AG_Parks: Lowest Correlation, but low multi-collinearity => KEEP</li>
                                    <li>Borough_OTHER: Weakest Correlation, lowest count => REMOVE</li>
                                    <li>DG_english_emergency, DG_exemption_commercial: Weakest Correlation, lowest counts => REMOVE</li>
                                    <li>DG_odor_food_air_smoke: Lowest Count, but high correlation => KEEP</li>
                                </ul>
                            </ul>
                        </ul>
                    </ul>""")

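                # Editor's sketch (assumption): the LOO-style pruning above amounts to
                # dropping the weakest one-hot columns before modeling, e.g.
                #   loo_drop = ["AG_Health", "AG_Other", "Borough_OTHER",
                #               "DG_english_emergency_spanish_chinese",
                #               "DG_exemption_commercial_tax_business"]
                #   features = features.drop(columns=loo_drop)
                # which also breaks the columns-sum-to-total collinearity of a full one-hot.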
                with gr.Accordion("Show Final Variable List", open=False):
                    gr.JSON(json.loads(open("code/all_vars.json", "r").read()))

            with gr.Tab("Feature Preprocessing") as feature_prep:
                data_feature_prep = gr.State(data_preprocess)
                gr.HTML("<h1 style=\"text-align: center;\">Preprocess Features</h1>")
                gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Missing Values</h3>")
                gr.HTML("""
                    <ul style="font-size: 18px">
                        <li>Only one variable has missing values to impute: Gust</li>
                        <ul style="padding-inline-start: 30px; font-size: 18px;">
                            <li>Various interpolation methods were tested</li>
                            <li>Methods like Spline and Polynomial over-estimated some values, breaking inherent data ranges</li>
                            <li>Simple linear interpolation turned out to be best</li>
                        </ul>
                        <li>SOLUTION: Interpolate Gust with the Linear method</li>
                    </ul>""")

                with gr.Accordion("Show Interpolation Plots", open=False):
                    gr.Plot(plot_interpolation(data_preprocess))

                gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: Remove Non-Stationarity</h3>")
                gr.HTML("""
                    <ul style="font-size: 18px">
                        <li>Variables that are non-stationary change over time; they have a trend</li>
                        <li>Ideal to transform non-stationary variables for modeling</li>
                        <li>Ignore Categorical Variables (simply to keep model complexity low)</li>
                        <li>Numerical Variables were tested for Non-Stationarity using two methods: ADF and KPSS</li>
                        <ul style="padding-inline-start: 30px; font-size: 18px;">
                            <li>Using ADF and KPSS together can reveal what kind of trend exists in the data</li>
                            <li>Only 1 Case Met: Pass KPSS, Fail ADF = Trend Stationary (most likely by season)</li>
                        </ul>
                        <li>Only Two Variables failed the tests: DewPoint & MinTemp</li>
                        <li>SOLUTION: Use Differencing (7d lag) + Log for MinTemp and Differencing (7d lag) for DewPoint (Log caused many NaNs)</li>
                    </ul>""")

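                # Editor's sketch (assumption; test_stationary lives in utils.py and is
                # not shown): the ADF/KPSS pair can be run with statsmodels, e.g.
                #   from statsmodels.tsa.stattools import adfuller, kpss
                #   adf_p = adfuller(series.dropna())[1]                 # H0: unit root
                #   kpss_p = kpss(series.dropna(), regression="c")[1]    # H0: stationary
                # failing ADF while passing KPSS is the trend-stationary case cited above.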
                with gr.Accordion("View Results Below", open=False):
                    gr.Markdown("### MinTemp (Log) Tests Before and After Transformation")
                    with gr.Row():
                        with gr.Column():
                            gr.Dataframe(test_stationary(data_preprocess, "MinTemp_old"), label="MinTemp No Augments")
                        with gr.Column():
                            gr.Dataframe(test_stationary(data_preprocess, "MinTemp"), label="Log + 7 Day Lag Differencing")

                    gr.Markdown("### DewPoint Tests Before and After Transformation")
                    with gr.Row():
                        with gr.Column():
                            gr.Dataframe(test_stationary(data_preprocess, "DewPoint_old"), label="DewPoint No Augments")
                        with gr.Column():
                            gr.Dataframe(test_stationary(data_preprocess, "DewPoint"), label="7 Day Lag Differencing")

            with gr.Tab("Feature Engineering") as feature_eng:

                with gr.Tab("Past Covariates") as fe_past:
                    gr.HTML("<h1 style=\"text-align: center;\">Past Covariate Features</h1>")
                    gr.Markdown("""
                        * Past Covariates are datapoints that are only related to past information
                        * For instance, using past sales of product B to predict future sales of product A
                        * There are two ways to use past covariates
                        * *Option 1:* Build a multi-variate forecast to predict these variables simultaneously
                        * *Option 2:* Use a sliding window and lags to provide past data (especially for multi-step forecasts)
                    """)
                    gr.Markdown("**I will use Option 2 to avoid building a very complex multi-variate model**")
                    gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Leaking Future Data into the past</h3>")
                    gr.Markdown("""
                        * By using lags, I can shift my data in a way that avoids leaking future data into the past
                        * For predicting 7 days into the future, I must lag my data by at least 7 days
                        * Use a rolling window that will reset over time
                    """)
                    gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: Curse of Dimensionality</h3>")
                    gr.Markdown("""
                        * Possible to use many variations of lags, rollings and differences to generate many features
                        * Too many features lead to the curse of dimensionality, i.e. Overfitting
                        * Thus, I keep my Feature Set as simple as possible
                    """)
                    gr.Markdown("""
                        ### Feature Set
                        * Lags: 7D, 14D, 21D
                        * Rolling (Shifted 7 Days forward): Mean of 14D (14 because mean(Closed Date - Created Date) = 13 days)
                        * Differencing (7D difference = 7D lag - 14D lag): 7D
                    """)

                    with gr.Accordion("Open to view implementation code", open=False):
                        gr.Code(open("code/past_features.py", "r").read())
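                        # Editor's sketch (assumption; code/past_features.py is not shown
                        # here): lag/rolling/differencing features like those listed above
                        # are typically built as
                        #   df["closed_lag7"] = df["num_closed_tickets"].shift(7)
                        #   df["closed_roll14"] = df["num_closed_tickets"].shift(7).rolling(14).mean()
                        #   df["closed_diff7"] = df["num_closed_tickets"].shift(7) - df["num_closed_tickets"].shift(14)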

                with gr.Tab("Future Covariates") as fe_future:
                    gr.HTML("<h1 style=\"text-align: center;\">Future Covariate Features</h1>")
                    gr.Markdown("""
                        * Future Covariates are data that I have about the future
                        * For instance, I can use the projected revenue of Company A to predict daily sales
                        * For Future Covariates, I do not need to shift variables. I will provide a shift up to 2 days.
                        * I apply a rolling and an expanding window as more features
                        * Also, I use mean and min to follow the logic learned in EDA. Minimum temperature values seem to be more impactful on 311 volume
                    """)
                    gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Curse of Dimensionality</h3>")
                    gr.Markdown("""
                        * Similar to the Past Covariates, I keep the feature set as small and simple as possible
                        * The more features, the more we may overfit
                    """)
                    gr.Markdown("""
                        ### Feature Set
                        * Lags: 0D, 1D, 2D
                        * Rolling: Mean & Min of last 14D
                        * Expanding Window: Max, Min (min-length of 14)
                        * Differencing already performed to remove trends
                    """)

                    with gr.Accordion("Open to view implementation code", open=False):
                        gr.Code(open("code/future_features.py", "r").read())

                with gr.Tab("Target Variable") as fe_target:
                    gr.HTML("<h1 style=\"text-align: center;\">311 Service Calls Features</h1>")
                    gr.Markdown("""
                        * For feature transformations of our Target, we can follow a similar process as above
                        * Main Difference: Lags shorter than the prediction window need to be recomputed at each iteration
                        * So, for predicting at time (t+1) we need the predicted value at time (t)
                        * For a recursive prediction model, this means the model cannot make batch predictions without iterating
                    """)
                    gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: More variables increase complexity for prediction</h3>")
                    gr.Markdown("""
                        * The more features, the more overfitting & more computation
                        * As I will use a recursive model, these values must be recomputed at each step t+1
                        * In favor of a less complex model, I will choose as few features as possible (excluding rolling features, as they are prone to error on recalculation)
                    """)
                    gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: Leaking Future Data into the past</h3>")
                    gr.Markdown("""
                        * Must be careful about how these features are computed
                        * For instance, for the rolling mean, I shift the data by 1 lag first, then compute the rolling statistic
                        * For differencing, a 7D lag difference is really the 1D - 8D lag. (For t=8, 7D diff = t7-t1 not t8-t2)
                    """)
                    gr.Markdown("""
                        ### Feature Set
                        * Lags: 1D, 6D, 7D, 8D, 14D, 21D (based on highest correlations and weekly seasonality)
                        * Differencing: 7D, 14D
                    """)

                    with gr.Accordion("Open to view implementation code", open=False):
                        gr.Code(open("code/target_features.py", "r").read())

        with gr.Tab("Forecast Model") as model_select_train_page:

            with gr.Tab("Splitting the data") as model_split_tab:
                gr.HTML("<h1 style=\"text-align: center;\">Splitting Time-Series Data</h1>")
                gr.HTML("""
                    <ul style="font-size: 18px">
                        <li>Splitting Time-Series Data is different from splitting other data</li>
                        <li>Rather than splitting on random samples, you split the data by time, keeping order consistent</li>
                        <li>I took a 75% splitting approach, splitting my data at the date that sits at 75% of the data length</li>
                    </ul>""")
                gr.Markdown("#### As an example, I provide a graph showing exactly how I split my data")
                gr.Plot(plot_train_split())

            with gr.Tab("Model Selection") as model_select_tab:
                gr.HTML("<h1 style=\"text-align: center;\">Choosing the Right Model</h1>")
                gr.Markdown("### Types of Forecast Models for Multi-Step Prediction")
                gr.HTML("""
                    <ul style="font-size: 18px">
                        <li>Parallel Models: Train a model for each prediction (one for 1 day ahead, another for 2, etc.)</li>
                        <li>Recursive Models: Model makes a forecast, fills any values it needs for the next prediction, predicts again</li>
                        <ul style="padding-inline-start: 40px; font-size: 18px;">
                            <li>One of the assumptions was to build a model that was reasonable for production</li>
                            <li>Parallel models are hard to maintain as the steps of prediction increase</li>
                        </ul>
                        <li>Decision: Recursive Model</li>
                    </ul>""")
                gr.Markdown("### My Model Choice: XGBoost")
                gr.HTML("""
                    <ul style="font-size: 18px">
                        <li>Reasons for choosing:</li>
                        <ul style="padding-inline-start: 40px; font-size: 18px;">
                            <li>Industry standard for regression</li>
                            <li>Lightweight and relatively fast</li>
                            <li>Many parameters to tune, such as tree depth and regularization</li>
                            <li>Scale invariant - Data does not have to be scaled</li>
                            <li>Allows NaN values and categorical features without encodings (unused in my implementation)</li>
                            <li>Provides key explainability in its feature importance metrics</li>
                        </ul>
                        <li>Decision: Use XGBoost</li>
                    </ul>""")

            with gr.Tab("Model Training") as model_train_tab:
                gr.HTML("<h1 style=\"text-align: center;\">Training the Model</h1>")
                gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Overfitting</h3>")
                gr.HTML("""
                    <ul style="font-size: 18px">
                        <li>Main Cause: High number of variables and XGBoost's tendency to overfit without tuning</li>
                        <li>While training, effort was made to watch the validation and training sets' relative performance</li>
                        <li>Steps Taken to avoid Overfitting</li>
                        <ul style="padding-inline-start: 40px; font-size: 18px;">
                            <li>Low Learning Rate</li>
                            <li>Low Tree Depth</li>
                            <li>Keeping the Val score relatively close to the Training score</li>
                            <li>Increased l2-lambda parameter, boosting regularization</li>
                            <li>Many trials to get the best set of parameters</li>
                            <li>Implementing Early Stopping</li>
                        </ul>
                    </ul>""")
                gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: Choosing a Metric</h3>")
                gr.HTML("""
                    <ul style="font-size: 18px">
                        <li>Three metrics I considered: MAPE, MAE and MSE</li>
                        <li>MAPE seemed to show the most consistent and visually accurate results</li>
                        <li>Decision: MAPE</li>
                        <li>Justification: 311 Service volume is quite noisy and MAPE better estimates fit to a very noisy curve than the others</li>
                    </ul>""")
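                # Editor's sketch (assumption): the tuning described above maps to
                # XGBoost parameters along the lines of
                #   xgb.XGBRegressor(learning_rate=0.01, max_depth=3, reg_lambda=10.0,
                #                    early_stopping_rounds=50, eval_metric="mape")
                # fit with eval_set=[(X_val, y_val)] so train/val divergence is visible.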

            with gr.Tab("Model Prediction") as model_predict_tab:
                gr.HTML("<h1 style=\"text-align: center;\">Recursive Model Prediction</h1>")
                gr.Markdown("""
                    * Below is the code I wrote to implement the Recursive prediction explained in previous tabs
                    * Predictions are made one step at a time, where the prediction at t depends on the prediction at t-1
                    * To view the final predictions made by the model, see below
                """)
                gr.Code(open("code/recurse_predict.py", "r").read())
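                # Editor's sketch (assumption; code/recurse_predict.py is not shown
                # here): a recursive forecaster generally loops one step at a time,
                #   for step in range(7):
                #       X_next = build_features(history)             # hypothetical helper
                #       y_hat = model.predict(X_next.tail(1))
                #       history = append_prediction(history, y_hat)  # hypothetical helper
                # so each prediction feeds the lag features of the next step.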
1149
+ with gr.Accordion("View 7 Day Model Forecast", open=False):
1150
+ gr.Plot(plot_final_predictions())
1151
+
1152
+
1153
+ with gr.Tab("Model Evaluation") as model_eval_page:
1154
+ gr.HTML("<h1 style=\"text-align: center;\">Forecast Results</h1>")
1155
+ gr.Markdown("Overall, the model seemed to have performed pretty well. The MAPE is also <10% for both Validation and Training sets.")
1156
+ gr.Markdown("The model did suffer from a low validation R2, but this was difficult to resolve without compromising overall performance of the model.")
1157
+ gr.Markdown("The predictions seem to visually pass most backtests, which can be viewed in the graph below.")
1158
+ with gr.Accordion("Model Prediction Scores", open=False):
1159
+ gr.JSON({"Train R2": r2_train, "Train MAPE": mape_train, "Validation R2": r2_val, "Validation MAPE": mape_val})
1160
+ gr.Image("figures/model_performance.png", show_download_button=False)
1161
+
1162
+
1163
+ with gr.Tab("Feature Importance") as model_eval_page:
1164
+ gr.HTML("<h1 style=\"text-align: center;\">Feature Importance</h1>")
1165
+ gr.Markdown("""
1166
+ * Below you can view the feature importance metrics from the XGBoost model (a short sketch of how they are obtained follows this list)
1167
+ * The weather variables appear to have a significant impact on 311 service call volume
1168
+ * Interestingly, some categories also appear to be more impactful than others
1169
+ """)
1170
+ gr.Plot(plot_model_feature_importance())
1171
+
1172
+
1173
+ with gr.Tab("Future Work & Limitations") as future_limitations_page:
1174
+ gr.Markdown("# Future Work")
1175
+ gr.Markdown("""
1176
+ * **Multi-Variate Time Series Forecasting** rather than imputing values naively
1177
+ * Testing more kinds of models such as LightGBM
1178
+ * Robustly testing parameters of the current model using GridSearchCV (see the sketch below)
1179
+ * Comparing performance of my forecast model to others
1180
+ * More Data! Having more 311 Call data may help find other indicators
1181
+ """)
1182
+ gr.Markdown("# Future Deployments")
1183
+ gr.Markdown("""
1184
+ * Containerize the model and load onto an API for ingestion
1185
+ * Containerize data preprocessing and load into a Spark Cluster
1186
+ * Create triggers and view tables to verify data preprocessing
1187
+ * Create functions to monitor model performance
1188
+ """)
1189
+
1190
+ with gr.Tab("Appendix") as future_limitations_page:
1191
+
1192
+ with gr.Tab("Weather Data Analysis") as dp_weather:
1193
+ dp_weather_state = gr.State("weather")
1194
+ with gr.Column():
1195
+ with gr.Row():
1196
+ dp_weather_category = gr.Dropdown(
1197
+ choices=["2011-2018", "2016-2018"],
1198
+ value="2011-2018",
1199
+ label="Time Range"
1200
+ )
1201
+
1202
+ dp_weather_var = gr.Dropdown(
1203
+ choices = ["MeanTemp", "MinTemp", "MaxTemp", "DewPoint", "Percipitation", "WindSpeed", "MaxSustainedWind", "Gust", "Rain", "SnowDepth", "SnowIce"],
1204
+ value = "MeanTemp",
1205
+ label = "Variable"
1206
+ )
1207
+
1208
+ dp_weather_btn = gr.Button("Run")
1209
+
1210
+ dp_weather_report = gr.HTML(value=iframe_dp_weather)
1211
+
1212
+ dp_weather_btn.click(
1213
+ run_report,
1214
+ [dp_weather_state, dp_weather_var, dp_weather_category],
1215
+ dp_weather_report,
1216
+ )
1217
+
1218
+ with gr.Tab("Service Data Analysis") as dp_service:
1219
+ dp_service_state = gr.State("service")
1220
+ dp_service_category = gr.State("full")
1221
+ with gr.Column():
1222
+ dp_service_var = gr.Dropdown(
1223
+ choices = [
1224
+ "Created Date", "Closed Date", "Agency", "Agency Name",
1225
+ "Complaint Type", "Descriptor", "Location Type", "Landmark",
1226
+ "Facility Type", "Status", "Community Board", "Borough",
1227
+ "Open Data Channel Type", "Park Facility Name", "Park Borough",
1228
+ "Vehicle Type", "Taxi Company Borough", "Taxi Pick Up Location",
1229
+ "Bridge Highway Name", "Bridge Highway Direction", "Road ramp",
1230
+ "Bridge Highway Segment"
1231
+ ],
1232
+ value = "Created Date",
1233
+ label = "Select Variable and Run"
1234
+ )
1235
+ dp_service_btn = gr.Button("Run")
1236
+
1237
+ dp_service_report = gr.HTML(value=iframe_dp_service)
1238
+
1239
+ dp_service_btn.click(
1240
+ run_report,
1241
+ [dp_service_state, dp_service_var, dp_service_category],
1242
+ dp_service_report,
1243
+ )
1244
+
1245
+ def main():
1246
+
1247
+
1248
+ app.launch(share=False)
1249
+ return app
1250
+
1251
+
1252
+ if __name__=="__main__":
1253
+
1254
+
1255
+ main()
code/Agency.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "Agency": {
3
+ "NYPD": "Security",
4
+ "HPD": "Buildings",
5
+ "DOT": "Transportation",
6
+ "DSNY": "Environment & Sanitation",
7
+ "DEP": "Environment & Sanitation",
8
+ "DOB": "Buildings",
9
+ "DOE": "Buildings",
10
+ "DPR": "Parks",
11
+ "DOHMH": "Health",
12
+ "DOF": "Other",
13
+ "DHS": "Security",
14
+ "TLC": "Transportation",
15
+ "HRA": "Other",
16
+ "DCA": "Other",
17
+ "DFTA": "Other",
18
+ "EDC": "Other",
19
+ "DOITT": "Other",
20
+ "DCAS": "Other",
21
+ "NYCEM": "Other",
22
+ "ACS": "Other",
23
+ "3-1-1": "Other",
24
+ "TAX": "Other",
25
+ "DCP": "Other",
26
+ "DORIS": "Other",
27
+ "FDNY": "Other",
28
+ "TAT": "Other",
29
+ "COIB": "Other",
30
+ "CEO": "Other",
31
+ "MOC": "Other",
32
+ "OMB": "Other"
33
+ }
34
+ }
code/Borough.json ADDED
@@ -0,0 +1,12 @@
1
+ {
2
+ "Borough": {
3
+ "BRONX" : "BRONX",
4
+ "BROOKLYN": "BROOKLIN",
5
+ "QUEENS": "QUEENS",
6
+ "STATEN ISLAND": "STATEN ISLAND",
7
+ "2017": "OTHER",
8
+ "2018": "OTHER",
9
+ "undefined": "OTHER",
10
+ "null": "OTHER"
11
+ }
12
+ }
code/all_vars.json ADDED
@@ -0,0 +1,35 @@
1
+ {
2
+ "y": ["Target"],
3
+ "past_covariates": [
4
+ "num_closed_tickets",
5
+ "AG_Buildings", "AG_Environment & Sanitation", "AG_Health",
6
+ "AG_Parks", "AG_Security", "AG_Transportation",
7
+ "AG_Other",
8
+ "Borough_BRONX", "Borough_BROOKLYN", "Borough_MANHATTAN",
9
+ "Borough_QUEENS", "Borough_STATEN ISLAND",
10
+ "Borough_OTHER",
11
+ "DG_damaged_sign_sidewalk_missing",
12
+ "DG_english_emergency_spanish_chinese",
13
+ "DG_exemption_commercial_tax_business",
14
+ "DG_license_complaint_illegal_violation", "DG_noise_animal_truck_dead",
15
+ "DG_odor_food_air_smoke", "DG_order_property_inspection_condition",
16
+ "DG_water_basin_litter_missed"
17
+ ],
18
+ "future_covariates": [
19
+ "DewPoint",
20
+ "WindSpeed",
21
+ "Gust",
22
+ "SnowDepth",
23
+ "MinTemp"
24
+ ],
25
+ "temporal": [
26
+ "Year",
27
+ "Month",
28
+ "Day",
29
+ "DayOfWeek",
30
+ "DayOfYear",
31
+ "is_weekend",
32
+ "is_holiday",
33
+ "Season"
34
+ ]
35
+ }
code/build_service.py ADDED
@@ -0,0 +1,167 @@
1
+ import gc
+ from collections import defaultdict
+
+ import pandas as pd
+ import polars as pl
+ from bertopic import BERTopic
+
+ # NOTE: create_datetime is assumed to live in this project's utils module
+ from utils import create_datetime
+
+ def build_service_data(filename):
2
+ # Loading data directly with polars leads to errors
3
+ # Some rows end up missing for an unknown reason
4
+ # FIX: Load in pandas then convert to polars
5
+ service_data_pd = pd.read_csv(filename)
6
+
7
+ # Quick test to assure the unique key is in fact unique
8
+ assert service_data_pd["Unique Key"].nunique() == len(service_data_pd)
9
+
10
+ # Load from pandas Dataframe
11
+ service_data_pd["Incident Zip"] = service_data_pd["Incident Zip"].astype("string")
12
+ service_data_pd["BBL"] = service_data_pd["BBL"].astype("string")
13
+ service_data = pl.DataFrame(service_data_pd)
14
+
15
+ # Clear some ram
16
+ del service_data_pd
17
+ gc.collect()
18
+
19
+ drop_cols = [
20
+ "Unique Key", "Agency Name", "Location Type", "Incident Zip",
21
+ "Incident Address", "Street Name", "Cross Street 1",
22
+ "Cross Street 2", "Intersection Street 1", "Intersection Street 2",
23
+ "Address Type", "City", "Landmark", "Facility Type",
24
+ "Status", "Due Date", "Resolution Description",
25
+ "Resolution Action Updated Date", "Community Board",
26
+ "BBL", "X Coordinate (State Plane)", "Y Coordinate (State Plane)",
27
+ "Open Data Channel Type", "Park Facility Name", "Park Borough",
28
+ "Vehicle Type", "Taxi Company Borough", "Taxi Pick Up Location",
29
+ "Bridge Highway Name", "Bridge Highway Direction", "Road Ramp",
30
+ "Bridge Highway Segment", "Location", "Created Year"
31
+ ]
32
+
33
+ # Drop columns and create the date variable
34
+ service_data = service_data.drop(drop_cols)
35
+ service_data = create_datetime(service_data, "Created Date")
36
+ service_data = create_datetime(service_data, "Closed Date")
37
+
38
+ # Group by date to get the number of Created tickets (as target)
39
+ sd_grouped = service_data.rename({"Created Date": "Datetime"}).group_by("Datetime").agg(
40
+ pl.len().alias("Target"),
41
+ ).sort(by="Datetime")
42
+
43
+ # Calculate the number of closed tickets
44
+ # Mean diff used to filter service data
45
+ # mean_diff = service_data.with_columns(
46
+ # diff_created_closed = pl.col("Closed Date") - pl.col("Created Date")
47
+ # ).filter((pl.col("Closed Date").dt.year() >= 2016) & (pl.col("Closed Date").dt.year() < 2020))["diff_created_closed"].mean().days
48
+ # Mean diff precalculated as
49
+ mean_diff = 13
50
+
51
+ # Create new Closed date with errors filled using the mean diff above
52
+ service_data = service_data.with_columns(
53
+ Closed_Date_New = pl.when(pl.col("Created Date") - pl.col("Closed Date") > pl.duration(days=1))
54
+ .then(pl.col("Created Date") + pl.duration(days=mean_diff))
55
+ .otherwise(pl.col("Closed Date")).fill_null(pl.col("Created Date") + pl.duration(days=mean_diff))
56
+ )
57
+
58
+ # Keep tickets whose created date is on or before the new closed date to prevent future data leakage in our dataset
59
+ # We want to make sure future data is not accidentally leaked across other points in our data
60
+ closed_tickets = service_data.group_by(["Closed_Date_New", "Created Date"]) \
61
+ .agg((pl.when(pl.col("Created Date") <= pl.col("Closed_Date_New")).then(1).otherwise(0)).sum().alias("count")) \
62
+ .sort("Closed_Date_New") \
63
+ .filter((pl.col("Closed_Date_New").dt.year() >= 2016) & (pl.col("Closed_Date_New").dt.year() < 2019)) \
64
+ .group_by("Closed_Date_New").agg(pl.col("count").sum().alias("num_closed_tickets"))
65
+
66
+ # Rename this column to num closed tickets
67
+ ct_df = closed_tickets.with_columns(
68
+ pl.col("num_closed_tickets")
69
+ )
70
+
71
+ # Concat the new columns into our data
72
+ sd_df = pl.concat([sd_grouped, ct_df.drop("Closed_Date_New")], how="horizontal")
73
+
74
+ assert len(sd_grouped) == len(ct_df)
75
+
76
+ # CATEGORICAL FEATURE MAPPING
77
+ # MAPPING FOR BOROUGH
78
+ Borough_Map = {
79
+ "Unspecified": "OTHER",
80
+ "2017": "OTHER",
81
+ None: "OTHER",
82
+ "2016": "OTHER"
83
+ }
84
+ service_data = service_data.with_columns(
85
+ pl.col("Borough").replace(Borough_Map)
86
+ )
87
+
88
+ # MAPPING FOR AGENCY
89
+ # This mapping was done Manually
90
+ Agency_Map = {
91
+ "NYPD": "Security", "HPD": "Buildings", "DOT": "Transportation",
92
+ "DSNY": "Environment & Sanitation", "DEP": "Environment & Sanitation",
93
+ "DOB": "Buildings", "DOE": "Buildings", "DPR": "Parks",
94
+ "DOHMH": "Health", "DOF": "Other", "DHS": "Security",
95
+ "TLC": "Transportation", "HRA": "Other", "DCA": "Other",
96
+ "DFTA": "Other", "EDC": "Other", "DOITT": "Other", "OMB": "Other",
97
+ "DCAS": "Other", "NYCEM": "Other", "ACS": "Other", "3-1-1": "Other",
98
+ "TAX": "Other", "DCP": "Other", "DORIS": "Other", "FDNY": "Other",
99
+ "TAT": "Other", "COIB": "Other", "CEO": "Other", "MOC": "Other",
100
+ }
101
+
102
+ service_data = service_data.with_columns(
103
+ pl.col("Agency").replace(Agency_Map).alias("AG") # AG Shorthand for Agency Groups
104
+ )
105
+
106
+
107
+ # Mapping for Descriptor using BERTopic
108
+ # Store descriptors as pandas dataframe (polars not supported)
109
+ # Drop any nan values, and we only care about the unique values
110
+ descriptor_docs = service_data["Descriptor"].unique().to_numpy()
111
+
112
+ # Build our topic mapping using the pretrained BERTopic model
113
+ # Load model and get predictions
114
+ topic_model = BERTopic.load("models/BERTopic")
115
+ topics, probs = topic_model.transform(descriptor_docs)
116
+
117
+ # Visualize if wanted
118
+ # topic_model.visualize_barchart(list(range(-1,6,1)))
119
+
120
+ # Create a topic to ID map
121
+ topic_df = topic_model.get_topic_info()
122
+ topic_id_map = {row["Topic"]: row["Name"][2:] for _, row in topic_df.iterrows()}
123
+ topic_id_map[-1] = topic_id_map[-1][1:] # Fix for the -1 topic case
124
+
125
+ # For each document (descriptor string) get a mapping of topics
126
+ doc_to_topic_map = defaultdict(str)
127
+ for topic_id, doc in zip(topics, descriptor_docs):
128
+ topic = topic_id_map[topic_id]
129
+ doc_to_topic_map[doc] = topic
130
+
131
+ service_data = service_data.with_columns(
132
+ pl.col("Descriptor").replace(doc_to_topic_map).alias("DG") # DG Shorthand for descriptor Groups
133
+ )
134
+
135
+
136
+ # One Hot Encode Features
137
+ cat_features = ["AG", "Borough", "DG"]
138
+ service_data = service_data.to_dummies(columns=cat_features)
139
+
140
+
141
+ # Group by Date and create our Category Feature Vector
142
+ cat_df = service_data.rename({"Created Date": "Datetime"}).group_by("Datetime").agg(
143
+ # Categorical Features Sum
144
+ pl.col('^AG_.*$').sum(),
145
+ pl.col('^Borough_.*$').sum(),
146
+ pl.col('^DG_.*$').sum(),
147
+ ).sort(by="Datetime")
148
+
149
+ # Concat our category features to our current dataframe
150
+ sd_df = pl.concat([sd_df, cat_df.drop("Datetime")], how="horizontal")
151
+
152
+ # Now that our dataframe is significantly reduced in size
153
+ # We can finally convert back to a pandas dataframe
154
+ # as pandas is usable across more python packages
155
+ sd_df = sd_df.to_pandas()
156
+
157
+ # Set index to datetime
158
+ sd_df = sd_df.set_index("Datetime")
159
+
160
+ # NOTE we added 7 new rows to our weather df
161
+ # These 7 new rows will essentially be our final pred set
162
+ # The Target for these rows will be null -> indicating it needs to be predicted
163
+ # Add these rows to the service dataframe
164
+ preds_df = pd.DataFrame({'Datetime': pd.date_range(start=sd_df.index[-1], periods=8, freq='D')})[1:]
165
+ sd_df = pd.concat([sd_df, preds_df.set_index("Datetime")], axis=0)
166
+
167
+ return sd_df
code/build_weather.py ADDED
@@ -0,0 +1,98 @@
1
+ import numpy as np
+ import pandas as pd
+
+ # NOTE: these helpers are assumed to live in this project's utils module
+ from utils import create_datetime, build_temporal_features, impute_missing_weather
+
+ # Build all weather data from file
2
+ def build_weather_data(filename):
3
+ # Use pandas to read file
4
+ weather_data = pd.read_csv(filename)
5
+
6
+ # Quickly aggregate Year, Month, Day into a datetime object
7
+ # This is because the 311 data uses datetime
8
+ weather_data["Datetime"] = weather_data["Year"].astype("str") + "-" + weather_data["Month"].astype("str") + "-" + weather_data["Day"].astype("str")
9
+ weather_data = create_datetime(weather_data, "Datetime", format="%Y-%m-%d")
10
+
11
+ # LOCALIZE
12
+ # Pre-recorded min/max values from the service data (so we don't need again)
13
+ lat_min = 40.49804421521046
14
+ lat_max = 40.91294056699566
15
+ long_min = -74.25521082506387
16
+ long_max = -73.70038354802529
17
+
18
+ # Create the conditions for location matching
19
+ mincon_lat = weather_data["Latitude"] >= lat_min
20
+ maxcon_lat = weather_data["Latitude"] <= lat_max
21
+ mincon_long = weather_data["Longitude"] >= long_min
22
+ maxcon_long = weather_data["Longitude"] <= long_max
23
+
24
+ # Localize our data to match the service data
25
+ wd_localized = weather_data.loc[mincon_lat & maxcon_lat & mincon_long & maxcon_long]
26
+ drop_cols = [
27
+ "USAF",
28
+ "WBAN",
29
+ "StationName",
30
+ "State",
31
+ "Latitude",
32
+ "Longitude"
33
+ ]
34
+ wd_localized = wd_localized.drop(columns=drop_cols)
35
+
36
+ # AGGREGATE
37
+ # Map columns with aggregation method
38
+ mean_cols = [
39
+ 'MeanTemp',
40
+ 'DewPoint',
41
+ 'Percipitation',
42
+ 'WindSpeed',
43
+ 'Gust',
44
+ 'SnowDepth',
45
+ ]
46
+ min_cols = [
47
+ 'MinTemp'
48
+ ]
49
+ max_cols = [
50
+ 'MaxTemp',
51
+ 'MaxSustainedWind'
52
+ ]
53
+ round_cols = [
54
+ 'Rain',
55
+ 'SnowIce'
56
+ ]
57
+
58
+ # Perform Aggregation
59
+ mean_df = wd_localized.groupby("Datetime")[mean_cols].mean()
60
+ min_df = wd_localized.groupby("Datetime")[min_cols].min()
61
+ max_df = wd_localized.groupby("Datetime")[max_cols].max()
62
+ round_df = wd_localized.groupby("Datetime")[round_cols].mean().round().astype(np.int8)
63
+ wd_full = pd.concat([mean_df, min_df, max_df, round_df], axis=1)
64
+
65
+ # Add seasonal features
66
+ wd_full = build_temporal_features(wd_full, "Datetime")
67
+ wd_full["Season"] = wd_full["Season"].astype("category")
68
+ wd_full = wd_full.set_index("Datetime")
69
+
70
+ # We will calculate the imputation for the next 7 days after 12/31/2018
71
+ # Along with the 49 missing days
72
+ # This will act as our "Weather Forecast"
73
+ time_steps = 49 + 7
74
+
75
+ # Impute Cols
76
+ impute_cols = [
77
+ 'MeanTemp', 'MinTemp', 'MaxTemp', 'DewPoint',
78
+ 'Percipitation', 'WindSpeed', 'MaxSustainedWind',
79
+ 'Gust', 'Rain', 'SnowDepth', 'SnowIce',
80
+ ]
81
+
82
+ # Mean Vars
83
+ mean_vars = ["WindSpeed", "MaxSustainedWind", "Gust", "SnowDepth"]
84
+ min_vars = ["SnowIce", "MeanTemp", "MinTemp", "MaxTemp", "DewPoint", "Percipitation"]
85
+ max_vars = ["Rain"]
86
+
87
+ # Use the imported function to create the imputed data
88
+ preds_mean = impute_missing_weather(wd_full, strategy="mean", time_steps=time_steps, impute_cols=mean_vars)
89
+ preds_min = impute_missing_weather(wd_full, strategy="min", time_steps=time_steps, impute_cols=min_vars)
90
+ preds_max = impute_missing_weather(wd_full, strategy="max", time_steps=time_steps, impute_cols=max_vars)
91
+ all_preds = pd.concat([preds_mean, preds_min, preds_max], axis=1)
92
+ all_preds = build_temporal_features(all_preds.loc[:, impute_cols], "Datetime")
93
+ all_preds = all_preds.set_index("Datetime")
94
+
95
+ wd_curr = wd_full.loc[wd_full["Year"] >= 2016]
96
+ wd_df = pd.concat([wd_full, all_preds], axis=0, join="outer")
97
+
98
+ return wd_df
code/create_maps.py ADDED
@@ -0,0 +1,177 @@
1
+ import math
2
+ import pandas as pd
3
+ import numpy as np
4
+ import gc
+ import folium
+ import polars as pl
5
+ from utils import map_vals
6
+ from matplotlib import pyplot as plt
7
+
8
+ # NOTE
9
+ # This only needed to be run once to generate the maps
10
+ # Maps are saved in the figures folder and loaded as html
11
+
12
+ service_data_pd = pd.read_csv("data/311-2016-2018.csv")
13
+ service_data_pd["Incident Zip"] = service_data_pd["Incident Zip"].astype("string")
14
+ service_data_pd["BBL"] = service_data_pd["BBL"].astype("string")
15
+ service_data_raw = pl.DataFrame(service_data_pd)
16
+ # service_data_raw = pl.read_csv("data/311-2016-2018.csv", null_values="", infer_schema_length=0)
17
+ # service_data_raw = service_data_raw.with_columns(
18
+ # pl.col("Latitude").cast(pl.Float64),
19
+ # pl.col("Longitude").cast(pl.Float64)
20
+ # )
21
+ # Clear some ram
22
+ del service_data_pd
23
+ gc.collect()
24
+
25
+ weather_data_raw = pd.read_csv("data/weather_NY_2010_2018Nov.csv")
26
+
27
+ def get_map_1():
28
+ fig, weather_map = map_vals(
29
+ weather_data_raw.loc[weather_data_raw["Year"] >= 2016],
30
+ cols=["Latitude", "Longitude"],
31
+ label_cols=["StationName"],
32
+ sample_size=1000,
33
+ color='red',
34
+ radius=3,
35
+ weight=4
36
+ )
37
+ fig, combined_map = map_vals(
38
+ service_data_raw,
39
+ cols=["Latitude", "Longitude"],
40
+ color="blue", submap=weather_map,
41
+ sample_size=1000,
42
+ weight=2,
43
+ radius=1
44
+ )
45
+
46
+ fig.save("figures/map1.html")
47
+
48
+ return fig
49
+
50
+
51
+ def get_map_2():
52
+ fig, service_map = map_vals(
53
+ service_data_raw,
54
+ cols=["Latitude", "Longitude"],
55
+ color="blue",
56
+ weight=2,
57
+ radius=1,
58
+ start_loc=[40.7128, -74.0060],
59
+ sample_size=1000,
60
+ zoom_start=10
61
+ )
62
+ fig, weather_map = map_vals(
63
+ weather_data_raw.loc[weather_data_raw["Year"] >= 2016],
64
+ cols=["Latitude", "Longitude"],
65
+ submap=service_map,
66
+ label_cols=["StationName"],
67
+ color='red',
68
+ radius=5,
69
+ weight=2,
70
+ sample_size=1000,
71
+ )
72
+
73
+ fig.save("figures/map2.html")
74
+
75
+ return fig
76
+
77
+
78
+ def get_bounded_map():
79
+ # Get prerecorded coords for the mins/max to maximize speed here
80
+ # In notebook this is recorded via code
81
+ lat_min = 40.49804421521046
82
+ lat_max = 40.91294056699566
83
+ long_min = -74.25521082506387
84
+ long_max = -73.70038354802529
85
+
86
+ fig = folium.Figure(height=500, width=750)
87
+ service_bounds_map = folium.Map(
88
+ location=[40.7128, -74.0060],
89
+ zoom_start=10,
90
+ tiles='cartodbpositron',
91
+ zoom_control=False,
92
+ scrollWheelZoom=False,
93
+ dragging=False
94
+ )
95
+
96
+ kw = {
97
+ "color": "#F1807E",
98
+ "line_cap": "round",
99
+ "fill": True,
100
+ "fill_color": "blue",
101
+ "weight": 3,
102
+ "popup": "Service Data Coverage Zone",
103
+ }
104
+
105
+ folium.Rectangle(
106
+ bounds=[[lat_min, long_min], [lat_max, long_max]],
107
+ line_join="round",
108
+ dash_array="5 5",
109
+ **kw,
110
+ ).add_to(service_bounds_map)
111
+
112
+ fig.add_child(service_bounds_map)
113
+
114
+ fig.save("figures/bounded_map.html")
115
+
116
+ return fig
117
+
118
+
119
+ def get_final_map():
120
+ lat_min = 40.49804421521046
121
+ lat_max = 40.91294056699566
122
+ long_min = -74.25521082506387
123
+ long_max = -73.70038354802529
124
+
125
+ mincon_lat = weather_data_raw["Latitude"] >= lat_min
126
+ maxcon_lat = weather_data_raw["Latitude"] <= lat_max
127
+ mincon_long = weather_data_raw["Longitude"] >= long_min
128
+ maxcon_long = weather_data_raw["Longitude"] <= long_max
129
+
130
+ service_bounds_map = folium.Map(
131
+ location=[40.7128, -74.0060],
132
+ zoom_start=10,
133
+ tiles='cartodbpositron',
134
+ zoom_control=False,
135
+ scrollWheelZoom=False,
136
+ dragging=False
137
+ )
138
+
139
+ kw = {
140
+ "color": "#F1807E",
141
+ "line_cap": "round",
142
+ "fill": True,
143
+ "fill_color": "blue",
144
+ "weight": 3,
145
+ "popup": "Service Data Coverage Zone",
146
+ }
147
+
148
+ folium.Rectangle(
149
+ bounds=[[lat_min, long_min], [lat_max, long_max]],
150
+ line_join="round",
151
+ dash_array="5 5",
152
+ **kw,
153
+ ).add_to(service_bounds_map)
154
+
155
+ wd_localized = weather_data_raw.loc[mincon_lat & maxcon_lat & mincon_long & maxcon_long]
156
+ fig, wd_local_map = map_vals(
157
+ wd_localized,
158
+ submap=service_bounds_map,
159
+ label_cols=["StationName"],
160
+ color='red',
161
+ radius=5,
162
+ weight=2,
163
+ sample_size=1000,
164
+ )
165
+
166
+ fig.save("figures/final_map.html")
167
+
168
+ return fig
169
+
170
+
171
+ def build_maps():
172
+ get_map_1()
173
+ get_map_2()
174
+ get_bounded_map()
175
+ get_final_map()
176
+
177
+ build_maps()
code/future_features.py ADDED
@@ -0,0 +1,21 @@
1
+ FEATURES["future_covariates_final"] = []
2
+ for col in FEATURES["future_covariates"]:
3
+ new_features = data_preprocess[col].to_frame().copy()
4
+ # Lag Features
5
+ new_features[col+"_L0D"] = new_features[col].shift(0)
6
+ new_features[col+"_L1D"] = new_features[col].shift(1)
7
+ new_features[col+"_L2D"] = new_features[col].shift(2)
8
+
9
+ # Rolling Features (No shift needed for future vars)
10
+ new_features[col+"_RMean14D"] = new_features[col].rolling('14D').mean()
11
+ new_features[col+"_RMin14D"] = new_features[col].rolling('14D').min()
12
+
13
+ # Expanding Window (No shift needed for future vars)
14
+ new_features[col+"_EMean14D"] = new_features[col].expanding(min_periods=14).mean()
15
+ new_features[col+"_EMin14D"] = new_features[col].expanding(min_periods=14).min()
16
+
17
+ FEATURES["future_covariates_final"].extend([col+"_L0D", col+"_L1D", col+"_L2D", col+"_RMean14D", col+"_RMin14D", col+"_EMean14D", col+"_EMin14D"])
18
+ new_features = new_features.drop(columns=col)
19
+ data_preprocess = pd.concat([data_preprocess, new_features], axis=1)
20
+
21
+ assert len(data_preprocess.loc[:, FEATURES["future_covariates_final"]].columns) == len(FEATURES["future_covariates"])*7
code/past_features.py ADDED
@@ -0,0 +1,21 @@
1
+ FEATURES["past_covariates_final"] = []
2
+ for col in FEATURES["past_covariates"]:
3
+ new_features = data_preprocess[col].to_frame().copy()
4
+ # Lag Features
5
+ new_features[col+"_L7D"] = new_features[col].shift(7)
6
+ new_features[col+"_L14D"] = new_features[col].shift(14)
7
+ new_features[col+"_L21D"] = new_features[col].shift(21)
8
+
9
+ # Rolling Features
10
+ # Shift to move the new features into the prediction space (2019-01-01 to 2019-01-07)
11
+ new_features[col+"_RMean14D"] = new_features[col].shift(7).rolling('14D').mean()
12
+
13
+ # Differencing Features
14
+ # Shift to move the new features into the prediction space (2019-01-01 to 2019-01-07)
15
+ new_features[col+"_Diff7D"] = (new_features[col].shift(7) - new_features[col].shift(7).shift(7))
16
+
17
+ FEATURES["past_covariates_final"].extend([col+"_L7D", col+"_L14D", col+"_L21D", col+"_RMean14D", col+"_Diff7D"])
18
+ new_features = new_features.drop(columns=col)
19
+ data_preprocess = pd.concat([data_preprocess, new_features], axis=1)
20
+
21
+ assert len(data_preprocess.loc[:, FEATURES["past_covariates_final"]].columns) == len(FEATURES["past_covariates"])*5
code/recurse_predict.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ def predict_recurse(dataset, test, model, features_to_impute=['Target_L1D', 'Target_Diff7D', 'Target_Diff14D'], last_feature='Target_L6D'):
2
+ n_steps = len(test)
3
+ merged_data = pd.concat([dataset[-14:], test], axis=0)
4
+ all_index = merged_data.index
5
+ X_test = test.drop(columns="Target")
6
+ sd = -6 # Starting point for filling next value
7
+
8
+ # For each step, get the predictions
9
+ for i in range(n_steps-1):
10
+ pred = model.predict(X_test)[i]
11
+ # For the three features needed, compute the new value
12
+ X_test.loc[all_index[sd+i], features_to_impute[0]] = pred
13
+ X_test.loc[all_index[sd+i], features_to_impute[1]] = pred - merged_data.loc[all_index[sd+i-7], features_to_impute[1]]
14
+ X_test.loc[all_index[sd+i], features_to_impute[2]] = pred - merged_data.loc[all_index[sd+i-14], features_to_impute[2]]
15
+
16
+ # In the last iteration compute the Lag6D value
17
+ if i == 5:
18
+ X_test.loc[all_index[sd+i], last_feature] = pred - merged_data.loc[all_index[sd+i-6], last_feature]
19
+
20
+
21
+ final_preds = model.predict(X_test)
22
+ return final_preds
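+
+ # Hypothetical usage, mirroring the app's variable names (an assumption):
+ # final_preds = predict_recurse(train_features_df, test_features_df, final_model)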
code/target_features.py ADDED
@@ -0,0 +1,27 @@
1
+ FEATURES["y_features"] = []
2
+ col = FEATURES["y"][0]
3
+ new_features = data_preprocess[col].to_frame().copy()
4
+
5
+ # Lag Features
6
+ new_features[col+"_L1D"] = new_features[col].shift(1)
7
+ new_features[col+"_L6D"] = new_features[col].shift(6)
8
+ new_features[col+"_L7D"] = new_features[col].shift(7)
9
+ new_features[col+"_L8D"] = new_features[col].shift(8)
10
+ new_features[col+"_L14D"] = new_features[col].shift(14)
11
+
12
+ # Rolling Features
13
+ # After computing shift by 1 to indicate its computed based off a 1 day lag
14
+ new_features[col+"_RMean14D"] = new_features[col].shift(1).rolling(window='14D').mean()
15
+ # The last 6 days, I need the prediction from time t-1
16
+ # For now set to nan
17
+ new_features[col+"_RMean14D"][-6:] = np.nan
18
+
19
+ # Differencing features
20
+ new_features[col+"_Diff7D"] = (new_features[col].shift(1) - new_features[col].shift(1).shift(7))
21
+ new_features[col+"_Diff14D"] = (new_features[col].shift(1) - new_features[col].shift(1).shift(14))
22
+
23
+ new_features = new_features.drop(columns=col)
24
+ FEATURES["y_features"].extend([col+"_L1D", col+"_L6D", col+"_L7D", col+"_L8D", col+"_L14D", col+"_RMean14D", col+"_Diff7D", col+"_Diff14D"])
25
+ data_preprocess = pd.concat([data_preprocess, new_features], axis=1)
26
+
27
+ assert len(data_preprocess.loc[:, FEATURES["y_features"]].columns) == len(FEATURES["y"])*8
custom.css ADDED
@@ -0,0 +1,43 @@
1
+ .gr-describe-tb {
2
+ overflow: hidden !important;
3
+ }
4
+ .row.spacing {
5
+ border: 0px;
6
+ }
7
+ .plot-container {
8
+ width: 100vw
9
+ }
10
+ .map * {
11
+ text-align: -webkit-center;
12
+ }
13
+
14
+ .map-legend * {
15
+ width: fit-content;
16
+ max-width: 215px;
17
+ padding: 5px;
18
+ background: var(--border-color-primary);
19
+ margin-top: -50px
20
+ }
21
+
22
+ .map-legend-text * {
23
+ width: fit-content;
24
+ padding: 0px;
25
+ margin-bottom: 0px;
26
+ font-size: 16px;
27
+ margin-top: 0px;
28
+ }
29
+
30
+
31
+ .prose {
32
+ /* font-size: 16px; */
33
+ }
34
+
35
+ .no-padding * {
36
+ padding: 0px;
37
+ margin: 0px;
38
+ }
39
+
40
+ .low-padding * {
41
+ padding: 2px;
42
+ margin: 0px;
43
+ }
data/data_final.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/data_merged_full.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/docs.csv ADDED
@@ -0,0 +1,1315 @@
1
+ ,docs
2
+ 0,Request Large Bulky Item Collection
3
+ 1,Sewage Odor
4
+ 2,Sidewalk Violation
5
+ 3,Blocked Hydrant
6
+ 4,For One Address
7
+ 5,Blocked Sidewalk
8
+ 6,Commercial Overnight Parking
9
+ 7,Noise: Construction Before/After Hours (NM1)
10
+ 8,Posted Parking Sign Violation
11
+ 9,Congestion/Gridlock
12
+ 10,Ped Multiple Lamps
13
+ 11,Building Shaking/Vibrating/Structural Stability
14
+ 12,Egress - Doors Locked/Blocked/Improper/No Secondary Means
15
+ 13,Working Contrary To Stop Work Order
16
+ 14,E15 Illegal Postering
17
+ 15,E3 Dirty Sidewalk
18
+ 16,Pothole - Highway
19
+ 17,Pothole
20
+ 18,No Access
21
+ 19,Lamppost Base Door/Cover Missing
22
+ 20,Branch Cracked and Will Fall
23
+ 21,Application Renewal
24
+ 22,Illegal Conversion Of Residential Building/Space
25
+ 23,Zoning - Non-Conforming/Illegal Vehicle Storage
26
+ 24,Excessive Water In Basement (WEFB)
27
+ 25,Mouse Sighting
28
+ 26,Administration
29
+ 27,No Receipt
30
+ 28,Site Conditions Endangering Workers
31
+ 29,15I Street Condition Ice-Non Storm
32
+ 30,Food Contaminated
33
+ 31,1 Missed Collection
34
+ 32,E3A Dirty Area/Alleyway
35
+ 33,Street Light Out
36
+ 34,Controller
37
+ 35,Electrical Wiring Defective/Exposed
38
+ 36,With License Plate
39
+ 37,Driver Complaint
40
+ 38,Engine Idling
41
+ 39,Loud Music/Party
42
+ 40,Banging/Pounding
43
+ 41,15S Re-Plow/Spread/Ice-Snow Cond.
44
+ 42,Loud Talking
45
+ 43,Noise: air condition/ventilation equipment (NV1)
46
+ 44,Other (complaint details)
47
+ 45,"Air: Odor/Fumes, Vehicle Idling (AD3)"
48
+ 46,Lead Kit Request (Residential) (L10)
49
+ 47,LOW WATER PRESSURE - WLWP
50
+ 48,Use Indoor
51
+ 49,Cave-in
52
+ 50,Noise: Construction Equipment (NC1)
53
+ 51,15 Street Cond/Dump-Out/Drop-Off
54
+ 52,Sewer Backup (Use Comments) (SA)
55
+ 53,Trees and Sidewalks Program
56
+ 54,Hitting Building
57
+ 55,Water Meter Broken/Leaking - Other (CMO)
58
+ 56,Water Meter Broken/Leaking - Private Residence (CMR)
59
+ 57,Exchange/Refund/Return
60
+ 58,Car/Truck Horn
61
+ 59,St Name - Attached to Pole
62
+ 60,Graffiti
63
+ 61,Dirty Water (WE)
64
+ 62,Hydrant Defective (WC2)
65
+ 63,Other/Unknown
66
+ 64,Hydrant Running (WC3)
67
+ 65,Cons - Contrary/Beyond Approved Plans/Permits
68
+ 66,Sidewalk Shed/Pipe Scafford - Inadequate Defective/None
69
+ 67,Leak (Use Comments) (WA2)
70
+ 68,Possible Water Main Break (Use Comments) (WA1)
71
+ 69,Curb Cut/Driveway/Carport - Illegal
72
+ 70,Plate Condition - Shifted
73
+ 71,1R Missed Recycling-All Materials
74
+ 72,Planted Less Than 2 Years Ago
75
+ 73,Fence - None/Inadequate
76
+ 74,Recycling Electronics
77
+ 75,LED Lense
78
+ 76,Other
79
+ 77,Parking Permit Improper Use
80
+ 78,Other Housing Options
81
+ 79,No Certificate Of Occupancy/Illegal/Contrary To CO
82
+ 80,10 Litter Basket / Request
83
+ 81,Special Events
84
+ 82,E2 Receptacle Violation
85
+ 83,E5 Loose Rubbish
86
+ 84,Boiler - Defective/Inoperative/No Permit
87
+ 85,SRO - Illegal Work/No Permit/Change In Occupancy/Use
88
+ 86,Flood Light Lamp Out
89
+ 87,Entire Tree Has Fallen Down
90
+ 88,Coin or Card Did Not Register
91
+ 89,Tree Leaning/Uprooted
92
+ 90,Line/Marking - Faded
93
+ 91,Plumbing
94
+ 92,"No Parking, Standing, Stopping"
95
+ 93,Branch or Limb Has Fallen Down
96
+ 94,Food Worker Illness
97
+ 95,Use Outside
98
+ 96,Pedestrian Signal
99
+ 97,Plumbing-Defective/Leaking/Not Maintained
100
+ 98,Inadequate or No Heat
101
+ 99,Out of Order
102
+ 100,New Bus Stop Shelter Placement
103
+ 101,Food Spoiled
104
+ 102,Failed Street Repair
105
+ 103,Unclean Condition
106
+ 104,Price Not Posted
107
+ 105,Painted Line/Marking
108
+ 106,Rodents/Insects/Garbage
109
+ 107,Illegal Hotel Rooms In Residential Building
110
+ 108,Air: Other Air Problem (Use Comments) (AZZ)
111
+ 109,"Noise, Barking Dog (NR5)"
112
+ 110,Planted More Than 2 Years Ago
113
+ 111,Catch Basin Clogged/Flooding (Use Comments) (SC)
114
+ 112,Manhole Sunken/Damaged/Raised (SB1)
115
+ 113,Wear & Tear
116
+ 114,E9 Snow / Icy Sidewalk
117
+ 115,No Water (WNW)
118
+ 116,Driver Report
119
+ 117,Double Parked Blocking Vehicle
120
+ 118,"Rough, Pitted or Cracked Roads"
121
+ 119,1RG Missed Recycling Paper
122
+ 120,Traffic Signal Light
123
+ 121,Glassware Broken
124
+ 122,Insurance Information Requested
125
+ 123,Demolition - Unsafe
126
+ 124,Photocell (PEC) Missing
127
+ 125,"Air: Smoke, Chimney or vent (AS1)"
128
+ 126,Assisted Living
129
+ 127,1RO Missed Recycling Organics
130
+ 128,Broken Sidewalk
131
+ 129,Partial Access
132
+ 130,Blocked Bike Lane
133
+ 131,Tattoo Artist Unlicensed
134
+ 132,1 or 2
135
+ 133,E14 ASP/Restricted Parking
136
+ 134,Wood Pole Missing
137
+ 135,Derelict Vehicles
138
+ 136,Veh Signal Head
139
+ 137,Loud Television
140
+ 138,1RB Missed Recycling - M/G/Pl
141
+ 139,LED Pedestrian Unit
142
+ 140,Ped Flasher
143
+ 141,Rat Sighting
144
+ 142,14B Derelict Bicycle
145
+ 143,Street Flooding (SJ)
146
+ 144,Vehicle Signal
147
+ 145,Hydrant Leaking (WC1)
148
+ 146,Hydrant Running Full (WA4)
149
+ 147,Double Parked Blocking Traffic
150
+ 148,2 Bulk-Missed Collection
151
+ 149,Overnight Commercial Storage
152
+ 150,Trunk Damaged
153
+ 151,2R Bulk-Missed Recy Collection
154
+ 152,E1 Improper Disposal
155
+ 153,E8 Canine Violation
156
+ 154,E11 Litter Surveillance
157
+ 155,Snow/Ice
158
+ 156,12 Dead Animals
159
+ 157,E12 Illegal Dumping Surveillance
160
+ 158,Pigeon Waste
161
+ 159,Neglected
162
+ 160,Timer Defect - Fast/Fail
163
+ 161,Blocked - Construction
164
+ 162,Plumbing Work - Illegal/No Permit/Standpipe/Sprinkler
165
+ 163,Glassware Missing
166
+ 164,Lamppost Damaged
167
+ 165,Ventilation
168
+ 166,Street Light Cycling
169
+ 167,Veh Signal Lamp
170
+ 168,Structure - Indoors
171
+ 169,Other School Condition
172
+ 170,Chemical Vapors/Gases/Odors
173
+ 171,Property Refunds and Credits
174
+ 172,Car/Truck Music
175
+ 173,"Air: Odor/Fumes, Restaurant (AD2)"
176
+ 174,Chemical Odor (HD1)
177
+ 175,Manhole Cover Broken/Making Noise (SB)
178
+ 176,Cloudy Or Milky Water (QB1)
179
+ 177,Failure To Maintain
180
+ 178,Litter
181
+ 179,Defective Hardware
182
+ 180,Street Light Lamp Dim
183
+ 181,Gas Hook-Up/Piping - Illegal Or Defective
184
+ 182,Defacement
185
+ 183,Plumbing Problem
186
+ 184,E10 Street Obstruction
187
+ 185,Dead Animal
188
+ 186,Noise: Alarms (NR3)
189
+ 187,E3B Sidewalk Obstruction
190
+ 188,Detached Trailer
191
+ 189,Non-Delivery Goods/Services
192
+ 190,Interest Dispute
193
+ 191,Tree Alive - in Poor Condition
194
+ 192,Condition Attracting Rodents
195
+ 193,Aided/Injury
196
+ 194,One Way
197
+ 195,Property - Other Billing Issue
198
+ 196,Pesticide
199
+ 197,Heating Problem
200
+ 198,Trespassing
201
+ 199,Fixture/Luminaire Out Of Position
202
+ 200,Lamppost Base Door/Cover Open
203
+ 201,Unlicensed
204
+ 202,Broken Curb
205
+ 203,Illegal Tow
206
+ 204,Failure To Retain Water/Improper Drainage- (LL103/89)
207
+ 205,Structural Stability Impacted - New Building Under Construction
208
+ 206,Hitting Power/Phone Lines
209
+ 207,Lamppost Knocked Down
210
+ 208,Vehicle Complaint
211
+ 209,Lamppost Wire Exposed
212
+ 210,Dishwashing/Utensils
213
+ 211,Other (Explain Below)
214
+ 212,Blocking Street
215
+ 213,Canopy Complaint
216
+ 214,Manhole Overflow (Use Comments) (SA1)
217
+ 215,Other Water Problem (Use Comments) (WZZ)
218
+ 216,15R Street Cond/Ref.W Door
219
+ 217,C1 Request Xmas Trees Collection
220
+ 218,Rent Discrepancy
221
+ 219,Food Contains Foreign Object
222
+ 220,Unauthorized Bus Layover
223
+ 221,Veh Signal Sec Door
224
+ 222,Post
225
+ 223,Fixture/Luminaire Door Open
226
+ 224,Chronic Speeding
227
+ 225,Truck Route Violation
228
+ 226,Fixture/Luminaire Hanging
229
+ 227,Suspended (Hanging) Scaffolds - No Pmt/Lic/Dangerous/Accident
230
+ 228,Street Cleaning - ASP
231
+ 229,Illegal. Commercial Use In Resident Zone
232
+ 230,"Building - Vacant, Open And Unguarded"
233
+ 231,Bare Hands in Contact w/ Food
234
+ 232,Adult Establishment
235
+ 233,Sign/Awning/Marquee - Illegal/No Permit
236
+ 234,Privately Owned Public Space/Non-Compliance
237
+ 235,Wall/Retaining Wall - Bulging/Cracked
238
+ 236,Property Value Dispute
239
+ 237,Stop
240
+ 238,Nursing Home
241
+ 239,Electronics/Phones
242
+ 240,False Advertising
243
+ 241,Flashing Hazard
244
+ 242,Unsafe Worksite
245
+ 243,Labor violation
246
+ 244,Public Complaint - Comm Location
247
+ 245,"Unsafe Chemical, Abandoned (HC2)"
248
+ 246,Cable
249
+ 247,Chained
250
+ 248,Tortured
251
+ 249,"Oil Spill On Street, Large (HQL)"
252
+ 250,Noise: Private Carting Noise (NQ1)
253
+ 251,22 Weeds
254
+ 252,Dust from Construction
255
+ 253,Multiple Street Lights Out
256
+ 254,Smoking Ban - Smoking on Construction Site
257
+ 255,After Hours - Licensed Est
258
+ 256,Lamppost Missing
259
+ 257,Pet/Animal
260
+ 258,Toxic Chemical/Material
261
+ 259,Tree Trunk Split
262
+ 260,Metal Protruding - Sign Stump
263
+ 261,Vent/Exhaust - Illegal/Improper
264
+ 262,Sprinkler System - Inadequate
265
+ 263,No Shelter
266
+ 264,Bicycle Chained to Tree
267
+ 265,Bus Stop
268
+ 266,In Car
269
+ 267,Sidewalk Grating - Defective
270
+ 268,General Maintenance
271
+ 269,Rooster
272
+ 270,Damaged/Defective Goods
273
+ 271,Overcharge
274
+ 272,E2A Storage Of Receptacles
275
+ 273,Food Worker Hygiene
276
+ 274,Base Door
277
+ 275,Hydrant Knocked Over/Missing (WC)
278
+ 276,News Gathering
279
+ 277,Sewage Leak
280
+ 278,Dog
281
+ 279,Chronic Stoplight Violation
282
+ 280,Asbestos
283
+ 281,Copy of Approval Order
284
+ 282,Fixture/Luminaire Damaged
285
+ 283,Billing Dispute
286
+ 284,Personal SCHE Exemption
287
+ 285,Safety Netting/Guard Rails - Damaged/Inadequate/None (6 Stories/75 Feet Or Less)
288
+ 286,Excavation Undermining Adjacent Building
289
+ 287,Plate Condition - Anti-Skid
290
+ 288,Plate Condition - Noisy
291
+ 289,Plate Condition - Open
292
+ 290,Car Service Company Complaint
293
+ 291,Damaged Vehicle
294
+ 292,Demand for Cash
295
+ 293,Manhole Cover Missing (Emergency) (SA3)
296
+ 294,Support Bracket
297
+ 295,"Cloudy Or Milky, Other (Use Comments) (QBZ)"
298
+ 296,Affecting Sewer or Foundation
299
+ 297,Signs of Rodents
300
+ 298,Miscellaneous
301
+ 299,Illegal Use Of Hose - Other (CCO)
302
+ 300,Odor In Sewer/Catch Basin (ICB)
303
+ 301,Street Light Dayburning
304
+ 302,Veh Sgnl Mult Lamps
305
+ 303,Odor
306
+ 304,Maintenance Cover
307
+ 305,Dumpster - Construction Waste
308
+ 306,Contract Dispute
309
+ 307,Real Property Tax Assessment/Correction
310
+ 308,E6 Commercial Waste Disposal
311
+ 309,Obstructing Public Use
312
+ 310,Temporary
313
+ 311,Veh Signal Visor
314
+ 312,Plumbing Work - Unlicensed/Illegal/Improper Work In Progress
315
+ 313,Failure to Comply with Vacate Order
316
+ 314,Street Light Feed
317
+ 315,Unleashed Dog in Public
318
+ 316,ID Requirement Not Posted
319
+ 317,Safety Netting/Guard Rails - Damaged/Inadequate/None (Over 6 Stories/75 Feet)
320
+ 318,Sidewalk Staircase
321
+ 319,Debris - Falling Or In Danger Of Falling
322
+ 320,Smoking Violation
323
+ 321,Guard Rail - Street
324
+ 322,Illegal Conversion Of Commercial Bldg/Space To Other Uses
325
+ 323,Lights From Parking Lot Shining On Building
326
+ 324,"Air: Dust, Construction/Demolition (AE4)"
327
+ 325,Asbestos Complaint (B1)
328
+ 326,Car Service Company Report
329
+ 327,Enclosure Cap
330
+ 328,Foreign Attachment On Lamppost
331
+ 329,unknown odor/taste in drinking water (QA6)
332
+ 330,3A Sweeping/Missed
333
+ 331,Dead Branches in Tree
334
+ 332,Sidewalk Collapsed
335
+ 333,Underground
336
+ 334,Over Capacity
337
+ 335,Noise: Jack Hammering (NC2)
338
+ 336,Catch Basin Sunken/Damaged/Raised (SC1)
339
+ 337,21 Collection Truck Noise
340
+ 338,Curb Defect-Metal Protruding
341
+ 339,Too Few on Duty
342
+ 340,Update Tenant Information
343
+ 341,Defective Street Cut (WZZ1)
344
+ 342,Snow or Ice
345
+ 343,Boiler - Fumes/Smoke/Carbon Monoxide
346
+ 344,Damaged/Defective Parts
347
+ 345,Illness Caused by Drinking Water
348
+ 346,Structure - Outdoors
349
+ 347,"Taste/Odor, Chlorine (QA1)"
350
+ 348,Turn Signal
351
+ 349,E1A Litter Basket / Improper Use
352
+ 350,Contact Sign Not Posted
353
+ 351,Graffiti - Bridge
354
+ 352,8 Request to Clean Vacant Lot
355
+ 353,Personal Other Exemption
356
+ 354,Letter Grading
357
+ 355,Food Temperature
358
+ 356,Pedestrian Ramp Defective
359
+ 357,Food Protection
360
+ 358,School Crossing
361
+ 359,Cars Parked on Sidewalk/Street
362
+ 360,Mast Arm
363
+ 361,TAC Report
364
+ 362,ER2 Resident Recyc. (Tenant)
365
+ 363,E13 Throw-Out
366
+ 364,In Prohibited Area
367
+ 365,In Public
368
+ 366,Ped Lamp
369
+ 367,Credit Card Limitations Not Posted
370
+ 368,ELECTRIC/GAS RANGE
371
+ 369,APARTMENT ONLY
372
+ 370,WINDOW GUARD BROKEN/MISSING
373
+ 371,RADIATOR
374
+ 372,STEAM PIPE/RISER
375
+ 373,TOILET
376
+ 374,DOOR
377
+ 375,BASIN/SINK
378
+ 376,FLOOR
379
+ 377,REFRIGERATOR
380
+ 378,Other Animal
381
+ 379,Line/Marking - After Repaving
382
+ 380,Push Button
383
+ 381,DOOR FRAME
384
+ 382,WINDOW FRAME
385
+ 383,WINDOW PANE
386
+ 384,LIGHTING
387
+ 385,NO LIGHTING
388
+ 386,OUTLET/SWITCH
389
+ 387,WIRING
390
+ 388,POWER OUTAGE
391
+ 389,MAINTENANCE
392
+ 390,BELL/BUZZER/INTERCOM
393
+ 391,STAIRS
394
+ 392,CABINET
395
+ 393,COOKING GAS
396
+ 394,JANITOR/SUPER
397
+ 395,MAILBOX
398
+ 396,ENTIRE BUILDING
399
+ 397,Wood Pole Wires Exposed
400
+ 398,Vehicle
401
+ 399,Damaged Telephone
402
+ 400,Open Excavation (WZZ2)
403
+ 401,Sewer Odor (SA2)
404
+ 402,CEILING
405
+ 403,ROOFING
406
+ 404,WALL
407
+ 405,WINDOW/FRAME
408
+ 406,BATHTUB/SHOWER
409
+ 407,Rodents/Mice
410
+ 408,WATER SUPPLY
411
+ 409,CARBON MONOXIDE DETECTOR
412
+ 410,SMOKE DETECTOR
413
+ 411,GARBAGE/RECYCLING STORAGE
414
+ 412,FIRE ESCAPE
415
+ 413,MOLD
416
+ 414,Fire Alarm Lamp Out
417
+ 415,PESTS
418
+ 416,HEAVY FLOW
419
+ 417,SEP - Professional Certification Compliance Audit
420
+ 418,DAMP SPOT
421
+ 419,SLOW LEAK
422
+ 420,St Name - Over Intersection
423
+ 421,BOILER
424
+ 422,Highway Fence
425
+ 423,Bag/Wallet
426
+ 424,Installation/Work Quality
427
+ 425,Veh Signal Lens
428
+ 426,Noise: lawn care equipment (NCL)
429
+ 427,SEWAGE
430
+ 428,PAVEMENT
431
+ 429,DOOR/FRAME
432
+ 430,ROOF DOOR/HATCH
433
+ 431,Lamppost Base Door/Cover Damaged
434
+ 432,Facility Maintenance
435
+ 433,Permit/License/Certificate
436
+ 434,Rodent Sighting
437
+ 435,Allergy Information
438
+ 436,Receipt Incomplete/Not Given
439
+ 437,Street Light Lamp Missing
440
+ 438,Illegal Conversion Of Manufacturing/Industrial Space
441
+ 439,Facility Construction
442
+ 440,VENTILATION SYSTEM
443
+ 441,Clothing Damage
444
+ 442,3B Sweeping/Inadequate
445
+ 443,Unauthorized Tree Pruning
446
+ 444,Concrete In Catch Basin (IEA)
447
+ 445,"Taste/Odor, Chemical (QA2)"
448
+ 446,Flood Light Lamp Cycling
449
+ 447,"\E4 18\""\"" Law\"""""
450
+ 448,Fixture/Luminaire Missing
451
+ 449,3 or More
452
+ 450,Toilet Facility
453
+ 451,GUTTER/LEADER
454
+ 452,Hydrant Locking Device Request (Use Comments) (WC5)
455
+ 453,Water Meter Stolen/Missing - Private Residence (CLR)
456
+ 454,New Con Ed Service Request
457
+ 455,Graffiti or Vandalism
458
+ 456,Illegal Use Of A Hydrant (CIN)
459
+ 457,Cigarette Sale to Minor
460
+ 458,SIGNAGE MISSING
461
+ 459,Fire Globe Missing
462
+ 460,Locker Break-in/Incident
463
+ 461,APS
464
+ 462,Roots Damaged
465
+ 463,ER1 Resident Recyc. (Owner/Manager
466
+ 464,Do Not Enter
467
+ 465,Branches Damaged
468
+ 466,Junction Box
469
+ 467,Food Preparation Location
470
+ 468,Underage - Licensed Est
471
+ 469,Sidewalk Blocked
472
+ 470,Human Capital
473
+ 471,Police Report Requested
474
+ 472,Car Not Available
475
+ 473,Warning Buzzer
476
+ 474,"Education Support, Policy, and Practice"
477
+ 475,Lamppost Leaning
478
+ 476,WiFi/Internet Not Working/Slow
479
+ 477,"Air: Smoke, Vehicular (AA4)"
480
+ 478,Credit Card Stuck in Meter
481
+ 479,PORCH/BALCONY
482
+ 480,Kitchen/Food Prep Area
483
+ 481,RAIN GARDEN DEBRIS (SRGDBR)
484
+ 482,Defective/Missing Curb Piece (SC4)
485
+ 483,Food Worker Activity
486
+ 484,Wastewater Into Catch Basin (IEB)
487
+ 485,SPRINKLER
488
+ 486,Plants- Odor Related Problems (PO1)
489
+ 487,SKYLIGHT
490
+ 488,Yield
491
+ 489,Beach/Pool Water
492
+ 490,Dogs or Cats Sold
493
+ 491,Garbage or Litter
494
+ 492,Plate Missing/Moved-Exposing Hole (WF4)
495
+ 493,Plants- Noise Related Problems (PN1)
496
+ 494,NYPD
497
+ 495,On Messenger
498
+ 496,Dirty/Graffiti
499
+ 497,Playing in Unsuitable Place
500
+ 498,Building
501
+ 499,Injured Wildlife
502
+ 500,6 Overflowing Litter Baskets
503
+ 501,Improper Sale of Items
504
+ 502,Advertising Sign/Billboard/Posters/Flexible Fabric - Illegal
505
+ 503,DOOR TO DUMBWAITER
506
+ 504,Natural Gas In Sewer/Catch Basin (IFB)
507
+ 505,No Permit or License
508
+ 506,Noise: Manufacturing Noise (NK1)
509
+ 507,Broken Glass
510
+ 508,Illegal/Unfair Booting
511
+ 509,Cat
512
+ 510,Speed Limit
513
+ 511,Unauthorized Tree Removal
514
+ 512,Plate Missing/Moved-Exposing Hole (SB4)
515
+ 513,"Air: Odor, Sweet From Unknown Source (AZ1)"
516
+ 514,EEO
517
+ 515,Lighting
518
+ 516,FOIL Request - Request for Records
519
+ 517,Scale Inaccurate/Broken
520
+ 518,"Unsafe Chemical, Storage (HC1)"
521
+ 519,Hours of Operation
522
+ 520,Unsecured Facility
523
+ 521,Safety Equipment/Signs
524
+ 522,Posted Notice Or Order Removed/Tampered With
525
+ 523,"Wasting Faucets,Sinks,Flushometer,Urinal,Etc. - Other (CWO)"
526
+ 524,House/Property Damaged
527
+ 525,Cellar Door Defective
528
+ 526,Multiple St Lts Dayburning
529
+ 527,Crane/Suspension Scaffold - No Permit/License/Cert./Unsafe/Illegal
530
+ 528,Noise: Other Noise Sources (Use Comments) (NZZ)
531
+ 529,Damaged Other
532
+ 530,No Consent Form
533
+ 531,Debt Not Owed
534
+ 532,High Water Pressure (WHP)
535
+ 533,Closed without Notice
536
+ 534,Property
537
+ 535,Real Estate Services
538
+ 536,SEWER
539
+ 537,Hyd Valve Box Cover Missing (WV2)
540
+ 538,Office of Preventive Technical Assistance/OPTA
541
+ 539,Traffic Sign or Signal Blocked
542
+ 540,Dissatisfaction with Provider
543
+ 541,Grass/Weeds
544
+ 542,Catch Basin Grating Missing (SA4)
545
+ 543,Bracket Arm Loose
546
+ 544,Graffiti - Highway
547
+ 545,Initial Application
548
+ 546,Snow Removal Requested
549
+ 547,Non-Compliance w/TTPN 1/00 - Vertical Enlargements
550
+ 548,Landmark Bldg - Illegal Work
551
+ 549,Damaged/Defective
552
+ 550,"Dirt, Debris, Litter Complaint"
553
+ 551,Illegal Tree Removal/Topo. Change in SNAD
554
+ 552,Relocation of Bus Stop Shelter
555
+ 553,Controller Flasher
556
+ 554,Annual Report
557
+ 555,Facilities Management
558
+ 556,Culvert Blocked/Needs Cleaning (SE)
559
+ 557,Sign - In Danger Of Falling
560
+ 558,Loose Plate
561
+ 559,Commercial ICP or ICAP Exemption
562
+ 560,Time Switch
563
+ 561,Fiscal and Business Management
564
+ 562,Crash Cushion Defect
565
+ 563,Glassware Hanging
566
+ 564,"Noise, Other Animals (NR6)"
567
+ 565,Missing/Stump
568
+ 566,ECR Commercial Routing Sticker
569
+ 567,Water Meter Stolen/Missing - Other (CLO)
570
+ 568,Clear Street Light
571
+ 569,Rates Not Posted
572
+ 570,"No Sampling Required, Requested Information (QG2)"
573
+ 571,Damaged Leg or Pole Bent
574
+ 572,Rooftank Leak Or Overflow (CKO)
575
+ 573,"Wasting Faucets,Sinks,Flushometer,Urinal,Etc. - Private Residence (CWR)"
576
+ 574,Smoking
577
+ 575,E30 Transfer Station
578
+ 576,Equipment Not Safe
579
+ 577,Domestic Strays
580
+ 578,Weather Head
581
+ 579,Broken Lock
582
+ 580,Dog Off Leash
583
+ 581,Oil Spill Into Basin/Sewer - Large (IABL)
584
+ 582,Plate Noisy/Sunken/Raised (SB5)
585
+ 583,Personal STAR Exemption
586
+ 584,Handwashing
587
+ 585,"Taste/Odor, Musty/Stale (QA4)"
588
+ 586,Citywide Procurement
589
+ 587,"Taste/Odor, Bitter/Metallic (QA3)"
590
+ 588,Animal Waste
591
+ 589,Parking Card Stuck in Meter
592
+ 590,Door Open with Air Conditioning On
593
+ 591,Inadequate Support Shoring
594
+ 592,Non-Delivery of Papers
595
+ 593,1C Uncollected Xmas Trees
596
+ 594,Personal Exemptions
597
+ 595,Controller Cabinet
598
+ 596,Executive
599
+ 597,Chemical Spill/Release (HA1)
600
+ 598,Tax Commission Rules
601
+ 599,Co-op or Condo Abatement
602
+ 600,Guard Rail - Bridge
603
+ 601,Chemical Spill (IAC)
604
+ 602,Door
605
+ 603,In-Line Fuse Missing
606
+ 604,"Oil Spill On Street, Small (HQS)"
607
+ 605,Drag Racing
608
+ 606,Cellar Door Open/Unprotected
609
+ 607,Bag
610
+ 608,Disclosure Not Provided
611
+ 609,Contrary To LL 58/87(Handicapped Access)
612
+ 610,Lack of Supplies
613
+ 611,Vehicle Report
614
+ 612,Accident - Elevator
615
+ 613,Unauthorized Film Shoot
616
+ 614,Clear Water With Other Particles (Use Comments) (QEZ)
617
+ 615,Catch Basin Search (SC2)
618
+ 616,Sidewalk Grating - Missing
619
+ 617,Dirty/Inadequate Equip./Facility
620
+ 618,Removing Flowers/Plants
621
+ 619,Foundation
622
+ 620,Time Clock Maladjusted
623
+ 621,Unlicensed Day Care
624
+ 622,Graffiti/Litter on Phone
625
+ 623,Sway Bar
626
+ 624,Unlicensed Vendors
627
+ 625,Turtle Under 4 inches Long
628
+ 626,Biking/Rollerblading off Path
629
+ 627,Mandated Reporters
630
+ 628,Waterway-Sewage (IHA)
631
+ 629,Layaway Terms Not Provided
632
+ 630,Grease In Sewer/Catch Basin (IDG)
633
+ 631,ER5 Comm. Recyc. (Bldg Mgmt)
634
+ 632,Police Report Not Requested
635
+ 633,Failure to Post Calorie Information
636
+ 634,FDNY Referral - Pilot
637
+ 635,Wildlife Sighting
638
+ 636,Oil Spill Into Basin/Sewer - Small (IABS)
639
+ 637,Flood Light Lamp Missing
640
+ 638,Noise: Loud Music/Nighttime(Mark Date And Time) (NP1)
641
+ 639,Removing Wildlife
642
+ 640,Detour
643
+ 641,Foreign Attachment On Wood Pole
644
+ 642,Mandatory Tip
645
+ 643,10A Adopt-A-Basket
646
+ 644,Bike Rack Repair
647
+ 645,Honorary
+ 646,Rough Pavement
+ 647,Warning Signal Lamp
+ 648,Equipment Complaint
+ 649,Harassment
+ 650,Other Sewer Problem (Use Comments) (SZZ)
+ 651,"Air: Smoke, Other (Use Comments) (AA5)"
+ 652,Damaged Bench
+ 653,Snow on Overpass
+ 654,Exposure Unnecessary
+ 655,Bracket Arm Broken
+ 656,Door Lock
+ 657,Exit/Route
+ 658,Returns Not Filed
+ 659,Remove Hydrant Locking Device (WC6)
+ 660,Reflector/Louvre
+ 661,Illegal Activity by Phone
+ 662,Fallen Debris from Bridge
+ 663,"Air: Odor, Nail Salon (AD8)"
+ 664,Sign Missing or Defective
+ 665,General Counsel
+ 666,Unauthorized Posting of Signs
+ 667,Wood Pole Knocked Down
+ 668,Conduit
+ 669,"Taste/Odor, Sewer (QA5)"
+ 670,Sign
+ 671,Warning Signal
+ 672,Prohibited Item Sale to Minor
+ 673,Investigative Inspection
+ 674,Hummock
+ 675,Dogs or Cats Not Sold
+ 676,Other Water Problem (Use Comments) (QZZ)
+ 677,Sidewalk Café
+ 678,Newspaper Box Complaint
+ 679,Leaky Roof
+ 680,Concrete Barrier
+ 681,Illness/Injury
+ 682,Pigeon Odor
+ 683,Ticket Scalping
+ 684,Bent/Loose
+ 685,Ped Visor
+ 686,Milk Not Pasteurized
+ 687,Sewage
+ 688,Projects
+ 689,ER6 Comm. Recyc. (Comm. Tenant)
+ 690,Material Storage - Unsafe
+ 691,TAL 2 Wheelchair
+ 692,Noise: Loud Music/Daytime (Mark Date And Time) (NN1)
+ 693,Community Outreach
+ 694,Bracket Arm Missing
+ 695,Relocation of Parking Meter
+ 696,Dry Cleaning Vapors (PERC)
+ 697,Gasoline Spill (IAA)
+ 698,MCI Abatement
+ 699,Required Signage Not Posted
+ 700,Advice Request
+ 701,Beach/Pool/Sauna Unpermitted
+ 702,Illegal Use Of Hose - Private Residence (CCR)
+ 703,Flood Light Lamp Dayburning
+ 704,Control Panel Damaged
+ 705,installation of hydrant side post (WHFP)
+ 706,Non-Disclosure of Fees
+ 707,Flood Light Lamp Dim
+ 708,"Noise, Ice Cream Truck (NR4)"
+ 709,"Air: Odor/Fumes, Dry Cleaners (AD1)"
+ 710,"Clear Water With Organisms (Insects, Worms) (QE2)"
+ 711,"Oil, Grease In Water (QD1)"
+ 712,Nuisance/Truant
+ 713,Air Conditioning Problem
+ 714,Gender Pricing
+ 715,American Flag
+ 716,Water
+ 717,Traffic Camera
+ 718,Highway Flooding (SH)
+ 719,Elevator - Multiple Devices On Property
+ 720,Misleading Appraisal
+ 721,Animal Odor
+ 722,Information Technology
+ 723,Media Inquiries
+ 724,About NYC Opportunity
+ 725,9 Spill/Oil etc
+ 726,Plate Noisy/Sunken/Raised (WF5)
+ 727,Touchscreen/Button Not Working
+ 728,Broken/Defective
+ 729,Street Con Game
+ 730,Fleet
+ 731,Paid in Advance
+ 732,Jewelry
+ 733,Sewer Break (SBR)
+ 734,Broken Water Fountain
+ 735,Sidewalk Pull Box Co
+ 736,Stalled Construction Site
+ 737,Large Number of Mosquitoes
+ 738,Telco Connection Blk
+ 739,In Post Base
+ 740,Personal DHE Exemption
+ 741,Construction
+ 742,Blocking Sidewalk
+ 743,ER4 City Agency (Inst. Recycling)
+ 744,No Dial Tone
+ 745,Lane Control Signal
+ 746,Unrequested Services Provided
+ 747,Tenant Refusal
+ 748,Unsafe Use of Playground
+ 749,MICROWAVE
+ 750,"Dirt, Litter, Debris - Lot"
+ 751,"Noise: Boat(Engine,Music,Etc) (NR10)"
+ 752,Lost/Missing Person
+ 753,Clothing/Glasses
+ 754,High Pressure to Take on Loan/Debt
+ 755,Time Insufficient
+ 756,Blocked - ATM
+ 757,Fire Alarm Lamp Missing
+ 758,Bees/Wasps - Not a beekeper
+ 759,CMU Communication
+ 760,Sodium Warning
+ 761,Facility General
+ 762,Lost Property
+ 763,Capital Construction
+ 764,Business Tax
+ 765,Non-Compliance w/Lightweight Materials
+ 766,Dead End Signal
+ 767,Apply Payment or Credit
+ 768,Payment Not Posted
+ 769,Refund/Credit Info or Status
+ 770,EFT or Online Payment Problem
+ 771,DAMAGE STRUCTURE/RAILING (SRGDM)
+ 772,Request To Open A Hydrant (WC4)
+ 773,Amount Owed Dispute
+ 774,Payment Misapplied
+ 775,Other Agency Charge
+ 776,Bill Received in Error
+ 777,"Smoking Signs - ""No Smoking"" Signs Not Observed on Construction Site"
+ 778,Application Appeal
+ 779,Speed Board Sign
+ 780,OUTLET COVER
+ 781,Material Stored Improperly
+ 782,Other Health Matters
+ 783,Damaged or Missing Ad Box
+ 784,Lack of Safety Equipment
+ 785,Wrong Amount Paid or Withdrawn
+ 786,Missing Complaint Sign
+ 787,Unsanitary Condition
+ 788,RAIN GARDEN FLOODING (SRGFLD)
+ 789,Property Value
+ 790,Commercial Rent Tax- Refund
+ 791,Accident - Cranes/Derricks/Suspension Scaffold
+ 792,Ferret
+ 793,Hangers
+ 794,In Conduit
+ 795,Farm Animal
+ 796,No or Defective Headphones
+ 797,Phone Blocking Sidewalk
+ 798,Equipment Malfunction
+ 799,Beekeeping - Honeybees
+ 800,Damaged Door
+ 801,Guard Rail - Highway
+ 802,Illegal Dumping
+ 803,6R Overflowing Recycling Baskets
+ 804,Ewaste appointment
+ 805,Graffiti/Dirty Condition
+ 806,Application Portability
+ 807,Public Event Seating
+ 808,Inattentive
+ 809,Equipment Maintenance
+ 810,Snake
+ 811,Defective Water Sampling Station (QSS)
+ 812,Fence
+ 813,BBQ Outside Authorized Area
+ 814,Bracket Arm Bent
+ 815,Not Received - Vending Machine
+ 816,E7 Private Carter Spillage
+ 817,Lane Station
+ 818,Injury/Safety
+ 819,Condulet Cover
+ 820,Absent
+ 821,Labor Violation
+ 822,Swimming Pool - Unmaintained
+ 823,Tie Rods
+ 824,Other - Explain Below
+ 825,Elevator - Dangerous Condition/Shaft Open/Unguarded
+ 826,Puddle on Driveway
+ 827,Puddle on Roof
+ 828,Basement
+ 829,Container - Over 5 Gallons
+ 830,Commercial Other Exemption
+ 831,Puddle in Ground
+ 832,Elevator - Single Device On Property/No Alternate Service
+ 833,Lien Sale
+ 834,Sewer or Drain
+ 835,Flooded
+ 836,Box Cover
+ 837,Container - Under 5 Gallons
+ 838,Special Agency Projects/Initiatives
+ 839,Bird Bath
+ 840,Swimming Pool Cover
+ 841,Lighting - Garage
+ 842,Advance Fee
+ 843,Roof Gutters
+ 844,Building Foundation
+ 845,Lost Coin
+ 846,Puddle on Sidewalk
+ 847,Shisha
+ 848,Transducer-Loop
+ 849,Decorative Necklace Lighting
+ 850,Monkey
+ 851,Property Misclassified
+ 852,Flavored Tobacco
+ 853,Taste
+ 854,"Dirt, Litter, Debris - Garage"
+ 855,Fountain - Over 5 Gallons
+ 856,Inaccurate Meter
+ 857,Stop Temporary
+ 858,Language Access Coordinator
+ 859,Ped Lens
+ 860,Tires
+ 861,Damaged Toilet/Sink
+ 862,Broken Fence
+ 863,Seizure of Funds
+ 864,Flower Planters
+ 865,Scale Inaccurate
+ 866,High Grass
+ 867,Minor Received Tattoo
+ 868,Wood Pole Leaning
+ 869,Waterway-Color (IHD)
+ 870,User Unlicensed
+ 871,Supervisory
+ 872,Unauthorized Climbing
+ 873,Complaint
+ 874,Red Lt Camera Feed
+ 875,No Idling
+ 876,RPIE - Filing and Technical Issues
+ 877,General Business Tax - Other
+ 878,Fountain - Under 5 Gallons
+ 879,No Bill of Rights
+ 880,Poison Ivy
+ 881,Spanish Transaction
+ 882,Book/Stationery
+ 883,SCRIE Miscellaneous
+ 884,New Automatic Public Toilet Request
+ 885,Personal Clergy Exemption
+ 886,Color
+ 887,Fire Hydrant Emergency (FHE)
+ 888,Documents/Paperwork Missing
+ 889,City Planning Commission
+ 890,BBS Failure
+ 891,Exposure from Nearby Facility
+ 892,Information on Contracts and Contractors
+ 893,Deck Inspection
+ 894,Gas Utility Referral
+ 895,Sports Equipment
+ 896,Appeals Division
+ 897,Marine Lamp
+ 898,Safety Inspection-Retaining Walls (May 2005)
+ 899,Integrity Complaint Referral
+ 900,Property Misclassification
+ 901,Non-Public Schools
+ 902,On Structure
+ 903,421A Exemption
+ 904,Zoning and Land Use Questions/Information
+ 905,Contamination Risk
+ 906,Energy
+ 907,FENCING
+ 908,Cellar Door New
+ 909,Use of Newly Seeded Lawn
+ 910,Electronic Sign - Overhead
+ 911,ULURP Project Status Questions
+ 912,Pedestrian Sign
+ 913,1RE Recycling Electronics
+ 914,Bikes in Buildings
+ 915,Blank Out Matrix Sgn
+ 916,Broken Window
+ 917,High Pressure Sales
+ 918,Building Permit - None
+ 919,Do Not Block the Box
+ 920,12P Dead Deer
+ 921,Ver Message Sign
+ 922,1RE missed collection for E-waste
+ 923,Illegal Use Of Hose - Private Residence
+ 924,Budget
+ 925,Enforcement Work Order (DOB)
+ 926,Sign Defect - Garage
+ 927,1L Missed Recycling Leaves
+ 928,Debris - Excessive
+ 929,Adjacent Buildings Not Protected
+ 930,After Hours Work - Illegal
+ 931,Accident - Construction/Plumbing
+ 932,Construction - Change Grade/Watercourse
+ 933,Landlord Inquiries
+ 934,Det-Sens Amplifier
+ 935,Wood Pole Damaged
+ 936,Contractor Responsibility/VENDEX
+ 937,Unlicensed/Illegal/Improper Work In Progress
+ 938,Commercial Exemptions
+ 939,General Business Tax - Refund
+ 940,Damaged or Leaking Roof
+ 941,General Bad Condition
+ 942,Detector Sensor
+ 943,Accessibility Accommodations
+ 944,DRY WEATHER DISCHARGE - DWD
+ 945,General Business Tax- Audit
+ 946,Commercial Not For Profit Exemption
+ 947,Restroom Non-Complaince With Local Law 79/16
+ 948,Best - DM Tracking Complaint
+ 949,Best - High-Rise Tracking Complaint
+ 950,SST Tracking Complaint
+ 951,M.A.R.C.H. Program (Interagency)
+ 952,Facade (LL11/98)- Unsafe Notification
+ 953,Inspection Work Order (DOB)
+ 954,Plumbing Enforcement Work Order (DOB)
+ 955,Illegal Conversion No Access Follow - UP
+ 956,Best - Low-Rise Tracking Complaint
+ 957,Construction Enforcement Work Order (DOB)
+ 958,Illegal Activity
+ 959,Excavation Tracking Complaint
+ 960,Sustainability Enforcement Work Order
+ 961,Interior Demo Tracking Complaint
+ 962,Electrical Enforcement Work Order (DOB)
+ 963,Sandy: Building Destroyed
+ 964,Amusement Ride Accident/Incident
+ 965,Complaince Inspection
+ 966,Demolition Notification Received
+ 967,V.E.S.T. Program (DOB & NYPD)
+ 968,Personal Veteran Exemption
+ 969,Depression Maintenance
+ 970,Driver Complaint - Passenger
+ 971,Elevator - Defective/Not Working
+ 972,DRIE Exemption
+ 973,Mailed - Not Reflected
+ 974,The ABCs of Housing
+ 975,Full Term Mobile Food Vendor License
+ 976,Medicaid
+ 977,Food Stamp
+ 978,Cash Assistance
+ 979,Billing Name Incorrect
+ 980,Waive Penalty for Late Payment
+ 981,The ABCs of Housing - Chinese
+ 982,Lost and Found
+ 983,Heat Bulletin
+ 984,The ABCs of Housing - Spanish
+ 985,Homeless Issue
+ 986,Electronic Fund Transfer (EFT) Problem
+ 987,Copy of Account Information
+ 988,Condo or Co-op Abatement
+ 989,Copy of Statement
+ 990,Property Address Incorrect
+ 991,Other Billing Issue
+ 992,Card - DOF Confirmation Number Issued
+ 993,Mitchell-Lama Housing List
+ 994,Billing Address Incorrect
+ 995,Waterway-Oil/Gasoline (IHB)
+ 996,Status of Payment Adjustment
+ 997,Cleanliness
+ 998,Barbershop License
+ 999,Food Service Establishment License
+ 1000,Debt Collection Agency License
+ 1001,Housing Information Guide For Tenants and Owners Notice
+ 1002,The ABCs of Housing - Arabic
+ 1003,Applied to Wrong Ticket
+ 1004,Tax Exemption
+ 1005,Commercial ICIP or ICAP Exemption
+ 1006,Misapplied Payment
+ 1007,Remove Mortgage
+ 1008,Frozen Dessert Manufacturer License
+ 1009,General Inquiry
+ 1010,Image of Ticket
+ 1011,Incorrect Amount Paid
+ 1012,Status of Appeal
+ 1013,Card - No DOF Confirmation Number Issued
+ 1014,Status of PV Refund
+ 1015,Filing and Technical Issues
+ 1016,Full Term Mobile Food Unit Permit
+ 1017,General Street Vendor License
+ 1018,Ready NY - English - Full Size
+ 1019,Condo Billing Issue
+ 1020,Locksmith License
+ 1021,Status of Hearing
+ 1022,General Complaint
+ 1023,Home Ownership Kit
+ 1024,Registration Clearance Request
+ 1025,Commercial Green Roof or Solar Panel Exemption
+ 1026,Driver Compliment
+ 1027,Commercial 421A Exemption
+ 1028,HomeFirst Down Payment Information
+ 1029,Ready NY - Businesses - English
+ 1030,Copy of Notice of Property Value
+ 1031,Seasonal Mobile Food Vendor License
+ 1032,Ready NY Guide - Pocket Sized - English
+ 1033,Newsstand License
+ 1034,ACRIS Incorrect
+ 1035,Secondhand Dealer Firearms License
+ 1036,Catering Establishment License
+ 1037,Cigarette Retail Dealer License
+ 1038,Housing Quality Standards (HQS) Inspections FAQs - English
+ 1039,Commercial CEP or CRP Exemption
+ 1040,Finance Business Center - Not Reflected
+ 1041,Street Fair Vendor License
+ 1042,Stoop Line Stand License
+ 1043,Home Improvement Contractor License
+ 1044,Individual Process Server License
+ 1045,Decision and Order
+ 1046,Card - Charged Twice
+ 1047,Status of Request to file Paper RPIE
+ 1048,Full Term Tattoo License
+ 1049,List of Outstanding Tickets
+ 1050,Disruptive Passenger
+ 1051,Interruption of Essential Services Notice
+ 1052,Commercial J51 Exemption
+ 1053,Secondhand Dealer Auto License
+ 1054,Ready NY My Emergency Plan - English
+ 1055,Winter Health and Safety Tips Guide
+ 1056,Sightseeing Guide License
+ 1057,Home Improvement Salesperson License
+ 1058,The ABCs of Housing - Russian
+ 1059,Delays
+ 1060,General Vendor Distributor License
+ 1061,Senior Citizen Home Assistance Program (SCHAP) Loan
+ 1062,Performance
+ 1063,Employment Agency License
+ 1064,Secondhand Dealer General License
+ 1065,Ready NY - English - Pocket Size
+ 1066,Commercial UDAAP Exemption
+ 1067,Death Certificate Before 1949 Order Form
+ 1068,Birth Certificate Before 1910 Order Form
+ 1069,Garage or Parking Lot License
+ 1070,Marriage Certificate Order Form
+ 1071,Elevator Not Inspected/Illegal/No Permit
+ 1072,Certificate of No Harassment or Exemption - SRO
+ 1073,Dead/Dying Tree
+ 1074,Certificate of No Harassment - Zoning
+ 1075,Hurricane Preparedness - English
+ 1076,Tow Truck Driver License
+ 1077,Temporary Food Service Establishment Permit
+ 1078,Settlement Reduction Not Shown
+ 1079,Ready NY - Arabic - Full Size
+ 1080,Electronic Sign - Portable
+ 1081,Ready NY - Kids - Middle and High School Students
+ 1082,Ready NY - French - Full Size
+ 1083,City Rebate
+ 1084,Restrooms
+ 1085,Non Retail Food Processing Establishment License
+ 1086,Homestead
+ 1087,Ready NY - Kids - Elementary School Students
+ 1088,Ready NY - Flooding
+ 1089,Fallen Debris from Tunnel
+ 1090,Ready NY - Reference Card
+ 1091,Genealogy Research Application
+ 1092,Emergency Notice
+ 1093,Dealer in Devices for Disabled License
+ 1094,Ready NY My Emergency Plan - Spanish
+ 1095,Hurricane Preparedness - Spanish
+ 1096,Hurricane Preparedness - Haitian Creole
+ 1097,Sightseeing Bus License
+ 1098,Hurricane Preparedness - Arabic
+ 1099,Process Server Organization License
+ 1100,Licensed Home Improvement Contractor Bumper Sticker
+ 1101,Ready NY Beat the Heat - English
+ 1102,Temporary Amusement Device License
+ 1103,Ready NY- Pandemic Flu
+ 1104,Pothole - Tunnel
+ 1105,Shelter for Homeless Animals License
+ 1106,Auctioneer License
+ 1107,Marshal - Not Reflected
+ 1108,Ready NY My Emergency Plan - Traditional Chinese
+ 1109,Sidewalk Cafe License
+ 1110,Cabaret License
+ 1111,Locksmith Apprentice License
+ 1112,New Lead Law Rights and Requirements
+ 1113,Ready NY - Spanish - Full Size
+ 1114,Scrap Metal Processor License
+ 1115,Announcements
+ 1116,3 Sweeping/Missed-Inadequate
+ 1117,Electronics and Home Appliance Service Dealer License
+ 1118,Ready NY - Pets - English
+ 1119,Pedicab Driver
+ 1120,Ready NY - Chinese Traditional - Full Size
+ 1121,Summer Heat - English
+ 1122,Laundry License
+ 1123,Summer Heat - Russian
+ 1124,Ready NY My Emergency Plan - Russian
+ 1125,The ABCs of Housing - Korean
+ 1126,Ready NY - Russian - Pocket Size
+ 1127,Strip Paving
+ 1128,Sign Blocked by Tree
+ 1129,Ready NY - Haitian Creole - Full Size
+ 1130,Beach/Pool Closure
+ 1131,Conflict Monitor
+ 1132,Dead End Sign
+ 1133,Seasonal Food Cart Vendor Permit
+ 1134,Amusement Arcade License
+ 1135,Tow Truck Company License
+ 1136,Commercial DAMP Exemption
+ 1137,Waterway-Floatables (IHC)
+ 1138,Pet Store - New License
+ 1139,Ready NY - Chinese Traditional - Pocket Size
+ 1140,The ABCs of Housing - Haitian Creole
+ 1141,Ready NY - Spanish - Pocket Size
+ 1142,Ready NY - Small and Mid-Sized Companies
+ 1143,Commercial 421B Exemption
+ 1144,Booting Company License
+ 1145,Animal Grooming License
+ 1146,Temporary Tattoo License
+ 1147,Commercial 421G Exemption
+ 1148,Scale Dealer or Repairer License
+ 1149,General Vendor Waiting List Application
+ 1150,Ready NY - Russian - Full Size
+ 1151,Ready NY My Emergency Plan - Haitian Creole
+ 1152,Going Out of Business Sale License
+ 1153,Graffiti - Tunnel
+ 1154,Compressed Air License
+ 1155,Det-Sens Cabinet
+ 1156,Ready NY - Chinese Simplified - Full Size
+ 1157,Laundry Jobber License
+ 1158,NO WATER - WNW
+ 1159,Animal Boarding License
+ 1160,Electronics Store License
+ 1161,Auction House License
+ 1162,Hydrotest
+ 1163,Pawn Broker License
+ 1164,Smoke/Odor
+ 1165,Food Source/Protection
+ 1166,Garbage
+ 1167,Equipment
+ 1168,Pool or Billiard Hall License
+ 1169,EXPY Sign Fixt Cover
+ 1170,Personal Crime Victim or Good Samaritan Exemption
+ 1171,Dust Cover
+ 1172,Ready NY My Emergency Plan - Italian
+ 1173,Gaming Cafe License
+ 1174,Portable Amusement Ride License
+ 1175,Curb Violation
+ 1176,Status Call
+ 1177,No Status Call
+ 1178,Summer Heat - Spanish
+ 1179,Electrical - Unlicensed/Illegal/Improper Work In Progress
+ 1180,Messenger
+ 1181,Hurricane Preparedness - Chinese
+ 1182,ABANDONED APARTMENT UNIT
+ 1183,Hurricane Preparedness - Russian
+ 1184,License Violation
+ 1185,Placement
+ 1186,Insects / Pests
+ 1187,Driver Complaint - Non Passenger
+ 1188,Retail Store
+ 1189,Initial
+ 1190,To FDNY Approved System
+ 1191,Ready NY - Korean - Full Size
+ 1192,For Violation
+ 1193,Licensee Complaint
+ 1194,Bodega/Deli/Supermarket
+ 1195,Multi Agency Joint Inspection
+ 1196,Inhalation Therapy Supervising Technician License
+ 1197,Horse Drawn Carriage Driver License
+ 1198,Bowstring Truss Tracking Complaint
+ 1199,Retail Laundry License Application
+ 1200,14 Derelict Vehicles
+ 1201,Retaining Wall Tracking Complaint
+ 1202,Ready NY - Businesses - Spanish
+ 1203,Notice of Housing Code Enforcement Issues
+ 1204,Pathogens License
+ 1205,Tobacco Retail Dealer License Application
+ 1206,Sample Suspected Gas Leak Notice
+ 1207,Street Cave-In *Dep Internal Use Only* (SG1)
+ 1208,Housing Quality Standards (HQS) Inspections FAQs - Spanish
+ 1209,HOUSING QUALITY STANDARDS
+ 1210,For Letter of Defect
+ 1211,New Building
+ 1212,Milk/Dairy Products
+ 1213,Voluntary
+ 1214,Business Opportunities/RFPs
+ 1215,Reinspection
+ 1216,Construction Safety Compliance Action
+ 1217,Amusement Ride
+ 1218,Structurally Compromised Building (LL33/08)
+ 1219,Non-med Compressed Gas - New
+ 1220,Other Hazmats
+ 1221,Cell Phone Store
+ 1222,Existing Building
+ 1223,Re-inspection
+ 1224,Iguana
+ 1225,Because of Violation
+ 1226,Disabled Device Dealer
+ 1227,Debt Collection Agency
+ 1228,Semi-Annual Homeless Shelter Inspection: Electrical
+ 1229,Semi-Annual Homeless Shelter Inspection: Construction
+ 1230,DCP/BSA Compliance Inspection
+ 1231,Semi-Annual Homeless Shelter Inspection: Plumbing
+ 1232,Illegal Commercial Or Manufacturing Use In a C1 Or C2 Zone
+ 1233,Car Dealer - Used
+ 1234,Permission to Publish Contract
+ 1235,Ticket Seller Business License Application
+ 1236,Certificate of No Harassment (CONH) Application
+ 1237,Certificate of No Harassment (CONH) Exemption
+ 1238,Commercial Government Exemption
+ 1239,Tow Truck Exemption License
+ 1240,EXPY Fixture
+ 1241,Industrial Laundry Delivery License Application
+ 1242,Guide Rail
+ 1243,Wireless Antenna
+ 1244,Ready NY My Emergency Plan - Korean
+ 1245,Ready NY My Emergency Plan - Polish
+ 1246,Accident/Explosion - Boiler
+ 1247,Summer Heat - Chinese
+ 1248,Ready NY - Polish - Full Size
+ 1249,Sidewalk Consultation
+ 1250,Noise: Vehicle (NR2)
+ 1251,Con Edison Referral
+ 1252,Documents Not Returned
+ 1253,Initial - Construction
+ 1254,Snow Removal
+ 1255,Snow Emergency
+ 1256,Relocation of Muni Meter
+ 1257,Elevator In (Fdny) Readiness - None
+ 1258,Suspected Street Cut
+ 1259,Overexposure During Treatment
+ 1260,SCRIE Application Denial
+ 1261,Unincorporated Business Tax - Other
+ 1262,"Air: Smoke, Residential (AA1)"
+ 1263,"BUILDING COLLAPSE/FIRE, (ASBESTOS RELATED) *FOR DEP INTERNAL USE ONLY* (HH2)"
+ 1264,Unincorporated Business Tax - Return Filing
+ 1265,No Statement of Job Conditions
+ 1266,Excise Taxes-Refund
+ 1267,Child or Minor Tanning
+ 1268,Injury or Illness from Tanning
+ 1269,Loan Offer
+ 1270,Defective Streetlight
+ 1271,EXPY Sign Reflector
+ 1272,Commercial Rent Tax-Other
+ 1273,Sediment
+ 1274,Workplace - 10 or Less Staff
+ 1275,Failure to Comply with Annual Crane Inspection
+ 1276,Facility Unregistered
+ 1277,"Air: Open Fire, Construction/Demolition (AC4)"
+ 1278,Extra Parts
+ 1279,Unincorporated Business Tax - Refund
+ 1280,Musical Instrument
+ 1281,Green Roof or Solar Panel Exemption
+ 1282,Sign Defect - Lot
+ 1283,Crack Sealing
+ 1284,Cigarette Vending Machine
+ 1285,Marine Globe
+ 1286,Mssg Sign Multi Lamp
+ 1287,Marine Flasher
+ 1288,Technician Unlicensed
+ 1289,Dumpster - Causing Damage
+ 1290,Minor Access
+ 1291,Excise Taxes-Audit
+ 1292,Not Certified
+ 1293,Toy Gun Sale
+ 1294,Fire Alarm Lamp Cycling
+ 1295,High Interest Loan
+ 1296,SCRIE Application Appeal
+ 1297,Cable Television
+ 1298,Mapping Information
+ 1299,NYC.gov Web Site
+ 1300,EZ PASS READER
+ 1301,RPIE
+ 1302,RTMS
+ 1303,Excise Taxes-Other
+ 1304,Facility Complaint
+ 1305,Inspection Requests/Complaints
+ 1306,Building Information/Construction History
+ 1307,Birth/Death Certificates
+ 1308,Communications/Intergovernmental
+ 1309,Ethernet Cable
+ 1310,Mental Health
+ 1311,ALJ Division
+ 1312,Fixture(S)
+ 1313,General Business Tax - Return filing
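The block above is the tail of a code-to-descriptor lookup table for the 311 complaint categories used in the analysis. Below is a minimal sketch of loading such a mapping and decoding an integer-encoded complaint column; the file name `complaint_type_map.csv`, the absence of a header row, and the column names are assumptions, since the file's actual name is not visible in this excerpt:

```python
import pandas as pd

# Assumed file name and layout: two columns (integer code, descriptor), no header row.
mapping = pd.read_csv("data/complaint_type_map.csv", header=None,
                      names=["code", "complaint_type"])
code_to_label = dict(zip(mapping["code"], mapping["complaint_type"]))

# Example: decode a few encoded complaint codes back to readable labels.
records = pd.DataFrame({"complaint_code": [649, 687, 802]})
records["complaint_type"] = records["complaint_code"].map(code_to_label)
print(records)
```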
data/drop_vars.xlsx ADDED
Binary file (10.8 kB). View file
 
data/weather_aggregated_2010-2018.csv ADDED
The diff for this file is too large to render. See raw diff
 
figures/bounded_map.html ADDED
@@ -0,0 +1,95 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+
+ <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
+
+ <script>
+ L_NO_TOUCH = false;
+ L_DISABLE_3D = false;
+ </script>
+
+ <style>html, body {width: 100%;height: 100%;margin: 0;padding: 0;}</style>
+ <style>#map {position:absolute;top:0;bottom:0;right:0;left:0;}</style>
+ <script src="https://cdn.jsdelivr.net/npm/leaflet@1.9.3/dist/leaflet.js"></script>
+ <script src="https://code.jquery.com/jquery-3.7.1.min.js"></script>
+ <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.2.2/dist/js/bootstrap.bundle.min.js"></script>
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/Leaflet.awesome-markers/2.0.2/leaflet.awesome-markers.js"></script>
+ <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/leaflet@1.9.3/dist/leaflet.css"/>
+ <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@5.2.2/dist/css/bootstrap.min.css"/>
+ <link rel="stylesheet" href="https://netdna.bootstrapcdn.com/bootstrap/3.0.0/css/bootstrap.min.css"/>
+ <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@fortawesome/fontawesome-free@6.2.0/css/all.min.css"/>
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/Leaflet.awesome-markers/2.0.2/leaflet.awesome-markers.css"/>
+ <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/python-visualization/folium/folium/templates/leaflet.awesome.rotate.min.css"/>
+
+ <meta name="viewport" content="width=device-width,
+ initial-scale=1.0, maximum-scale=1.0, user-scalable=no" />
+ <style>
+ #map_1bca46dd8c0ecb99e8cf98a8490d26c6 {
+ position: relative;
+ width: 100.0%;
+ height: 100.0%;
+ left: 0.0%;
+ top: 0.0%;
+ }
+ .leaflet-container { font-size: 1rem; }
+ </style>
+
+ </head>
+ <body>
+
+
+ <div class="folium-map" id="map_1bca46dd8c0ecb99e8cf98a8490d26c6" ></div>
+
+ </body>
+ <script>
+
+
+ var map_1bca46dd8c0ecb99e8cf98a8490d26c6 = L.map(
+ "map_1bca46dd8c0ecb99e8cf98a8490d26c6",
+ {
+ center: [40.7128, -74.006],
+ crs: L.CRS.EPSG3857,
+ zoom: 10,
+ zoomControl: false,
+ preferCanvas: false,
+ scrollWheelZoom: false,
+ dragging: false,
+ }
+ );
+
+
+
+
+
+ var tile_layer_5610f1ba4421bfdd6b11b0d3a8230311 = L.tileLayer(
+ "https://{s}.basemaps.cartocdn.com/light_all/{z}/{x}/{y}{r}.png",
+ {"attribution": "\u0026copy; \u003ca href=\"https://www.openstreetmap.org/copyright\"\u003eOpenStreetMap\u003c/a\u003e contributors \u0026copy; \u003ca href=\"https://carto.com/attributions\"\u003eCARTO\u003c/a\u003e", "detectRetina": false, "maxNativeZoom": 20, "maxZoom": 20, "minZoom": 0, "noWrap": false, "opacity": 1, "subdomains": "abcd", "tms": false}
+ );
+
+
+ tile_layer_5610f1ba4421bfdd6b11b0d3a8230311.addTo(map_1bca46dd8c0ecb99e8cf98a8490d26c6);
+
+
+ var rectangle_7a26a5f5f0553f8e9c5a706c1184bf75 = L.rectangle(
+ [[40.49804421521046, -74.25521082506387], [40.91294056699566, -73.70038354802529]],
+ {"bubblingMouseEvents": true, "color": "#F1807E", "dashArray": "5 5", "dashOffset": null, "fill": true, "fillColor": "blue", "fillOpacity": 0.2, "fillRule": "evenodd", "lineCap": "round", "lineJoin": "round", "noClip": false, "opacity": 1.0, "smoothFactor": 1.0, "stroke": true, "weight": 3}
+ ).addTo(map_1bca46dd8c0ecb99e8cf98a8490d26c6);
+
+
+ var popup_c20294d340dae6e3dee1251d70105f4e = L.popup({"maxWidth": "100%"});
+
+
+
+ var html_e2caf4fa03251f2359325a8b2c62d96d = $(`<div id="html_e2caf4fa03251f2359325a8b2c62d96d" style="width: 100.0%; height: 100.0%;">Service Data Coverage Zone</div>`)[0];
+ popup_c20294d340dae6e3dee1251d70105f4e.setContent(html_e2caf4fa03251f2359325a8b2c62d96d);
+
+
+
+ rectangle_7a26a5f5f0553f8e9c5a706c1184bf75.bindPopup(popup_c20294d340dae6e3dee1251d70105f4e)
+ ;
+
+
+
+ </script>
+ </html>
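figures/bounded_map.html above is folium output: a locked CARTO light basemap centered on NYC with a dashed rectangle marking the "Service Data Coverage Zone". Here is a sketch of the folium call that would reproduce it, reconstructed from the rendered Leaflet script rather than taken from the repository's own code:

```python
import folium

# Settings read off the generated HTML above (center, zoom, tiles, rectangle, popup).
m = folium.Map(
    location=[40.7128, -74.006],
    zoom_start=10,
    tiles="CartoDB positron",
    zoom_control=False,
    scroll_wheel_zoom=False,  # extra kwargs are forwarded to the Leaflet map options
    dragging=False,
)
folium.Rectangle(
    bounds=[[40.49804421521046, -74.25521082506387],
            [40.91294056699566, -73.70038354802529]],
    color="#F1807E",
    dash_array="5 5",
    fill=True,
    fill_color="blue",
    fill_opacity=0.2,
    weight=3,
    popup="Service Data Coverage Zone",
).add_to(m)
m.save("figures/bounded_map.html")
```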
figures/final_map.html ADDED
The diff for this file is too large to render. See raw diff
 
figures/map1.html ADDED
The diff for this file is too large to render. See raw diff
 
figures/map2.html ADDED
The diff for this file is too large to render. See raw diff
 
figures/model_performance.png ADDED
models/BERTopic/config.json ADDED
@@ -0,0 +1,17 @@
+ {
+ "calculate_probabilities": false,
+ "language": null,
+ "low_memory": false,
+ "min_topic_size": 10,
+ "n_gram_range": [
+ 1,
+ 1
+ ],
+ "nr_topics": 8,
+ "seed_topic_list": null,
+ "top_n_words": 5,
+ "verbose": true,
+ "zeroshot_min_similarity": 0.7,
+ "zeroshot_topic_list": null,
+ "embedding_model": "sentence-transformers/all-MiniLM-L6-v2"
+ }
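config.json records the hyperparameters of the saved BERTopic model: MiniLM sentence embeddings, a minimum topic size of 10, topics reduced to 8, and 5 words kept per topic. A sketch of the equivalent constructor call and of reloading the fitted model from this directory, inferred from the config rather than copied from the app code:

```python
from bertopic import BERTopic

# Constructor matching the non-default values stored in config.json above.
topic_model = BERTopic(
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
    min_topic_size=10,
    nr_topics=8,
    top_n_words=5,
    verbose=True,
)

# Reload the fitted model shipped in this folder and inspect its topics.
loaded = BERTopic.load("models/BERTopic")
print(loaded.get_topic_info())
```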
models/BERTopic/ctfidf.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b0e5537b25ef16e60f33d219dbc53128240bdd3ef0677273cbcc337157562112
+ size 14020
models/BERTopic/ctfidf_config.json ADDED
@@ -0,0 +1,408 @@
1
+ {
2
+ "ctfidf_model": {
3
+ "bm25_weighting": false,
4
+ "reduce_frequent_words": false
5
+ },
6
+ "vectorizer_model": {
7
+ "params": {
8
+ "analyzer": "word",
9
+ "binary": false,
10
+ "decode_error": "strict",
11
+ "encoding": "utf-8",
12
+ "input": "content",
13
+ "lowercase": true,
14
+ "max_df": 1.0,
15
+ "max_features": null,
16
+ "min_df": 2,
17
+ "ngram_range": [
18
+ 1,
19
+ 2
20
+ ],
21
+ "stop_words": "english",
22
+ "strip_accents": null,
23
+ "token_pattern": "(?u)\\b\\w\\w+\\b",
24
+ "vocabulary": null
25
+ },
26
+ "vocab": {
27
+ "request": 285,
28
+ "large": 197,
29
+ "collection": 63,
30
+ "posted": 261,
31
+ "parking": 245,
32
+ "sign": 312,
33
+ "violation": 365,
34
+ "working": 376,
35
+ "contrary": 83,
36
+ "stop": 328,
37
+ "work": 374,
38
+ "order": 241,
39
+ "dirty": 110,
40
+ "sidewalk": 311,
41
+ "access": 3,
42
+ "receipt": 273,
43
+ "site": 316,
44
+ "street": 331,
45
+ "condition": 76,
46
+ "ice": 169,
47
+ "non": 232,
48
+ "missed": 224,
49
+ "area": 20,
50
+ "license": 203,
51
+ "engine": 126,
52
+ "idling": 170,
53
+ "cond": 75,
54
+ "lead": 200,
55
+ "residential": 290,
56
+ "sewer": 309,
57
+ "use": 358,
58
+ "comments": 66,
59
+ "water": 370,
60
+ "meter": 221,
61
+ "broken": 34,
62
+ "leaking": 202,
63
+ "private": 263,
64
+ "residence": 288,
65
+ "refund": 277,
66
+ "return": 295,
67
+ "permit": 249,
68
+ "improper": 175,
69
+ "certificate": 50,
70
+ "occupancy": 236,
71
+ "illegal": 171,
72
+ "plumbing": 256,
73
+ "pedestrian": 248,
74
+ "signal": 313,
75
+ "defective": 101,
76
+ "inadequate": 179,
77
+ "heat": 159,
78
+ "new": 230,
79
+ "bus": 36,
80
+ "placement": 254,
81
+ "repair": 284,
82
+ "building": 35,
83
+ "damaged": 93,
84
+ "cracked": 88,
85
+ "bicycle": 28,
86
+ "flooding": 140,
87
+ "overnight": 242,
88
+ "commercial": 67,
89
+ "storage": 329,
90
+ "surveillance": 337,
91
+ "waste": 369,
92
+ "blocked": 30,
93
+ "construction": 79,
94
+ "school": 306,
95
+ "property": 270,
96
+ "cover": 86,
97
+ "noise": 231,
98
+ "gas": 147,
99
+ "problem": 265,
100
+ "delivery": 103,
101
+ "goods": 150,
102
+ "curb": 90,
103
+ "hitting": 162,
104
+ "phone": 252,
105
+ "c1": 40,
106
+ "trees": 351,
107
+ "rent": 283,
108
+ "unauthorized": 354,
109
+ "chronic": 55,
110
+ "hanging": 157,
111
+ "accident": 4,
112
+ "cleaning": 58,
113
+ "asp": 22,
114
+ "establishment": 129,
115
+ "public": 272,
116
+ "space": 320,
117
+ "dispute": 111,
118
+ "home": 163,
119
+ "electronics": 123,
120
+ "chemical": 53,
121
+ "chained": 51,
122
+ "smoking": 319,
123
+ "car": 42,
124
+ "general": 149,
125
+ "maintenance": 214,
126
+ "asbestos": 21,
127
+ "open": 239,
128
+ "missing": 226,
129
+ "emergency": 124,
130
+ "odor": 237,
131
+ "catch": 48,
132
+ "basin": 26,
133
+ "tax": 342,
134
+ "temporary": 345,
135
+ "failure": 134,
136
+ "debris": 98,
137
+ "falling": 136,
138
+ "danger": 95,
139
+ "air": 9,
140
+ "defect": 100,
141
+ "metal": 219,
142
+ "protruding": 271,
143
+ "information": 183,
144
+ "cut": 91,
145
+ "vacant": 360,
146
+ "lot": 211,
147
+ "resident": 289,
148
+ "pipe": 253,
149
+ "toilet": 349,
150
+ "button": 38,
151
+ "wiring": 373,
152
+ "buzzer": 39,
153
+ "vehicle": 361,
154
+ "carbon": 43,
155
+ "monoxide": 227,
156
+ "smoke": 318,
157
+ "audit": 24,
158
+ "damp": 94,
159
+ "leak": 201,
160
+ "st": 324,
161
+ "facility": 133,
162
+ "law": 198,
163
+ "cigarette": 56,
164
+ "sale": 303,
165
+ "minor": 222,
166
+ "pool": 258,
167
+ "graffiti": 151,
168
+ "speed": 322,
169
+ "scale": 305,
170
+ "hours": 165,
171
+ "safety": 301,
172
+ "equipment": 128,
173
+ "signs": 314,
174
+ "notice": 233,
175
+ "box": 31,
176
+ "weeds": 372,
177
+ "grating": 152,
178
+ "removal": 280,
179
+ "requested": 286,
180
+ "controller": 85,
181
+ "flasher": 139,
182
+ "loose": 210,
183
+ "time": 347,
184
+ "switch": 340,
185
+ "stump": 335,
186
+ "sampling": 304,
187
+ "required": 287,
188
+ "head": 158,
189
+ "card": 45,
190
+ "stuck": 333,
191
+ "commission": 68,
192
+ "lack": 195,
193
+ "litter": 207,
194
+ "comm": 65,
195
+ "bldg": 29,
196
+ "basket": 27,
197
+ "fallen": 135,
198
+ "bridge": 33,
199
+ "warning": 367,
200
+ "prohibited": 269,
201
+ "inspection": 187,
202
+ "roof": 298,
203
+ "illness": 174,
204
+ "injury": 185,
205
+ "ticket": 346,
206
+ "clear": 59,
207
+ "insects": 186,
208
+ "highway": 161,
209
+ "multiple": 229,
210
+ "devices": 107,
211
+ "animal": 13,
212
+ "lane": 196,
213
+ "control": 84,
214
+ "dirt": 108,
215
+ "clothing": 61,
216
+ "high": 160,
217
+ "pressure": 262,
218
+ "debt": 99,
219
+ "materials": 218,
220
+ "agency": 8,
221
+ "application": 17,
222
+ "station": 325,
223
+ "unguarded": 355,
224
+ "driveway": 117,
225
+ "gallons": 144,
226
+ "device": 106,
227
+ "service": 307,
228
+ "swimming": 338,
229
+ "coin": 62,
230
+ "tobacco": 348,
231
+ "taste": 341,
232
+ "filing": 138,
233
+ "technical": 343,
234
+ "issues": 192,
235
+ "rights": 297,
236
+ "miscellaneous": 223,
237
+ "color": 64,
238
+ "division": 112,
239
+ "retaining": 293,
240
+ "zoning": 378,
241
+ "lawn": 199,
242
+ "status": 326,
243
+ "enforcement": 125,
244
+ "excessive": 131,
245
+ "contractor": 82,
246
+ "dry": 118,
247
+ "complaince": 71,
248
+ "electrical": 121,
249
+ "amusement": 11,
250
+ "ride": 296,
251
+ "incident": 180,
252
+ "received": 274,
253
+ "program": 268,
254
+ "nypd": 235,
255
+ "issue": 191,
256
+ "electronic": 122,
257
+ "transfer": 350,
258
+ "eft": 120,
259
+ "address": 6,
260
+ "incorrect": 181,
261
+ "wrong": 377,
262
+ "paper": 244,
263
+ "list": 206,
264
+ "passenger": 247,
265
+ "guide": 154,
266
+ "assistance": 23,
267
+ "exemption": 132,
268
+ "sro": 323,
269
+ "truck": 352,
270
+ "driver": 115,
271
+ "city": 57,
272
+ "tunnel": 353,
273
+ "licensed": 204,
274
+ "improvement": 177,
275
+ "sticker": 327,
276
+ "animals": 14,
277
+ "company": 69,
278
+ "waterway": 371,
279
+ "abcs": 1,
280
+ "housing": 167,
281
+ "haitian": 155,
282
+ "creole": 89,
283
+ "apartment": 16,
284
+ "unit": 356,
285
+ "retail": 292,
286
+ "store": 330,
287
+ "initial": 184,
288
+ "fdny": 137,
289
+ "approved": 18,
290
+ "multi": 228,
291
+ "business": 37,
292
+ "annual": 15,
293
+ "related": 278,
294
+ "dep": 104,
295
+ "internal": 189,
296
+ "vending": 362,
297
+ "machine": 213,
298
+ "marine": 217,
299
+ "dumpster": 119,
300
+ "damage": 92,
301
+ "cable": 41,
302
+ "missed collection": 225,
303
+ "street cond": 332,
304
+ "use comments": 359,
305
+ "private residence": 264,
306
+ "improper use": 176,
307
+ "residential building": 291,
308
+ "plumbing work": 257,
309
+ "work illegal": 375,
310
+ "construction site": 80,
311
+ "cover missing": 87,
312
+ "odor sewer": 238,
313
+ "sewer catch": 310,
314
+ "catch basin": 49,
315
+ "danger falling": 96,
316
+ "metal protruding": 220,
317
+ "defective street": 102,
318
+ "carbon monoxide": 44,
319
+ "safety equipment": 302,
320
+ "permit license": 250,
321
+ "grating missing": 153,
322
+ "card stuck": 46,
323
+ "stuck meter": 334,
324
+ "warning signal": 368,
325
+ "clear water": 60,
326
+ "dirt litter": 109,
327
+ "litter debris": 208,
328
+ "open unguarded": 240,
329
+ "swimming pool": 339,
330
+ "amusement ride": 12,
331
+ "address incorrect": 7,
332
+ "incorrect status": 182,
333
+ "driver license": 116,
334
+ "home improvement": 164,
335
+ "improvement contractor": 178,
336
+ "company license": 70,
337
+ "abcs housing": 2,
338
+ "haitian creole": 156,
339
+ "dep internal": 105,
340
+ "internal use": 190,
341
+ "vending machine": 363,
342
+ "unknown": 357,
343
+ "line": 205,
344
+ "knocked": 193,
345
+ "post": 260,
346
+ "wall": 366,
347
+ "excavation": 130,
348
+ "support": 336,
349
+ "foreign": 142,
350
+ "dead": 97,
351
+ "contact": 81,
352
+ "installation": 188,
353
+ "break": 32,
354
+ "house": 166,
355
+ "change": 52,
356
+ "management": 215,
357
+ "conditioning": 77,
358
+ "condo": 78,
359
+ "foundation": 143,
360
+ "referral": 275,
361
+ "route": 299,
362
+ "concrete": 74,
363
+ "panel": 243,
364
+ "complaint": 72,
365
+ "basement": 25,
366
+ "garage": 145,
367
+ "sink": 315,
368
+ "reflected": 276,
369
+ "chinese": 54,
370
+ "spanish": 321,
371
+ "arabic": 19,
372
+ "hqs": 168,
373
+ "english": 127,
374
+ "russian": 300,
375
+ "portable": 259,
376
+ "korean": 194,
377
+ "10": 0,
378
+ "television": 344,
379
+ "retaining wall": 294,
380
+ "parking lot": 246,
381
+ "air conditioning": 10,
382
+ "location": 209,
383
+ "manufacturing": 216,
384
+ "care": 47,
385
+ "activity": 5,
386
+ "low": 212,
387
+ "food": 141,
388
+ "number": 234,
389
+ "remove": 281,
390
+ "pet": 251,
391
+ "compressed": 73,
392
+ "illegal use": 173,
393
+ "illegal improper": 172,
394
+ "sewage": 308,
395
+ "drinking": 113,
396
+ "garbage": 146,
397
+ "small": 317,
398
+ "removing": 282,
399
+ "plants": 255,
400
+ "problem use": 266,
401
+ "drinking water": 114,
402
+ "gas sewer": 148,
403
+ "ventilation": 364,
404
+ "problems": 267,
405
+ "related problems": 279
406
+ }
407
+ }
408
+ }
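ctfidf_config.json above stores the fitted vocabulary together with the parameters of the class-based TF-IDF step and its CountVectorizer: unigrams and bigrams, English stop words removed, and terms kept only if they appear in at least two documents. A sketch of the matching objects as they would be passed to BERTopic, inferred from the stored parameters:

```python
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer

vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),   # unigrams and bigrams, as in "ngram_range" above
    min_df=2,             # drop terms seen in fewer than two documents
    stop_words="english",
)
ctfidf_model = ClassTfidfTransformer(
    bm25_weighting=False,
    reduce_frequent_words=False,
)
```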
models/BERTopic/topic_embeddings.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:94f9c82186355ce319ce3da6352c0a285b91e216bdb680ec4e453d2df2f3c3d1
+ size 12376
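The two .safetensors entries are Git LFS pointers (spec version, sha256 oid, byte size), so the diff shows metadata rather than tensor data. Once the actual objects are pulled, the arrays can be read back; a minimal sketch using the safetensors numpy API:

```python
from safetensors.numpy import load_file

# Requires the real LFS object on disk, not just the pointer text shown in the diff.
tensors = load_file("models/BERTopic/topic_embeddings.safetensors")
for name, array in tensors.items():
    print(name, array.shape)
```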
models/BERTopic/topics.json ADDED
@@ -0,0 +1,1671 @@
1
+ {
2
+ "topic_representations": {
3
+ "-1": [
4
+ [
5
+ "order",
6
+ 0.05415367852300953
7
+ ],
8
+ [
9
+ "property",
10
+ 0.05110874633317529
11
+ ],
12
+ [
13
+ "inspection",
14
+ 0.047957198774650545
15
+ ],
16
+ [
17
+ "condition",
18
+ 0.04684968413874401
19
+ ],
20
+ [
21
+ "construction",
22
+ 0.040871670084454234
23
+ ]
24
+ ],
25
+ "0": [
26
+ [
27
+ "damaged",
28
+ 0.12203031954457103
29
+ ],
30
+ [
31
+ "sign",
32
+ 0.10565370415490198
33
+ ],
34
+ [
35
+ "sidewalk",
36
+ 0.09204086251770861
37
+ ],
38
+ [
39
+ "missing",
40
+ 0.08904351211067452
41
+ ],
42
+ [
43
+ "housing",
44
+ 0.08425536080287954
45
+ ]
46
+ ],
47
+ "1": [
48
+ [
49
+ "license",
50
+ 0.2485641290752132
51
+ ],
52
+ [
53
+ "complaint",
54
+ 0.14648917413895213
55
+ ],
56
+ [
57
+ "illegal",
58
+ 0.10854741509204496
59
+ ],
60
+ [
61
+ "violation",
62
+ 0.06196592365898547
63
+ ],
64
+ [
65
+ "permit",
66
+ 0.054220183201612294
67
+ ]
68
+ ],
69
+ "2": [
70
+ [
71
+ "water",
72
+ 0.20043627364808767
73
+ ],
74
+ [
75
+ "basin",
76
+ 0.1360096285478291
77
+ ],
78
+ [
79
+ "litter",
80
+ 0.12766055935466478
81
+ ],
82
+ [
83
+ "missed",
84
+ 0.12411889941590681
85
+ ],
86
+ [
87
+ "sewer",
88
+ 0.11794480155381776
89
+ ]
90
+ ],
91
+ "3": [
92
+ [
93
+ "noise",
94
+ 0.7067969405376407
95
+ ],
96
+ [
97
+ "animal",
98
+ 0.23151186956043018
99
+ ],
100
+ [
101
+ "truck",
102
+ 0.18520949564834413
103
+ ],
104
+ [
105
+ "dead",
106
+ 0.1440316275215734
107
+ ],
108
+ [
109
+ "equipment",
110
+ 0.1267727574626285
111
+ ]
112
+ ],
113
+ "4": [
114
+ [
115
+ "odor",
116
+ 0.40165153174580426
117
+ ],
118
+ [
119
+ "food",
120
+ 0.30714528898208565
121
+ ],
122
+ [
123
+ "air",
124
+ 0.29978554690340886
125
+ ],
126
+ [
127
+ "smoke",
128
+ 0.19547149449356388
129
+ ],
130
+ [
131
+ "taste",
132
+ 0.19547149449356388
133
+ ]
134
+ ],
135
+ "5": [
136
+ [
137
+ "english",
138
+ 0.4504386781775388
139
+ ],
140
+ [
141
+ "emergency",
142
+ 0.379178358375766
143
+ ],
144
+ [
145
+ "spanish",
146
+ 0.3611251470424905
147
+ ],
148
+ [
149
+ "chinese",
150
+ 0.3317092027769569
151
+ ],
152
+ [
153
+ "heat",
154
+ 0.3317092027769569
155
+ ]
156
+ ],
157
+ "6": [
158
+ [
159
+ "exemption",
160
+ 0.693831167446274
161
+ ],
162
+ [
163
+ "commercial",
164
+ 0.49112096865161
165
+ ],
166
+ [
167
+ "tax",
168
+ 0.40939072124701686
169
+ ],
170
+ [
171
+ "business",
172
+ 0.33495604465665013
173
+ ],
174
+ [
175
+ "refund",
176
+ 0.17799030392909884
177
+ ]
178
+ ]
179
+ },
180
+ "topics": [
181
+ -1,
182
+ 4,
183
+ 0,
184
+ 2,
185
+ 0,
186
+ 0,
187
+ 0,
188
+ 3,
189
+ -1,
190
+ 0,
191
+ 0,
192
+ 0,
193
+ 0,
194
+ -1,
195
+ 1,
196
+ -1,
197
+ 0,
198
+ 0,
199
+ -1,
200
+ 0,
201
+ 0,
202
+ 0,
203
+ 1,
204
+ 1,
205
+ 2,
206
+ 3,
207
+ 0,
208
+ -1,
209
+ -1,
210
+ -1,
211
+ 4,
212
+ -1,
213
+ -1,
214
+ 0,
215
+ 0,
216
+ 0,
217
+ -1,
218
+ 1,
219
+ -1,
220
+ 3,
221
+ 3,
222
+ -1,
223
+ 3,
224
+ 3,
225
+ 1,
226
+ 4,
227
+ -1,
228
+ 2,
229
+ 0,
230
+ 0,
231
+ 3,
232
+ -1,
233
+ -1,
234
+ 0,
235
+ 0,
236
+ -1,
237
+ -1,
238
+ -1,
239
+ 3,
240
+ 0,
241
+ 0,
242
+ 2,
243
+ 2,
244
+ 0,
245
+ 2,
246
+ 1,
247
+ 0,
248
+ 2,
249
+ 2,
250
+ 0,
251
+ -1,
252
+ 2,
253
+ -1,
254
+ 0,
255
+ 2,
256
+ 0,
257
+ 0,
258
+ -1,
259
+ 0,
260
+ -1,
261
+ 2,
262
+ 0,
263
+ 2,
264
+ 2,
265
+ 4,
266
+ 1,
267
+ 0,
268
+ 0,
269
+ 1,
270
+ 0,
271
+ 0,
272
+ -1,
273
+ 0,
274
+ 0,
275
+ 4,
276
+ -1,
277
+ -1,
278
+ -1,
279
+ -1,
280
+ 0,
281
+ -1,
282
+ 4,
283
+ -1,
284
+ -1,
285
+ 1,
286
+ 0,
287
+ 3,
288
+ -1,
289
+ 4,
290
+ 3,
291
+ -1,
292
+ 2,
293
+ -1,
294
+ 0,
295
+ -1,
296
+ 2,
297
+ 1,
298
+ 0,
299
+ -1,
300
+ 2,
301
+ 0,
302
+ 0,
303
+ 1,
304
+ -1,
305
+ 0,
306
+ 4,
307
+ 0,
308
+ 2,
309
+ 0,
310
+ -1,
311
+ 0,
312
+ 1,
313
+ -1,
314
+ 0,
315
+ 0,
316
+ -1,
317
+ 0,
318
+ 3,
319
+ 2,
320
+ 0,
321
+ 0,
322
+ 3,
323
+ -1,
324
+ -1,
325
+ 0,
326
+ 2,
327
+ 2,
328
+ 0,
329
+ 2,
330
+ -1,
331
+ 0,
332
+ 2,
333
+ 2,
334
+ 1,
335
+ 2,
336
+ -1,
337
+ 3,
338
+ -1,
339
+ -1,
340
+ -1,
341
+ 0,
342
+ -1,
343
+ -1,
344
+ 0,
345
+ 0,
346
+ 4,
347
+ 0,
348
+ 0,
349
+ 0,
350
+ -1,
351
+ 4,
352
+ -1,
353
+ 3,
354
+ 4,
355
+ 4,
356
+ -1,
357
+ -1,
358
+ 0,
359
+ 2,
360
+ 0,
361
+ 0,
362
+ -1,
363
+ 0,
364
+ -1,
365
+ 0,
366
+ 3,
367
+ 3,
368
+ 0,
369
+ -1,
370
+ -1,
371
+ 1,
372
+ 0,
373
+ 3,
374
+ 0,
375
+ -1,
376
+ 1,
377
+ -1,
378
+ -1,
379
+ -1,
380
+ 0,
381
+ 0,
382
+ 1,
383
+ -1,
384
+ -1,
385
+ 2,
386
+ 0,
387
+ -1,
388
+ 0,
389
+ 1,
390
+ 0,
391
+ -1,
392
+ 0,
393
+ 0,
394
+ 1,
395
+ -1,
396
+ 2,
397
+ 0,
398
+ -1,
399
+ -1,
400
+ 4,
401
+ -1,
402
+ 0,
403
+ 0,
404
+ 0,
405
+ -1,
406
+ 1,
407
+ 0,
408
+ -1,
409
+ -1,
410
+ 1,
411
+ 0,
412
+ 4,
413
+ -1,
414
+ -1,
415
+ -1,
416
+ 0,
417
+ -1,
418
+ 0,
419
+ -1,
420
+ -1,
421
+ -1,
422
+ 0,
423
+ -1,
424
+ 1,
425
+ 1,
426
+ -1,
427
+ 0,
428
+ -1,
429
+ -1,
430
+ 2,
431
+ 3,
432
+ 2,
433
+ -1,
434
+ 0,
435
+ -1,
436
+ 1,
437
+ 0,
438
+ 3,
439
+ -1,
440
+ 0,
441
+ 0,
442
+ 4,
443
+ -1,
444
+ -1,
445
+ 0,
446
+ -1,
447
+ -1,
448
+ 0,
449
+ -1,
450
+ 3,
451
+ 0,
452
+ -1,
453
+ 2,
454
+ 4,
455
+ 0,
456
+ 2,
457
+ 0,
458
+ 2,
459
+ 3,
460
+ 0,
461
+ -1,
462
+ -1,
463
+ 0,
464
+ 1,
465
+ 6,
466
+ 0,
467
+ 0,
468
+ -1,
469
+ -1,
470
+ -1,
471
+ 1,
472
+ 0,
473
+ 1,
474
+ -1,
475
+ 0,
476
+ -1,
477
+ 2,
478
+ 3,
479
+ 0,
480
+ 1,
481
+ -1,
482
+ 0,
483
+ 0,
484
+ 4,
485
+ 0,
486
+ 2,
487
+ 1,
488
+ -1,
489
+ 2,
490
+ 1,
491
+ -1,
492
+ 0,
493
+ 1,
494
+ -1,
495
+ 0,
496
+ 3,
497
+ 1,
498
+ 0,
499
+ 0,
500
+ -1,
501
+ 1,
502
+ 0,
503
+ 1,
504
+ 0,
505
+ -1,
506
+ 1,
507
+ 1,
508
+ -1,
509
+ 0,
510
+ 4,
511
+ -1,
512
+ 0,
513
+ 0,
514
+ 0,
515
+ -1,
516
+ 3,
517
+ -1,
518
+ 3,
519
+ -1,
520
+ -1,
521
+ -1,
522
+ -1,
523
+ -1,
524
+ 4,
525
+ 0,
526
+ 2,
527
+ 0,
528
+ 4,
529
+ 0,
530
+ 2,
531
+ 0,
532
+ 0,
533
+ -1,
534
+ 6,
535
+ -1,
536
+ 4,
537
+ 0,
538
+ 4,
539
+ 0,
540
+ 0,
541
+ 0,
542
+ 1,
543
+ -1,
544
+ 2,
545
+ 1,
546
+ 0,
547
+ 0,
548
+ 1,
549
+ 4,
550
+ 0,
551
+ 0,
552
+ -1,
553
+ -1,
554
+ -1,
555
+ 0,
556
+ 2,
557
+ 0,
558
+ -1,
559
+ 3,
560
+ 0,
561
+ -1,
562
+ 0,
563
+ 0,
564
+ 0,
565
+ 0,
566
+ 0,
567
+ 0,
568
+ -1,
569
+ -1,
570
+ 0,
571
+ -1,
572
+ 0,
573
+ -1,
574
+ 4,
575
+ 0,
576
+ 0,
577
+ 0,
578
+ 0,
579
+ -1,
580
+ 0,
581
+ 0,
582
+ 4,
583
+ 0,
584
+ -1,
585
+ 0,
586
+ 0,
587
+ -1,
588
+ 3,
589
+ 2,
590
+ -1,
591
+ -1,
592
+ 2,
593
+ -1,
594
+ -1,
595
+ 0,
596
+ 3,
597
+ 2,
598
+ -1,
599
+ -1,
600
+ -1,
601
+ -1,
602
+ 4,
603
+ 0,
604
+ -1,
605
+ 0,
606
+ 0,
607
+ 3,
608
+ 2,
609
+ -1,
610
+ 0,
611
+ 0,
612
+ 0,
613
+ -1,
614
+ 1,
615
+ 3,
616
+ 4,
617
+ 1,
618
+ 0,
619
+ 1,
620
+ -1,
621
+ 4,
622
+ 0,
623
+ -1,
624
+ 0,
625
+ 2,
626
+ 4,
627
+ 0,
628
+ -1,
629
+ 0,
630
+ -1,
631
+ -1,
632
+ 0,
633
+ 2,
634
+ -1,
635
+ 0,
636
+ 0,
637
+ 2,
638
+ -1,
639
+ 0,
640
+ 0,
641
+ 0,
642
+ 0,
643
+ 0,
644
+ -1,
645
+ 0,
646
+ 0,
647
+ 0,
648
+ 4,
649
+ 1,
650
+ 0,
651
+ 0,
652
+ 1,
653
+ -1,
654
+ 0,
655
+ 5,
656
+ 0,
657
+ -1,
658
+ 4,
659
+ 1,
660
+ 0,
661
+ 4,
662
+ -1,
663
+ -1,
664
+ 4,
665
+ 2,
666
+ -1,
667
+ 4,
668
+ 0,
669
+ -1,
670
+ -1,
671
+ 3,
672
+ 2,
673
+ -1,
674
+ 3,
675
+ 5,
676
+ -1,
677
+ -1,
678
+ -1,
679
+ 0,
680
+ 3,
681
+ 2,
682
+ -1,
683
+ -1,
684
+ 0,
685
+ 2,
686
+ 1,
687
+ 3,
688
+ 0,
689
+ 1,
690
+ 3,
691
+ -1,
692
+ 0,
693
+ -1,
694
+ 4,
695
+ 0,
696
+ 0,
697
+ 1,
698
+ -1,
699
+ -1,
700
+ -1,
701
+ -1,
702
+ -1,
703
+ -1,
704
+ -1,
705
+ 0,
706
+ 0,
707
+ 0,
708
+ -1,
709
+ 3,
710
+ 0,
711
+ -1,
712
+ 1,
713
+ 2,
714
+ -1,
715
+ -1,
716
+ -1,
717
+ -1,
718
+ -1,
719
+ 0,
720
+ 0,
721
+ -1,
722
+ -1,
723
+ -1,
724
+ 0,
725
+ 0,
726
+ 0,
727
+ -1,
728
+ -1,
729
+ 1,
730
+ 0,
731
+ 2,
732
+ 0,
733
+ -1,
734
+ -1,
735
+ 1,
736
+ 0,
737
+ -1,
738
+ 0,
739
+ -1,
740
+ 6,
741
+ -1,
742
+ 6,
743
+ 0,
744
+ 0,
745
+ 3,
746
+ -1,
747
+ 0,
748
+ -1,
749
+ 0,
750
+ 1,
751
+ -1,
752
+ 0,
753
+ -1,
754
+ -1,
755
+ -1,
756
+ 2,
757
+ -1,
758
+ 3,
759
+ -1,
760
+ 0,
761
+ 3,
762
+ 2,
763
+ -1,
764
+ 6,
765
+ 4,
766
+ 4,
767
+ -1,
768
+ 4,
769
+ 3,
770
+ -1,
771
+ 0,
772
+ 0,
773
+ -1,
774
+ -1,
775
+ 6,
776
+ -1,
777
+ 0,
778
+ 2,
779
+ -1,
780
+ 0,
781
+ 0,
782
+ 2,
783
+ 0,
784
+ 0,
785
+ 2,
786
+ -1,
787
+ 0,
788
+ -1,
789
+ -1,
790
+ -1,
791
+ -1,
792
+ 1,
793
+ -1,
794
+ 1,
795
+ 2,
796
+ 2,
797
+ 0,
798
+ -1,
799
+ 2,
800
+ 0,
801
+ -1,
802
+ 1,
803
+ -1,
804
+ -1,
805
+ 1,
806
+ 3,
807
+ -1,
808
+ 0,
809
+ 2,
810
+ -1,
811
+ 2,
812
+ -1,
813
+ 1,
814
+ 1,
815
+ 0,
816
+ 3,
817
+ 2,
818
+ 0,
819
+ 3,
820
+ 3,
821
+ -1,
822
+ 0,
823
+ -1,
824
+ -1,
825
+ 0,
826
+ 0,
827
+ -1,
828
+ 0,
829
+ 1,
830
+ -1,
831
+ 2,
832
+ 4,
833
+ 0,
834
+ -1,
835
+ -1,
836
+ 0,
837
+ 0,
838
+ 0,
839
+ -1,
840
+ 2,
841
+ 0,
842
+ 1,
843
+ -1,
844
+ 4,
845
+ 0,
846
+ 0,
847
+ 0,
848
+ 0,
849
+ -1,
850
+ 4,
851
+ 0,
852
+ -1,
853
+ -1,
854
+ -1,
855
+ 3,
856
+ 3,
857
+ 2,
858
+ 0,
859
+ 1,
860
+ -1,
861
+ 0,
862
+ -1,
863
+ -1,
864
+ -1,
865
+ 0,
866
+ 0,
867
+ 4,
868
+ 2,
869
+ 0,
870
+ -1,
871
+ -1,
872
+ -1,
873
+ 3,
874
+ -1,
875
+ 0,
876
+ -1,
877
+ 4,
878
+ 2,
879
+ 0,
880
+ 0,
881
+ 0,
882
+ -1,
883
+ 1,
884
+ 0,
885
+ 0,
886
+ 2,
887
+ -1,
888
+ 0,
889
+ 3,
890
+ 4,
891
+ -1,
892
+ 2,
893
+ -1,
894
+ 4,
895
+ -1,
896
+ 0,
897
+ 2,
898
+ 0,
899
+ -1,
900
+ -1,
901
+ -1,
902
+ -1,
903
+ 0,
904
+ 0,
905
+ 5,
906
+ 2,
907
+ -1,
908
+ 0,
909
+ 0,
910
+ 0,
911
+ -1,
912
+ 1,
913
+ -1,
914
+ 2,
915
+ 2,
916
+ 0,
917
+ 0,
918
+ 3,
919
+ 0,
920
+ 0,
921
+ 6,
922
+ 0,
923
+ 0,
924
+ 2,
925
+ -1,
926
+ -1,
927
+ -1,
928
+ -1,
929
+ -1,
930
+ -1,
931
+ -1,
932
+ 3,
933
+ -1,
934
+ -1,
935
+ -1,
936
+ -1,
937
+ 0,
938
+ 0,
939
+ 3,
940
+ -1,
941
+ -1,
942
+ 0,
943
+ -1,
944
+ 0,
945
+ 6,
946
+ -1,
947
+ 0,
948
+ 1,
949
+ 1,
950
+ 1,
951
+ 1,
952
+ 0,
953
+ 2,
954
+ 1,
955
+ 1,
956
+ -1,
957
+ 1,
958
+ -1,
959
+ -1,
960
+ 0,
961
+ 0,
962
+ -1,
963
+ -1,
964
+ 0,
965
+ 0,
966
+ 1,
967
+ 0,
968
+ -1,
969
+ -1,
970
+ -1,
971
+ 6,
972
+ -1,
973
+ 3,
974
+ -1,
975
+ -1,
976
+ 3,
977
+ 0,
978
+ 0,
979
+ 0,
980
+ 3,
981
+ 0,
982
+ 0,
983
+ -1,
984
+ 2,
985
+ 0,
986
+ -1,
987
+ -1,
988
+ 0,
989
+ -1,
990
+ -1,
991
+ 3,
992
+ 2,
993
+ 0,
994
+ -1,
995
+ 0,
996
+ 1,
997
+ -1,
998
+ -1,
999
+ 0,
1000
+ -1,
1001
+ -1,
1002
+ 1,
1003
+ 2,
1004
+ -1,
1005
+ 0,
1006
+ -1,
1007
+ -1,
1008
+ -1,
1009
+ 0,
1010
+ -1,
1011
+ 6,
1012
+ -1,
1013
+ -1,
1014
+ -1,
1015
+ 2,
1016
+ -1,
1017
+ -1,
1018
+ -1,
1019
+ 0,
1020
+ -1,
1021
+ -1,
1022
+ 0,
1023
+ 1,
1024
+ -1,
1025
+ 0,
1026
+ -1,
1027
+ -1,
1028
+ -1,
1029
+ -1,
1030
+ 0,
1031
+ 3,
1032
+ -1,
1033
+ -1,
1034
+ -1,
1035
+ 2,
1036
+ 2,
1037
+ -1,
1038
+ 0,
1039
+ 0,
1040
+ 0,
1041
+ -1,
1042
+ 0,
1043
+ 0,
1044
+ 1,
1045
+ 2,
1046
+ -1,
1047
+ -1,
1048
+ 1,
1049
+ 0,
1050
+ 2,
1051
+ 1,
1052
+ 0,
1053
+ 1,
1054
+ 1,
1055
+ 0,
1056
+ 0,
1057
+ -1,
1058
+ 6,
1059
+ 2,
1060
+ -1,
1061
+ -1,
1062
+ 5,
1063
+ -1,
1064
+ -1,
1065
+ -1,
1066
+ 6,
1067
+ -1,
1068
+ 2,
1069
+ -1,
1070
+ 0,
1071
+ 0,
1072
+ -1,
1073
+ -1,
1074
+ -1,
1075
+ 0,
1076
+ -1,
1077
+ -1,
1078
+ 0,
1079
+ -1,
1080
+ 1,
1081
+ -1,
1082
+ 5,
1083
+ 0,
1084
+ 6,
1085
+ -1,
1086
+ -1,
1087
+ -1,
1088
+ 0,
1089
+ 0,
1090
+ -1,
1091
+ 0,
1092
+ -1,
1093
+ 0,
1094
+ 2,
1095
+ 0,
1096
+ -1,
1097
+ 0,
1098
+ -1,
1099
+ 1,
1100
+ 0,
1101
+ 3,
1102
+ 0,
1103
+ 2,
1104
+ 1,
1105
+ -1,
1106
+ -1,
1107
+ 0,
1108
+ 2,
1109
+ -1,
1110
+ 0,
1111
+ 1,
1112
+ -1,
1113
+ 0,
1114
+ 0,
1115
+ -1,
1116
+ 0,
1117
+ -1,
1118
+ 1,
1119
+ 6,
1120
+ 6,
1121
+ 0,
1122
+ -1,
1123
+ -1,
1124
+ -1,
1125
+ -1,
1126
+ 6,
1127
+ 6,
1128
+ -1,
1129
+ 1,
1130
+ 1,
1131
+ 1,
1132
+ 0,
1133
+ -1,
1134
+ -1,
1135
+ -1,
1136
+ 1,
1137
+ 1,
1138
+ -1,
1139
+ 1,
1140
+ 1,
1141
+ -1,
1142
+ 1,
1143
+ -1,
1144
+ 0,
1145
+ -1,
1146
+ 1,
1147
+ -1,
1148
+ -1,
1149
+ 6,
1150
+ -1,
1151
+ 1,
1152
+ -1,
1153
+ 6,
1154
+ 0,
1155
+ 0,
1156
+ 1,
1157
+ -1,
1158
+ 4,
1159
+ 1,
1160
+ 1,
1161
+ 1,
1162
+ 0,
1163
+ -1,
1164
+ 5,
1165
+ 0,
1166
+ -1,
1167
+ -1,
1168
+ -1,
1169
+ 0,
1170
+ -1,
1171
+ -1,
1172
+ 1,
1173
+ 1,
1174
+ 0,
1175
+ 1,
1176
+ 2,
1177
+ 1,
1178
+ -1,
1179
+ 1,
1180
+ 1,
1181
+ 1,
1182
+ 0,
1183
+ 0,
1184
+ -1,
1185
+ 6,
1186
+ 6,
1187
+ 1,
1188
+ 1,
1189
+ 1,
1190
+ 0,
1191
+ -1,
1192
+ 1,
1193
+ -1,
1194
+ 1,
1195
+ -1,
1196
+ -1,
1197
+ 1,
1198
+ 1,
1199
+ 5,
1200
+ 1,
1201
+ 1,
1202
+ 3,
1203
+ 1,
1204
+ -1,
1205
+ -1,
1206
+ 6,
1207
+ 1,
1208
+ 6,
1209
+ 1,
1210
+ 5,
1211
+ -1,
1212
+ 1,
1213
+ 5,
1214
+ 1,
1215
+ -1,
1216
+ 1,
1217
+ 1,
1218
+ 1,
1219
+ 0,
1220
+ 6,
1221
+ 1,
1222
+ 1,
1223
+ 1,
1224
+ 1,
1225
+ 1,
1226
+ 0,
1227
+ 1,
1228
+ -1,
1229
+ 1,
1230
+ -1,
1231
+ -1,
1232
+ -1,
1233
+ 6,
1234
+ 1,
1235
+ 5,
1236
+ -1,
1237
+ 1,
1238
+ 1,
1239
+ 0,
1240
+ -1,
1241
+ 1,
1242
+ -1,
1243
+ 0,
1244
+ 1,
1245
+ 1,
1246
+ 5,
1247
+ 6,
1248
+ -1,
1249
+ -1,
1250
+ 1,
1251
+ -1,
1252
+ -1,
1253
+ -1,
1254
+ 0,
1255
+ -1,
1256
+ 5,
1257
+ -1,
1258
+ 1,
1259
+ 1,
1260
+ 5,
1261
+ 0,
1262
+ 5,
1263
+ 5,
1264
+ -1,
1265
+ -1,
1266
+ 1,
1267
+ 0,
1268
+ 5,
1269
+ 5,
1270
+ -1,
1271
+ 5,
1272
+ -1,
1273
+ -1,
1274
+ 1,
1275
+ 5,
1276
+ 5,
1277
+ 5,
1278
+ 1,
1279
+ 5,
1280
+ 1,
1281
+ -1,
1282
+ 5,
1283
+ 1,
1284
+ 5,
1285
+ 0,
1286
+ -1,
1287
+ 1,
1288
+ 0,
1289
+ 5,
1290
+ 1,
1291
+ 1,
1292
+ 1,
1293
+ 1,
1294
+ 5,
1295
+ 1,
1296
+ 0,
1297
+ -1,
1298
+ 1,
1299
+ 5,
1300
+ 0,
1301
+ 5,
1302
+ 5,
1303
+ 1,
1304
+ 5,
1305
+ 5,
1306
+ 0,
1307
+ 5,
1308
+ -1,
1309
+ 0,
1310
+ 5,
1311
+ -1,
1312
+ -1,
1313
+ 0,
1314
+ 1,
1315
+ 1,
1316
+ -1,
1317
+ 6,
1318
+ -1,
1319
+ 1,
1320
+ 5,
1321
+ -1,
1322
+ 5,
1323
+ 5,
1324
+ 6,
1325
+ 1,
1326
+ 1,
1327
+ 1,
1328
+ 6,
1329
+ 1,
1330
+ 1,
1331
+ 5,
1332
+ 5,
1333
+ 1,
1334
+ 0,
1335
+ 1,
1336
+ -1,
1337
+ 5,
1338
+ 1,
1339
+ 2,
1340
+ 1,
1341
+ 1,
1342
+ 1,
1343
+ 2,
1344
+ 1,
1345
+ 4,
1346
+ 4,
1347
+ 2,
1348
+ -1,
1349
+ 1,
1350
+ 0,
1351
+ 6,
1352
+ -1,
1353
+ 5,
1354
+ 1,
1355
+ 1,
1356
+ -1,
1357
+ -1,
1358
+ -1,
1359
+ 5,
1360
+ 1,
1361
+ -1,
1362
+ 5,
1363
+ -1,
1364
+ 5,
1365
+ 1,
1366
+ 0,
1367
+ 3,
1368
+ 1,
1369
+ -1,
1370
+ -1,
1371
+ -1,
1372
+ 5,
1373
+ 1,
1374
+ 1,
1375
+ 4,
1376
+ -1,
1377
+ 1,
1378
+ 1,
1379
+ 1,
1380
+ 1,
1381
+ -1,
1382
+ 1,
1383
+ 5,
1384
+ 0,
1385
+ 1,
1386
+ 1,
1387
+ -1,
1388
+ 0,
1389
+ 0,
1390
+ 0,
1391
+ -1,
1392
+ 0,
1393
+ 4,
1394
+ 0,
1395
+ -1,
1396
+ -1,
1397
+ -1,
1398
+ -1,
1399
+ 0,
1400
+ 4,
1401
+ 0,
1402
+ -1,
1403
+ 0,
1404
+ -1,
1405
+ 3,
1406
+ 1,
1407
+ 1,
1408
+ 1,
1409
+ -1,
1410
+ -1,
1411
+ -1,
1412
+ -1,
1413
+ 1,
1414
+ 1,
1415
+ 1,
1416
+ 1,
1417
+ -1,
1418
+ -1,
1419
+ 6,
1420
+ -1,
1421
+ 0,
1422
+ 1,
1423
+ 0,
1424
+ -1,
1425
+ 5,
1426
+ 5,
1427
+ 4,
1428
+ 5,
1429
+ 5,
1430
+ 0,
1431
+ 3,
1432
+ 0,
1433
+ -1,
1434
+ 0,
1435
+ -1,
1436
+ -1,
1437
+ -1,
1438
+ -1,
1439
+ -1,
1440
+ -1,
1441
+ -1,
1442
+ 6,
1443
+ 4,
1444
+ -1,
1445
+ 6,
1446
+ -1,
1447
+ 6,
1448
+ -1,
1449
+ -1,
1450
+ -1,
1451
+ 0,
1452
+ 0,
1453
+ 6,
1454
+ -1,
1455
+ 0,
1456
+ -1,
1457
+ 0,
1458
+ -1,
1459
+ 0,
1460
+ 6,
1461
+ 3,
1462
+ 6,
1463
+ 0,
1464
+ 0,
1465
+ -1,
1466
+ 0,
1467
+ 0,
1468
+ -1,
1469
+ 1,
1470
+ -1,
1471
+ -1,
1472
+ 6,
1473
+ -1,
1474
+ -1,
1475
+ 0,
1476
+ -1,
1477
+ -1,
1478
+ 0,
1479
+ -1,
1480
+ 5,
1481
+ 2,
1482
+ -1,
1483
+ -1,
1484
+ 6,
1485
+ 1,
1486
+ 1,
1487
+ 0,
1488
+ -1,
1489
+ 0,
1490
+ -1,
1491
+ -1,
1492
+ 0,
1493
+ 0,
1494
+ 6
1495
+ ],
1496
+ "topic_sizes": {
1497
+ "-1": 458,
1498
+ "4": 53,
1499
+ "0": 366,
1500
+ "2": 94,
1501
+ "3": 62,
1502
+ "1": 191,
1503
+ "6": 40,
1504
+ "5": 50
1505
+ },
1506
+ "topic_mapper": [
1507
+ [
1508
+ -1,
1509
+ -1,
1510
+ -1,
1511
+ -1
1512
+ ],
1513
+ [
1514
+ 0,
1515
+ 0,
1516
+ 3,
1517
+ 5
1518
+ ],
1519
+ [
1520
+ 1,
1521
+ 1,
1522
+ 2,
1523
+ 3
1524
+ ],
1525
+ [
1526
+ 2,
1527
+ 2,
1528
+ 1,
1529
+ 0
1530
+ ],
1531
+ [
1532
+ 3,
1533
+ 3,
1534
+ 4,
1535
+ 4
1536
+ ],
1537
+ [
1538
+ 4,
1539
+ 4,
1540
+ 4,
1541
+ 4
1542
+ ],
1543
+ [
1544
+ 5,
1545
+ 5,
1546
+ 2,
1547
+ 3
1548
+ ],
1549
+ [
1550
+ 6,
1551
+ 6,
1552
+ 0,
1553
+ 2
1554
+ ],
1555
+ [
1556
+ 7,
1557
+ 7,
1558
+ 0,
1559
+ 2
1560
+ ],
1561
+ [
1562
+ 8,
1563
+ 8,
1564
+ 0,
1565
+ 2
1566
+ ],
1567
+ [
1568
+ 9,
1569
+ 9,
1570
+ 0,
1571
+ 2
1572
+ ],
1573
+ [
1574
+ 10,
1575
+ 10,
1576
+ 1,
1577
+ 0
1578
+ ],
1579
+ [
1580
+ 11,
1581
+ 11,
1582
+ 1,
1583
+ 0
1584
+ ],
1585
+ [
1586
+ 12,
1587
+ 12,
1588
+ 1,
1589
+ 0
1590
+ ],
1591
+ [
1592
+ 13,
1593
+ 13,
1594
+ 1,
1595
+ 0
1596
+ ],
1597
+ [
1598
+ 14,
1599
+ 14,
1600
+ 1,
1601
+ 0
1602
+ ],
1603
+ [
1604
+ 15,
1605
+ 15,
1606
+ 5,
1607
+ 1
1608
+ ],
1609
+ [
1610
+ 16,
1611
+ 16,
1612
+ 1,
1613
+ 0
1614
+ ],
1615
+ [
1616
+ 17,
1617
+ 17,
1618
+ 1,
1619
+ 0
1620
+ ],
1621
+ [
1622
+ 18,
1623
+ 18,
1624
+ 1,
1625
+ 0
1626
+ ],
1627
+ [
1628
+ 19,
1629
+ 19,
1630
+ 5,
1631
+ 1
1632
+ ],
1633
+ [
1634
+ 20,
1635
+ 20,
1636
+ 5,
1637
+ 1
1638
+ ],
1639
+ [
1640
+ 21,
1641
+ 21,
1642
+ 5,
1643
+ 1
1644
+ ],
1645
+ [
1646
+ 22,
1647
+ 22,
1648
+ 6,
1649
+ 6
1650
+ ],
1651
+ [
1652
+ 23,
1653
+ 23,
1654
+ 6,
1655
+ 6
1656
+ ]
1657
+ ],
1658
+ "topic_labels": {
1659
+ "-1": "-1_order_property_inspection_condition",
1660
+ "0": "0_damaged_sign_sidewalk_missing",
1661
+ "1": "1_license_complaint_illegal_violation",
1662
+ "2": "2_water_basin_litter_missed",
1663
+ "3": "3_noise_animal_truck_dead",
1664
+ "4": "4_odor_food_air_smoke",
1665
+ "5": "5_english_emergency_spanish_chinese",
1666
+ "6": "6_exemption_commercial_tax_business"
1667
+ },
1668
+ "custom_labels": null,
1669
+ "_outliers": 1,
1670
+ "topic_aspects": {}
1671
+ }
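topics.json above ends with the per-topic metadata: topic sizes, the topic mapper, and auto-generated labels for each topic (topic -1 is the outlier bucket BERTopic reserves for unassigned documents). A short sketch, grounded directly in the keys visible above, for inspecting those fields:

```python
import json

with open("models/BERTopic/topics.json") as f:
    topics = json.load(f)

# Print each topic's label together with how many documents it covers.
for topic_id, label in topics["topic_labels"].items():
    size = topics["topic_sizes"].get(topic_id, 0)
    print(f"topic {topic_id} ({size} docs): {label}")
```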
models/final_model.json ADDED
The diff for this file is too large to render. See raw diff
 
reports/311_data_1.html ADDED
The diff for this file is too large to render. See raw diff
 
reports/weather_data_after2016_ts.html ADDED
The diff for this file is too large to render. See raw diff
 
reports/weather_data_ts.html ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,26 @@
+ beautifulsoup4==4.12.3
+ bertopic==0.16.1
+ bs4==0.0.2
+ bokeh==3.4.1
+ darts==0.29.0
+ folium==0.16.0
+ gradio==4.27.0
+ ipykernel==6.29.4
+ ipywidgets==8.1.2
+ jupyterlab==4.1.8
+ matplotlib==3.8.4
+ nbformat==5.10.4
+ nltk==3.8.1
+ numpy==1.26.4
+ openpyxl==3.1.2
+ pandas==2.2.2
+ plotly==5.21.0
+ polars==0.20.21
+ prophet==1.1.5
+ pyarrow==16.0.0
+ scikit-learn==1.4.2
+ scipy==1.13.0
+ seaborn==0.13.2
+ --extra-index-url https://download.pytorch.org/whl/cu121
+ torch==2.2.2
+ xgboost==2.0.3
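
Note that the `--extra-index-url` line makes pip resolve `torch==2.2.2` from the CUDA 12.1 wheel index. A quick post-install sanity check (a sketch; GPU availability depends on your machine):

```python
import torch

print(torch.__version__)           # expect a 2.2.2 build
# True only when a CUDA-capable GPU is visible; the XGBoost helper in
# utils.py checks this same flag and falls back to CPU otherwise
print(torch.cuda.is_available())
```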
utils.py ADDED
@@ -0,0 +1,1028 @@
+ import pandas as pd
+ import polars as pl
+ import numpy as np
+ import json
+ import gc
+ import folium
+ import html
+ import gradio as gr  # needed by plot_timeseries() below; missing from the original import block
+ from matplotlib import pyplot as plt
+ import seaborn as sns
+ import xgboost as xgb
+ from xgboost import plot_importance
+ from bs4 import BeautifulSoup
+ import plotly.express as px
+ import plotly.graph_objects as go
+ import plotly.figure_factory as ff
+ from plotly.subplots import make_subplots
+ import plotly.io as pio
+ from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
+ from statsmodels.tsa.stattools import kpss, adfuller
+ from bertopic import BERTopic
+ from collections import defaultdict
+
+ color_pal = sns.color_palette("tab10")
+
+ impute_cols = [
+     'MeanTemp', 'MinTemp', 'MaxTemp', 'DewPoint',
+     'Percipitation', 'WindSpeed', 'MaxSustainedWind',
+     'Gust', 'Rain', 'SnowDepth', 'SnowIce',
+ ]
+
+ def convert_schema_to_polars(schema):
+     pl_schema = {}
+     for k, v in schema.items():
+         if v == "String":
+             pl_schema[k] = pl.String
+         elif v == "Float64":
+             pl_schema[k] = pl.Float64
+         elif v == "Int64":
+             pl_schema[k] = pl.Int64
+     return pl_schema
+
+
+ def create_datetime(data, dt_col, format="%m/%d/%Y %I:%M:%S %p"):
+     # data is either a pandas or a polars DataFrame
+     df_type = "pandas" if isinstance(data, pd.DataFrame) else "polars"
+     if "datetime" in str(data[dt_col].dtype).lower():
+         return data
+
+     if df_type == "pandas":
+         data[dt_col] = pd.to_datetime(data[dt_col], format=format)
+     elif df_type == "polars":
+         data = data.with_columns(
+             pl.col(dt_col).str.strptime(pl.Date, format=format).cast(pl.Datetime)
+         )
+
+     return data
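+
+ # Example usage (hypothetical frame, a minimal sketch):
+ #   df = pd.DataFrame({"Created Date": ["01/02/2018 10:30:00 AM"]})
+ #   df = create_datetime(df, "Created Date")  # column parsed to datetime64[ns]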
+
+
+ def create_seasons(data, dt_col="Datetime", out_col="Season", prefix=""):
+     df_type = "pandas" if isinstance(data, pd.DataFrame) else "polars"
+     out_col = prefix + out_col
+     spring_start = pd.to_datetime("2018-3-20", format="%Y-%m-%d").dayofyear
+     summer_start = pd.to_datetime("2018-6-21", format="%Y-%m-%d").dayofyear
+     autumn_start = pd.to_datetime("2018-9-22", format="%Y-%m-%d").dayofyear
+     winter_start = pd.to_datetime("2018-12-21", format="%Y-%m-%d").dayofyear
+
+     if df_type == "pandas":
+         def map_season(date):
+             if date.dayofyear < spring_start or date.dayofyear >= winter_start:
+                 return "Winter"
+             elif date.dayofyear >= spring_start and date.dayofyear < summer_start:
+                 return "Spring"
+             elif date.dayofyear >= summer_start and date.dayofyear < autumn_start:
+                 return "Summer"
+             elif date.dayofyear >= autumn_start and date.dayofyear < winter_start:
+                 return "Autumn"
+         data[out_col] = data[dt_col].apply(map_season)
+         return data
+
+     elif df_type == "polars":
+         def map_season(date):
+             if date.timetuple().tm_yday < spring_start or date.timetuple().tm_yday >= winter_start:
+                 return "Winter"
+             elif date.timetuple().tm_yday >= spring_start and date.timetuple().tm_yday < summer_start:
+                 return "Spring"
+             elif date.timetuple().tm_yday >= summer_start and date.timetuple().tm_yday < autumn_start:
+                 return "Summer"
+             elif date.timetuple().tm_yday >= autumn_start and date.timetuple().tm_yday < winter_start:
+                 return "Autumn"
+
+         data = data.with_columns(
+             pl.col(dt_col).map_elements(map_season, return_dtype=pl.String).alias(out_col)
+         )
+         return data
+
+
+ def create_weekend(data, dt_col="Datetime", out_col="is_weekend", prefix=""):
+     df_type = "pandas" if isinstance(data, pd.DataFrame) else "polars"
+     out_col = prefix + out_col
+
+     if df_type == "pandas":
+         data[out_col] = (data[dt_col].dt.weekday.isin([5, 6])).astype(np.int8)
+
+     elif df_type == "polars":
+         data = data.with_columns(
+             pl.col(dt_col).dt.weekday().is_in([6, 7]).cast(pl.Int8).alias(out_col)
+         )
+
+     return data
+
+
+ def create_holidays(data, dt_col="Datetime", out_col="is_holiday", prefix=""):
+     df_type = "pandas" if isinstance(data, pd.DataFrame) else "polars"
+     out_col = prefix + out_col
+
+     # New Year's is deliberately excluded, as I expect it to have its own distinct effect
+     HOLIDAYS = [
+         pd.to_datetime("2016-01-18"), pd.to_datetime("2016-02-15"),
+         pd.to_datetime("2016-05-30"), pd.to_datetime("2016-07-04"), pd.to_datetime("2016-09-05"),
+         pd.to_datetime("2016-10-10"), pd.to_datetime("2016-11-11"), pd.to_datetime("2016-11-24"),
+         # Christmas is variable (depends on which day is the observed holiday vs. which day is Xmas)
+         pd.to_datetime("2016-12-24"), pd.to_datetime("2016-12-25"), pd.to_datetime("2016-12-26"),
+
+         pd.to_datetime("2017-01-16"), pd.to_datetime("2017-02-20"),
+         pd.to_datetime("2017-05-29"), pd.to_datetime("2017-07-04"), pd.to_datetime("2017-09-04"),
+         pd.to_datetime("2017-10-09"), pd.to_datetime("2017-11-10"), pd.to_datetime("2017-11-23"),
+         pd.to_datetime("2017-12-24"), pd.to_datetime("2017-12-25"),
+
+         pd.to_datetime("2018-01-15"), pd.to_datetime("2018-02-19"),
+         pd.to_datetime("2018-05-28"), pd.to_datetime("2018-07-04"), pd.to_datetime("2018-09-03"),
+         pd.to_datetime("2018-10-08"), pd.to_datetime("2018-11-12"), pd.to_datetime("2018-11-22"),
+         pd.to_datetime("2018-12-24"), pd.to_datetime("2018-12-25"),
+     ]
+
+     if df_type == "pandas":
+         data[out_col] = (data[dt_col].isin(HOLIDAYS)).astype(np.int8)
+
+     elif df_type == "polars":
+         data = data.with_columns(
+             pl.col(dt_col).dt.datetime().is_in(HOLIDAYS).cast(pl.Int8).alias(out_col)
+         )
+     return data
+
+
+ def build_temporal_features(data, dt_col, prefix=""):
+     df_type = "pandas" if isinstance(data, pd.DataFrame) else "polars"
+     if df_type == "pandas" and data.index.name == dt_col:
+         data = data.reset_index()
+
+     if df_type == "pandas":
+         data[prefix+"Year"] = data[dt_col].dt.year.astype(np.int16)
+         data[prefix+"Month"] = data[dt_col].dt.month.astype(np.int8)
+         data[prefix+"Day"] = data[dt_col].dt.day.astype(np.int8)
+         data[prefix+"DayOfYear"] = data[dt_col].dt.dayofyear.astype(np.int16)
+         data[prefix+"DayOfWeek"] = data[dt_col].dt.dayofweek.astype(np.int8)
+     else:
+         data = data.with_columns(**{
+             prefix+"Year": pl.col(dt_col).dt.year().cast(pl.Int16),
+             prefix+"Month": pl.col(dt_col).dt.month().cast(pl.Int8),
+             prefix+"Day": pl.col(dt_col).dt.day().cast(pl.Int8),
+             prefix+"DayOfYear": pl.col(dt_col).dt.ordinal_day().cast(pl.Int16),
+             prefix+"DayOfWeek": pl.col(dt_col).dt.weekday().cast(pl.Int8)
+         })
+
+     data = create_seasons(data, dt_col, prefix=prefix)
+     data = create_weekend(data, dt_col, prefix=prefix)
+     data = create_holidays(data, dt_col, prefix=prefix)
+     return data
+
+
+ def agg_and_merge_historical(curr_df, hist_df, col, agg_cols=[], ops=["mean", "max", "min"]):
+     merge_dict = {}
+     for agg_col in agg_cols:
+         describe_tb = hist_df.groupby(col)[agg_col].describe().reset_index()
+         if col not in merge_dict:
+             merge_dict[col] = describe_tb[col].values
+         for op in ops:
+             merge_col_name = "historical_" + col + "_" + op + "_" + agg_col
+             if op == "mean":
+                 merge_dict[merge_col_name] = describe_tb["mean"].values
+             elif op == "max":
+                 merge_dict[merge_col_name] = describe_tb["max"].values
+             elif op == "min":
+                 merge_dict[merge_col_name] = describe_tb["min"].values
+             elif op == "median":
+                 merge_dict[merge_col_name] = describe_tb["50%"].values
+             elif op == "std":
+                 merge_dict[merge_col_name] = describe_tb["std"].values
+
+     merge_df = pd.merge(curr_df, pd.DataFrame(merge_dict), on=col, how="left")
+     return merge_df
+
+
+ def map_vals(data, cols=["Latitude", "Longitude"], label_cols=[], color="red", submap=None, weight=3, radius=1, sample_size=10000, start_loc=[42.1657, -74.9481], zoom_start=6):
+     cols = list(cols)  # copy, so extending below never mutates the shared default list
+     df_type = "pandas" if isinstance(data, pd.DataFrame) or isinstance(data, pd.Series) else "polars"
+     fig = folium.Figure(height=500, width=750)
+
+     if submap is None:
+         map_nyc = folium.Map(
+             location=start_loc,
+             zoom_start=zoom_start,
+             tiles='cartodbpositron',
+             zoom_control=False,
+             scrollWheelZoom=False,
+             dragging=False
+         )
+     else:
+         map_nyc = submap
+
+     cols.extend(label_cols)
+     if df_type == "pandas":
+         for idx, row in data.loc[:, cols].dropna().sample(sample_size).iterrows():
+             label = ""
+             lat, long = row.iloc[0,], row.iloc[1,]
+             for i, label_col in enumerate(label_cols):
+                 label += label_col + ": " + str(row.iloc[2+i,]) + "\n"
+
+             label_params = {"popup": label, "tooltip": label} if len(label_cols) > 0 else {}
+             folium.CircleMarker(location=[lat, long], radius=radius, weight=weight, color=color, fill_color=color, fill_opacity=0.7, **label_params).add_to(map_nyc)
+     else:
+         for row in data[:, cols].drop_nulls().sample(sample_size).rows():
+             label = ""
+             lat, long = row[0], row[1]
+             for i, label_col in enumerate(label_cols):
+                 label += label_col + ": " + str(row[2+i]) + "\n"
+
+             label_params = {"popup": label, "tooltip": label} if len(label_cols) > 0 else {}
+             folium.CircleMarker(location=[lat, long], radius=radius, weight=weight, color=color, fill_color=color, fill_opacity=0.7, **label_params).add_to(map_nyc)
+
+     fig.add_child(map_nyc)
+     return fig, map_nyc
+
+
+ def find_variable_data(soup, curr_var="Created Date"):
+     src = "<!doctype html>"
+     # HTML and head start
+     src += "<html lang=\"en\">"
+     src += str(soup.find("head"))
+
+     # Body -> content -> container -> row -> variable
+     src += "<body style=\"background-color: var(--table-odd-background-fill); padding-top: 20px;\">"
+     src += "<div class=\"content\" style=\"padding-left: 150px; padding-right: 150px; border: 0px !important; \">"
+     # src += "<div class=\"container\">"
+     src += "<div class=\"section-items\" style=\"background-color: white;\">"
+     # src += "<div class=\"row spacing\">"
+     variables_html = soup.find_all("div", class_="variable")
+     for var_html in variables_html:
+         if var_html.text[:len(curr_var)] == curr_var:
+             parent = var_html.parent
+             parent['style'] = "border: 0px"
+             src += str(parent)
+             break
+
+     src += "</div></div>"
+
+     # Scripts
+     for script in soup.find_all("script"):
+         src += str(script)
+
+     # End
+     src += "</body>"
+     src += "</html>"
+
+     # src = BeautifulSoup(src, 'html.parser').prettify()
+     src_doc = html.escape(src)
+     iframe = f'<iframe width="100%" height="1200px" srcdoc="{src_doc}" frameborder="0"></iframe>'
+     return iframe, src_doc
+
+
+ def plot_autocorr(data, col, apply=None):
+     time_series = data.loc[:, col].to_frame().copy()
+     if apply:
+         time_series[col] = time_series[col].apply(apply)
+     fig, ax = plt.subplots(2, 1, figsize=(12, 8))
+     _ = plot_acf(time_series[col], lags=30, ax=ax[0])
+     _ = plot_pacf(time_series[col], lags=30, method="ols-adjusted", ax=ax[1])
+     _ = plt.suptitle(f"{col}", y=0.95)
+     return fig
+
+
+ def adf_test(timeseries):
+     dftest = adfuller(timeseries, autolag='AIC')
+     dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', 'Lags Used', 'Number of Observations Used'])
+     dfoutput['Number of Observations Used'] = dfoutput['Number of Observations Used'].astype(np.int64)
+     for key, value in dftest[4].items():
+         dfoutput['Critical Value (%s)' % key] = value
+     return dfoutput
+
+
+ def kpss_test(timeseries):
+     kpsstest = kpss(timeseries, regression='ct')
+     kpss_output = pd.Series(kpsstest[0:3], index=['Test Statistic', 'p-value', 'Lags Used'])
+     for key, value in kpsstest[3].items():
+         kpss_output['Critical Value (%s)' % key] = value
+     return kpss_output
+
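+ # Note: the two tests have opposite null hypotheses. ADF's null is a unit
+ # root (rejecting => stationary); KPSS's null is trend-stationarity
+ # (rejecting => non-stationary). test_stationary() below therefore colors
+ # the two columns with inverted logic.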
+
+ def test_stationary(data, var):
+     adf_df = adf_test(data[var].dropna())
+     kpss_df = kpss_test(data[var].dropna())
+     result_df = adf_df.to_frame(name="Augmented-Dickey-Fuller")
+     result_df["KPSS Test"] = kpss_df
+
+     def pass_hypothesis(col):
+         test_stat, p_val = col.iloc[0], col.iloc[1]
+         one_p, five_p, ten_p = col.iloc[4], col.iloc[5], col.iloc[6]
+         if col.name == "KPSS Test":
+             # KPSS: red shades flag evidence of non-stationarity
+             if test_stat < one_p and p_val < 0.01:
+                 color_fmt = ["background-color: #fc5749; font-weight: bold; color: black"]
+             elif test_stat < five_p and p_val < 0.05:
+                 color_fmt = ["background-color: #F88379; font-weight: bold; color: black"]
+             elif test_stat < ten_p and p_val < 0.1:
+                 color_fmt = ["background-color: #ff9f96; font-weight: bold; color: black"]
+             else:
+                 color_fmt = ["background-color: green; font-weight: bold; color: black"]
+         else:
+             # ADF: green shades flag evidence of stationarity
+             if test_stat < one_p and p_val < 0.01:
+                 color_fmt = ["background-color: green; font-weight: bold; color: black"]
+             elif test_stat < five_p and p_val < 0.05:
+                 color_fmt = ["background-color: greenyellow; font-weight: bold; color: black"]
+             elif test_stat < ten_p and p_val < 0.1:
+                 color_fmt = ["background-color: lightgreen; font-weight: bold; color: black"]
+             else:
+                 color_fmt = ["background-color: #fc5749; font-weight: bold; color: black"]
+
+         color_fmt.extend(['' for _ in col[1:]])
+         return color_fmt
+
+     result_df.loc["Lags Used", :] = result_df.loc["Lags Used", :].astype(np.int32)
+     return result_df.style.apply(pass_hypothesis)
+
+
+ def plot_timeseries(data, var, data_name="My", all_vars=[], height=800, width=600, start_date="2017-12-31", end_date="2018-12-31"):
+     if var == "":
+         return gr.update()  # no variable selected: leave the Gradio plot unchanged
+
+     fig = go.Figure()
+     fig.add_trace(
+         go.Scatter(
+             x=data.index,
+             y=data[var],
+             name=var,
+             customdata=np.dstack((data["Season"].to_numpy(), data.reset_index()["Datetime"].dt.day_name().to_numpy(), data["is_holiday"].astype(bool).to_numpy()))[0],
+             hovertemplate='<br>value:%{y:.3f} <br>Season: %{customdata[0]} <br>Weekday: %{customdata[1]} <br>Is Holiday: %{customdata[2]}',
+         )
+     )
+     fig.update_layout(
+         autosize=True,
+         title=f"{data_name} Time Series by {var}",
+         xaxis_title='Date',
+         yaxis_title=var,
+         hovermode='x unified',
+     )
+
+     fig.update_layout(
+         autosize=True,
+         xaxis=dict(
+             rangeselector=dict(
+                 buttons=list([
+                     dict(count=7, label="1w", step="day", stepmode="backward"),
+                     dict(count=21, label="3w", step="day", stepmode="backward"),
+                     dict(count=1, label="1m", step="month", stepmode="backward"),
+                     dict(count=6, label="6m", step="month", stepmode="backward"),
+                     dict(count=1, label="1y", step="year", stepmode="backward"),
+                     dict(step="all")
+                 ])
+             ),
+             rangeslider=dict(
+                 visible=True,
+             ),
+             type="date",
+             range=(start_date, end_date),
+         ),
+     )
+     return fig
+
+
+ def plot_bivariate(data, x, y, subset=None, trendline=True):
+     title = f"Scatterplot of {x} vs. {y}"
+
+     if subset == "None" or subset is None:
+         subset = None
+         height = 450
+     else:
+         subset_title = subset.replace(" String", "")
+         title += f" By {subset_title}"
+         if subset_title in ["Season", "Year"]:
+             height = 450
+         else:
+             height = 800
+
+     if trendline:
+         trendline = "ols"
+     else:
+         trendline = None
+
+     # Special case to view categorical features
+     if x in ["Agency", "Borough", "Descriptor"]:
+         if x == "Agency":
+             prefix = 'AG'
+         elif x == "Borough":
+             prefix = "Borough"
+         else:
+             prefix = "DG"
+
+         categories = [col for col in data.columns if prefix in col]
+         melt_df = pd.melt(data, id_vars=["Target"], value_vars=categories)
+         fig = px.scatter(
+             melt_df,
+             x="value",
+             y="Target",
+             trendline=trendline,
+             facet_col="variable",
+             facet_col_wrap=4,
+             facet_col_spacing=0.05,
+             title=title
+         )
+         height = 800
+
+     else:
+         fig = px.scatter(
+             data,
+             x=x, y=y,
+             trendline=trendline,
+             facet_col=subset,
+             facet_col_wrap=4,
+             facet_col_spacing=0.05,
+             title=title
+         )
+
+     fig.update_layout(
+         autosize=True,
+         height=height,
+     )
+
+     return fig
+
+
+ def plot_seasonality(data, x, y, show_box=True, show_outliers=False):
+     title = f"{y} by {x}"
+
+     if show_box:
+         if show_outliers:
+             points = "outliers"
+         else:
+             points = "all"
+         fig = px.box(data, x=x, y=y, points=points, title=title, facet_col_wrap=4, facet_col_spacing=0.05,)
+     else:
+         fig = px.strip(data, x=x, y=y, title=title, facet_col_wrap=4, facet_col_spacing=0.05,)
+
+     fig.update_layout(
+         autosize=True,
+         height=600,
+     )
+     return fig
+
+
+ def build_service_data(filename):
+     # Loading data directly with polars leads to errors:
+     # some rows end up missing for an unknown reason.
+     # FIX: load with pandas, then convert to polars.
+     service_data_pd = pd.read_csv(filename)
+
+     # Quick test to assure the unique key is in fact unique
+     assert service_data_pd["Unique Key"].nunique() == len(service_data_pd)
+
+     # Load from pandas DataFrame
+     service_data_pd["Incident Zip"] = service_data_pd["Incident Zip"].astype("string")
+     service_data_pd["BBL"] = service_data_pd["BBL"].astype("string")
+     service_data = pl.DataFrame(service_data_pd)
+
+     # Clear some RAM
+     del service_data_pd
+     gc.collect()
+
+     drop_cols = [
+         "Unique Key", "Agency Name", "Location Type", "Incident Zip",
+         "Incident Address", "Street Name", "Cross Street 1",
+         "Cross Street 2", "Intersection Street 1", "Intersection Street 2",
+         "Address Type", "City", "Landmark", "Facility Type",
+         "Status", "Due Date", "Resolution Description",
+         "Resolution Action Updated Date", "Community Board",
+         "BBL", "X Coordinate (State Plane)", "Y Coordinate (State Plane)",
+         "Open Data Channel Type", "Park Facility Name", "Park Borough",
+         "Vehicle Type", "Taxi Company Borough", "Taxi Pick Up Location",
+         "Bridge Highway Name", "Bridge Highway Direction", "Road Ramp",
+         "Bridge Highway Segment", "Location", "Created Year"
+     ]
+
+     # Drop columns and create the date variables
+     service_data = service_data.drop(drop_cols)
+     service_data = create_datetime(service_data, "Created Date")
+     service_data = create_datetime(service_data, "Closed Date")
+
+     # Group by date to get the number of created tickets (as target)
+     sd_grouped = service_data.rename({"Created Date": "Datetime"}).group_by("Datetime").agg(
+         pl.len().alias("Target"),
+     ).sort(by="Datetime")
+
+     # Calculate the number of closed tickets.
+     # The mean created-to-closed gap is used to repair bad closed dates:
+     # mean_diff = service_data.with_columns(
+     #     diff_created_closed = pl.col("Closed Date") - pl.col("Created Date")
+     # ).filter((pl.col("Closed Date").dt.year() >= 2016) & (pl.col("Closed Date").dt.year() < 2020))["diff_created_closed"].mean().days
+     # Precalculated (13 days) to save time:
+     mean_diff = 13
+
+     # Create a new closed date with errors filled using the mean diff above
+     service_data = service_data.with_columns(
+         Closed_Date_New = pl.when(pl.col("Created Date") - pl.col("Closed Date") > pl.duration(days=1))
+         .then(pl.col("Created Date") + pl.duration(days=mean_diff))
+         .otherwise(pl.col("Closed Date")).fill_null(pl.col("Created Date") + pl.duration(days=mean_diff))
+     )
+
+     # Only count tickets whose created date <= new closed date, so future
+     # information is not accidentally leaked across other points in our data
+     closed_tickets = service_data.group_by(["Closed_Date_New", "Created Date"]) \
+         .agg((pl.when(pl.col("Created Date") <= pl.col("Closed_Date_New")).then(1).otherwise(0)).sum().alias("count")) \
+         .sort("Closed_Date_New") \
+         .filter((pl.col("Closed_Date_New").dt.year() >= 2016) & (pl.col("Closed_Date_New").dt.year() < 2019)) \
+         .group_by("Closed_Date_New").agg(pl.col("count").sum().alias("num_closed_tickets"))
+
+     # Rename this column to num_closed_tickets
+     ct_df = closed_tickets.with_columns(
+         pl.col("num_closed_tickets")
+     )
+
+     # Concat the new columns into our data
+     sd_df = pl.concat([sd_grouped, ct_df.drop("Closed_Date_New")], how="horizontal")
+
+     assert len(sd_grouped) == len(ct_df)
+
+     # CATEGORICAL FEATURE MAPPING
+     # MAPPING FOR BOROUGH
+     Borough_Map = {
+         "Unspecified": "OTHER",
+         "2017": "OTHER",
+         None: "OTHER",
+         "2016": "OTHER"
+     }
+     service_data = service_data.with_columns(
+         pl.col("Borough").replace(Borough_Map)
+     )
+
+     # MAPPING FOR AGENCY
+     # This mapping was done manually
+     Agency_Map = {
+         "NYPD": "Security", "HPD": "Buildings", "DOT": "Transportation",
+         "DSNY": "Environment & Sanitation", "DEP": "Environment & Sanitation",
+         "DOB": "Buildings", "DOE": "Buildings", "DPR": "Parks",
+         "DOHMH": "Health", "DOF": "Other", "DHS": "Security",
+         "TLC": "Transportation", "HRA": "Other", "DCA": "Other",
+         "DFTA": "Other", "EDC": "Other", "DOITT": "Other", "OMB": "Other",
+         "DCAS": "Other", "NYCEM": "Other", "ACS": "Other", "3-1-1": "Other",
+         "TAX": "Other", "DCP": "Other", "DORIS": "Other", "FDNY": "Other",
+         "TAT": "Other", "COIB": "Other", "CEO": "Other", "MOC": "Other",
+     }
+
+     service_data = service_data.with_columns(
+         pl.col("Agency").replace(Agency_Map).alias("AG")  # AG shorthand for Agency Groups
+     )
+
+     # Mapping for Descriptor using BERTopic.
+     # Store the unique descriptor strings as a numpy array
+     # (BERTopic does not accept polars directly)
+     descriptor_docs = service_data["Descriptor"].unique().to_numpy()
+
+     # Build our topic mapping using the pretrained BERTopic model:
+     # load the model and get predictions
+     topic_model = BERTopic.load("models/BERTopic")
+     topics, probs = topic_model.transform(descriptor_docs)
+
+     # Visualize if wanted
+     # topic_model.visualize_barchart(list(range(-1, 6, 1)))
+
+     # Create a topic-to-ID map
+     topic_df = topic_model.get_topic_info()
+     topic_id_map = {row["Topic"]: row["Name"][2:] for _, row in topic_df.iterrows()}
+     topic_id_map[-1] = topic_id_map[-1][1:]  # Fix for the -1 topic case
+
+     # For each document (descriptor string) get a mapping of topics
+     doc_to_topic_map = defaultdict(str)
+     for topic_id, doc in zip(topics, descriptor_docs):
+         topic = topic_id_map[topic_id]
+         doc_to_topic_map[doc] = topic
+
+     service_data = service_data.with_columns(
+         pl.col("Descriptor").replace(doc_to_topic_map).alias("DG")  # DG shorthand for Descriptor Groups
+     )
+
+     # One-hot encode the categorical features
+     cat_features = ["AG", "Borough", "DG"]
+     service_data = service_data.to_dummies(columns=cat_features)
+
+     # Group by date and create our category feature vector
+     cat_df = service_data.rename({"Created Date": "Datetime"}).group_by("Datetime").agg(
+         # Categorical feature sums
+         pl.col('^AG_.*$').sum(),
+         pl.col('^Borough_.*$').sum(),
+         pl.col('^DG_.*$').sum(),
+     ).sort(by="Datetime")
+
+     # Concat our category features to our current dataframe
+     sd_df = pl.concat([sd_df, cat_df.drop("Datetime")], how="horizontal")
+
+     # Now that our dataframe is significantly reduced in size,
+     # we can finally convert back to a pandas dataframe,
+     # as pandas is usable across more python packages
+     sd_df = sd_df.to_pandas()
+
+     # Set index to datetime
+     sd_df = sd_df.set_index("Datetime")
+
+     # NOTE: we added 7 new rows to our weather df.
+     # These 7 new rows will essentially be our final pred set;
+     # the Target for these rows is null -> indicating it needs to be predicted.
+     # Add these rows to the service dataframe
+     preds_df = pd.DataFrame({'Datetime': pd.date_range(start=sd_df.index[-1], periods=8, freq='D')})[1:]
+     sd_df = pd.concat([sd_df, preds_df.set_index("Datetime")], axis=0)
+
+     return sd_df
+
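+ # Example usage (hypothetical path, a minimal sketch):
+ #   sd_df = build_service_data("data/311_service_requests.csv")
+ #   sd_df["Target"]  # daily created-ticket counts, indexed by Datetime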
+
+ # Build all weather data from file
+ def build_weather_data(filename):
+     # Use pandas to read the file
+     weather_data = pd.read_csv(filename)
+
+     # Quickly aggregate Year, Month, Day into a datetime object,
+     # because the 311 data uses datetime
+     weather_data["Datetime"] = weather_data["Year"].astype("str") + "-" + weather_data["Month"].astype("str") + "-" + weather_data["Day"].astype("str")
+     weather_data = create_datetime(weather_data, "Datetime", format="%Y-%m-%d")
+
+     # LOCALIZE
+     # Pre-recorded min/max values from the service data (so we don't need it again)
+     lat_min = 40.49804421521046
+     lat_max = 40.91294056699566
+     long_min = -74.25521082506387
+     long_max = -73.70038354802529
+
+     # Create the conditions for location matching
+     mincon_lat = weather_data["Latitude"] >= lat_min
+     maxcon_lat = weather_data["Latitude"] <= lat_max
+     mincon_long = weather_data["Longitude"] >= long_min
+     maxcon_long = weather_data["Longitude"] <= long_max
+
+     # Localize our data to match the service data
+     wd_localized = weather_data.loc[mincon_lat & maxcon_lat & mincon_long & maxcon_long]
+     drop_cols = [
+         "USAF",
+         "WBAN",
+         "StationName",
+         "State",
+         "Latitude",
+         "Longitude"
+     ]
+     wd_localized = wd_localized.drop(columns=drop_cols)
+
+     # AGGREGATE
+     # Map columns to an aggregation method
+     mean_cols = [
+         'MeanTemp',
+         'DewPoint',
+         'Percipitation',
+         'WindSpeed',
+         'Gust',
+         'SnowDepth',
+     ]
+     min_cols = [
+         'MinTemp'
+     ]
+     max_cols = [
+         'MaxTemp',
+         'MaxSustainedWind'
+     ]
+     round_cols = [
+         'Rain',
+         'SnowIce'
+     ]
+
+     # Perform aggregation
+     mean_df = wd_localized.groupby("Datetime")[mean_cols].mean()
+     min_df = wd_localized.groupby("Datetime")[min_cols].min()
+     max_df = wd_localized.groupby("Datetime")[max_cols].max()
+     round_df = wd_localized.groupby("Datetime")[round_cols].mean().round().astype(np.int8)
+     wd_full = pd.concat([mean_df, min_df, max_df, round_df], axis=1)
+
+     # Add seasonal features
+     wd_full = build_temporal_features(wd_full, "Datetime")
+     wd_full["Season"] = wd_full["Season"].astype("category")
+     wd_full = wd_full.set_index("Datetime")
+
+     # We will calculate the imputation for the next 7 days after 12/31/2018,
+     # along with the 49 missing days.
+     # This will act as our "weather forecast".
+     time_steps = 49 + 7
+
+     # Columns to impute
+     impute_cols = [
+         'MeanTemp', 'MinTemp', 'MaxTemp', 'DewPoint',
+         'Percipitation', 'WindSpeed', 'MaxSustainedWind',
+         'Gust', 'Rain', 'SnowDepth', 'SnowIce',
+     ]
+
+     # Which imputation strategy to use for which variable
+     mean_vars = ["WindSpeed", "MaxSustainedWind", "Gust", "SnowDepth"]
+     min_vars = ["SnowIce", "MeanTemp", "MinTemp", "MaxTemp", "DewPoint", "Percipitation"]
+     max_vars = ["Rain"]
+
+     # Use the imputer defined below to create the imputed data
+     preds_mean = impute_missing_weather(wd_full, strategy="mean", time_steps=time_steps, impute_cols=mean_vars)
+     preds_min = impute_missing_weather(wd_full, strategy="min", time_steps=time_steps, impute_cols=min_vars)
+     preds_max = impute_missing_weather(wd_full, strategy="max", time_steps=time_steps, impute_cols=max_vars)
+     all_preds = pd.concat([preds_mean, preds_min, preds_max], axis=1)
+     all_preds = build_temporal_features(all_preds.loc[:, impute_cols], "Datetime")
+     all_preds = all_preds.set_index("Datetime")
+
+     wd_curr = wd_full.loc[wd_full["Year"] >= 2016]
+     wd_df = pd.concat([wd_full, all_preds], axis=0, join="outer")
+
+     time_vars = ["Year", "Month", "Day", "DayOfWeek", "DayOfYear", "is_weekend", "is_holiday", "Season"]
+     # NOTE: drop() result is not assigned, so these columns are in fact kept
+     # (the plotting helpers above rely on Season / is_holiday being present)
+     wd_df.drop(columns=time_vars)
+
+     return wd_df
+
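+ # Example usage (hypothetical path, a minimal sketch):
+ #   wd_df = build_weather_data("data/weather_data.csv")
+ # The last 56 rows (49 missing days + the 7-day horizon) are naively imputed.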
+
+ class MyNaiveImputer():
+     def __init__(self, data, time_steps=49, freq="D"):
+         self.data = data.reset_index().copy()
+         start_date = self.data["Datetime"].max() + pd.Timedelta(days=1)
+         end_date = start_date + pd.Timedelta(days=time_steps - 1)
+         missing_range = pd.date_range(start_date, end_date, freq=freq)  # was hardcoded to "D", ignoring the freq parameter
+         self.missing_df = pd.DataFrame(missing_range, columns=["Datetime"])
+         self.missing_df = build_temporal_features(self.missing_df, "Datetime")
+
+     def impute(self, col, by="DayOfYear", strategy="mean"):
+         def naive_impute_by(val, impute_X, data, by=by, strategy=strategy):
+             if strategy.lower() == "mean":
+                 func = pd.core.groupby.DataFrameGroupBy.mean
+             elif strategy.lower() == "median":
+                 func = pd.core.groupby.DataFrameGroupBy.median
+             elif strategy.lower() == "max":
+                 func = pd.core.groupby.DataFrameGroupBy.max
+             elif strategy.lower() == "min":
+                 func = pd.core.groupby.DataFrameGroupBy.min
+             grouped = func(data.groupby(by)[impute_X])
+             return grouped[val]
+
+         return self.missing_df["DayOfYear"].apply(naive_impute_by, args=(col, self.data, by, strategy))
+
+     def impute_all(self, cols, by="DayOfYear", strategy="mean"):
+         output_df = self.missing_df.copy()
+         for col in cols:
+             output_df[col] = self.impute(col, by, strategy)
+         return output_df
+
+
+ def impute_missing_weather(data, strategy="mean", time_steps=7, impute_cols=impute_cols):
+     final_imputer = MyNaiveImputer(data, time_steps=time_steps)
+     preds = final_imputer.impute_all(impute_cols, strategy=strategy).set_index("Datetime")
+     return preds
+
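+ # Example usage (minimal sketch): impute one week of wind-type variables
+ #   preds = impute_missing_weather(wd_full, strategy="mean", time_steps=7,
+ #                                  impute_cols=["WindSpeed", "Gust"])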
+
+ def get_feature_importance(data, target, split_date="01-01-2016", print_score=False):
+     import torch
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     train = data.loc[data.index <= pd.to_datetime(split_date)]
+     test = data.loc[data.index > pd.to_datetime(split_date)]
+
+     if type(target) == str:
+         X_train, X_test = train.drop(columns=target), test.drop(columns=target)
+         y_train, y_test = train[target], test[target]
+     else:
+         X_train, X_test = train, test
+         y_train, y_test = target.loc[train.index], target.loc[test.index]
+         target = str(target.name)
+
+     if 'int' in y_train.dtype.name:
+         # Use a binary classifier
+         metric = "logloss"
+         model = xgb.XGBClassifier(
+             base_score=0.25,
+             n_estimators=500,
+             early_stopping_rounds=50,
+             objective='binary:logistic',
+             device=device,
+             max_depth=3,
+             learning_rate=0.01,
+             enable_categorical=True,
+             eval_metric="logloss",
+             importance_type="gain",
+             random_state=22,
+         )
+     else:
+         metric = "MAPE"
+         model = xgb.XGBRegressor(
+             n_estimators=500,
+             early_stopping_rounds=50,
+             objective='reg:squarederror',
+             device=device,
+             max_depth=3,
+             learning_rate=0.01,
+             enable_categorical=True,
+             eval_metric="mape",
+             importance_type="gain",
+             random_state=22,
+         )
+
+     _ = model.fit(
+         X_train, y_train,
+         eval_set=[(X_train, y_train), (X_test, y_test)],
+         verbose=False
+     )
+
+     fig, ax = plt.subplots()
+     ax = plot_importance(model, title=f"Feature Importance for {target}", ax=ax)
+     if print_score:
+         best_score = str(round(100 * model.best_score, 2)) + "%"
+         print(f"Best {metric}: {best_score}")
+     return fig, model
+
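+ # Example usage (minimal sketch, split date matching the holdout used below):
+ #   fig, model = get_feature_importance(data.dropna(subset=["Target"]), "Target",
+ #                                       split_date="04-01-2018", print_score=True)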
+
+ def corr_with_lag(data, target_col, covar, lags=[1], method="pearson"):
+     data_lagged = pd.DataFrame()
+     data_lagged["Target"] = data[target_col]
+     for lag in lags:
+         new_col = f"lag_{lag}D"
+         data_lagged[new_col] = data[covar].shift(lag)
+     return data_lagged.dropna().corr(method=method)
+
+
+ def plot_correlations(data, target, covar, lags=[0, 1, 2, 3, 4, 5, 6, 7, 10, 14, 18, 21], method="pearson"):
+     df_corr = corr_with_lag(data, target, covar, lags, method)
+     mask = np.triu(np.ones_like(df_corr, dtype=bool))
+     z_dim, x_dim = len(df_corr.to_numpy()), len(df_corr.columns)
+     y_dim = x_dim
+     fig = ff.create_annotated_heatmap(
+         z=df_corr.mask(mask).to_numpy(),
+         x=df_corr.columns.tolist(),
+         y=df_corr.columns.tolist(),
+         colorscale=px.colors.diverging.RdBu,
+         zmin=-1,
+         zmax=1,
+         ygap=2,
+         xgap=2,
+         name="",
+         customdata=np.full((x_dim, y_dim, z_dim), covar),
+         hovertemplate='%{customdata[0]}<br>%{x} to %{y}<br>Correlation: %{z:.4f}',
+         showscale=True
+     )
+
+     fig.update_layout(
+         title_text=f"Correlation Heatmap of Lagged {covar}",
+         title_x=0.5,
+         height=600,
+         xaxis_showgrid=False,
+         yaxis_showgrid=False,
+         xaxis_zeroline=False,
+         yaxis_zeroline=False,
+         yaxis_autorange='reversed',
+         template='plotly_white'
+     )
+
+     # Blank out the upper-triangle 'nan' annotations and recolor mid-range values
+     for i in range(len(fig.layout.annotations)):
+         if fig.layout.annotations[i].text == 'nan':
+             fig.layout.annotations[i].text = ""
+         else:
+             corr_i = round(float(fig.layout.annotations[i].text), 3)
+             fig.layout.annotations[i].text = corr_i
+             if (corr_i > 0.2 and corr_i < 0.5) or (corr_i < -0.2 and corr_i > -0.5):
+                 fig.layout.annotations[i].font.color = "white"
+
+     return fig
+
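+ # Example usage (minimal sketch): check how the target tracks a covariate
+ # at daily, weekly and biweekly offsets:
+ #   fig = plot_correlations(data, "Target", "num_closed_tickets", lags=[1, 7, 14])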
+
+ def plot_all_correlations(data, data_name="weather", method="pearson", width=1392, height=600):
+     if data_name == "weather":
+         covars = ["MeanTemp", "MinTemp", "MaxTemp", 'DewPoint', 'Percipitation', 'WindSpeed', 'Gust', 'MaxSustainedWind', "SnowDepth", "SnowIce", "Rain", "Target"]
+     elif data_name == "service":
+         covars = [
+             "num_closed_tickets",
+             # Agency group counts
+             'AG_Buildings', 'AG_Environment & Sanitation', 'AG_Health',
+             'AG_Parks', 'AG_Security', 'AG_Transportation',
+             'AG_Other',
+             # Borough counts
+             'Borough_BRONX', 'Borough_BROOKLYN', 'Borough_MANHATTAN',
+             'Borough_QUEENS', 'Borough_STATEN ISLAND',
+             'Borough_OTHER',
+             # Descriptor group counts
+             'DG_damaged_sign_sidewalk_missing',
+             'DG_english_emergency_spanish_chinese',
+             'DG_exemption_commercial_tax_business',
+             'DG_license_complaint_illegal_violation', 'DG_noise_animal_truck_dead',
+             'DG_odor_food_air_smoke', 'DG_order_property_inspection_condition',
+             'DG_water_basin_litter_missed', "Target"
+         ]
+
+     df_corr = data.loc[:, covars].corr(method=method)
+
+     mask = np.triu(np.ones_like(df_corr, dtype=bool))
+     fig = ff.create_annotated_heatmap(
+         z=df_corr.mask(mask).to_numpy(),
+         x=df_corr.columns.tolist(),
+         y=df_corr.columns.tolist(),
+         colorscale=px.colors.diverging.RdBu,
+         zmin=-1,
+         zmax=1,
+         ygap=2,
+         xgap=2,
+         name="",
+         hovertemplate='%{x}-%{y} <br>Correlation: %{z:.4f}',
+         showscale=True
+     )
+
+     fig.update_layout(
+         title_text=f"Correlation Heatmap of {data_name.title()} Variables & Target",  # was hardcoded to "Weather"
+         title_x=0.5,
+         height=600,
+         width=width,
+         xaxis_showgrid=False,
+         yaxis_showgrid=False,
+         xaxis_zeroline=False,
+         yaxis_zeroline=False,
+         yaxis_autorange='reversed',
+         template='plotly_white'
+     )
+
+     fig.update_annotations(font=dict(color="black"))
+
+     for i in range(len(fig.layout.annotations)):
+         if fig.layout.annotations[i].text == 'nan':
+             fig.layout.annotations[i].text = ""
+         else:
+             corr_i = round(float(fig.layout.annotations[i].text), 3)
+             fig.layout.annotations[i].text = corr_i
+             if corr_i > 0.5 or corr_i < -0.5:
+                 fig.layout.annotations[i].font.color = "white"
+
+     return fig
+
+
+ def plot_gust_interpolation(data):
+     fig, ax = plt.subplots(2, 2, figsize=(15, 12))
+     data["Gust_lin"].plot(ax=ax[0][0], color=color_pal[0], title="linear")
+     data["Gust_spline3"].plot(ax=ax[0][1], color=color_pal[1], title="spline3")
+     data["Gust_spline5"].plot(ax=ax[1][0], color=color_pal[2], title="spline5")
+     data["Gust_quad"].plot(ax=ax[1][1], color=color_pal[3], title="quadratic")
+     curr_fig = plt.gcf()
+     plt.close()
+     return curr_fig
+
+
+ def plot_train_split(train, val):
+     fig = plt.subplots(figsize=(15, 5))
+     ax = train["Target"].plot(label="Training Set")
+     val["Target"].plot(label="Validation Set", ax=ax)
+     ax.axvline('2018-04-01', color='black', ls='--')
+     ax.legend()
+     ax.set_title("Train Test Split (2018-04-01)")
+     curr_fig = plt.gcf()
+     plt.close()
+     return curr_fig
+
+
+ def plot_predictions(train, val, preds):
+     fig = plt.subplots(figsize=(16, 5))
+     ax = train["Target"].plot(label="Training Set")
+     val["Target"].plot(label="Validation Set", ax=ax)
+     val["Prediction"] = preds
+     val["Prediction"].plot(label="Prediction", ax=ax)
+     ax.axvline('2018-04-01', color='black', ls='--')
+     ax.legend()
+     ax.set_title("Model Prediction for 311 Call Volume")
+
+     curr_fig = plt.gcf()
+     plt.close()
+     return curr_fig
+
+ def plot_final_feature_importance(model):
+     fig, ax = plt.subplots(figsize=(12, 6))
+     ax = plot_importance(model, max_num_features=20, title="Feature Importance for 311 Service Calls", ax=ax)
+
+     curr_fig = plt.gcf()
+     plt.close()
+
+     return curr_fig
+
+
+ def predict_recurse(dataset, test, model, features_to_impute=['Target_L1D', 'Target_Diff7D', 'Target_Diff14D'], last_feature='Target_L6D'):
+     n_steps = len(test)
+     merged_data = pd.concat([dataset[-14:], test], axis=0)
+     all_index = merged_data.index
+     X_test = test.drop(columns="Target")
+     sd = -6  # starting offset for filling the next value
+
+     # For each step, get the prediction and feed it forward into the lag features
+     for i in range(n_steps - 1):
+         pred = model.predict(X_test)[i]
+         # For the three features needed, compute the new value
+         X_test.loc[all_index[sd+i], features_to_impute[0]] = pred
+         X_test.loc[all_index[sd+i], features_to_impute[1]] = pred - merged_data.loc[all_index[sd+i-7], features_to_impute[1]]
+         X_test.loc[all_index[sd+i], features_to_impute[2]] = pred - merged_data.loc[all_index[sd+i-14], features_to_impute[2]]
+
+         # On the last iteration, compute the Lag6D value
+         if i == 5:
+             X_test.loc[all_index[sd+i], last_feature] = pred - merged_data.loc[all_index[sd+i-6], last_feature]
+
+     final_preds = model.predict(X_test)
+     return final_preds
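
Taken together, a minimal end-to-end sketch of how the helpers above chain into the modeling workflow (paths are hypothetical; Analysis.ipynb and app.py hold the full version):

```python
import pandas as pd
from utils import build_service_data, build_weather_data, get_feature_importance

# Hypothetical input paths -- substitute the real 311 and weather extracts
sd_df = build_service_data("data/311_data.csv")
wd_df = build_weather_data("data/weather_data.csv")

# Join daily service counts with daily weather, then rank features with XGBoost
data = pd.concat([sd_df, wd_df], axis=1, join="inner")
fig, model = get_feature_importance(data.dropna(subset=["Target"]), "Target",
                                    split_date="04-01-2018", print_score=True)
```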