import sys
import inspect
import math
import pandas as pd
import numpy as np
import polars as pl
import seaborn as sns
import matplotlib
import utils
from matplotlib import pyplot as plt
import sklearn
import gradio as gr
import plotly.figure_factory as ff
from sklearn.impute import SimpleImputer
from utils import create_seasons
from bs4 import BeautifulSoup
from IPython.display import display, HTML
from bertopic import BERTopic
import html
import xgboost as xgb
from xgboost import plot_importance
from sklearn.metrics import r2_score, mean_absolute_percentage_error
from utils import find_variable_data, build_temporal_features, create_datetime, map_vals
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import folium
import gc
import json
from utils import MyNaiveImputer
matplotlib.use('agg')
dark_mode = """

function refresh() {

    const url = new URL(window.location);



    if (url.searchParams.get('__theme') !== 'dark') {

        url.searchParams.set('__theme', 'dark');

        window.location.href = url.href;

    }

}

"""

# Imputation Variables
wd_full_local = pd.read_csv("data/weather_aggregated_2010-2018.csv", index_col=0)
wd_full_local = wd_full_local.reset_index()
wd_full_local["Datetime"] = pd.to_datetime(wd_full_local["Datetime"], format="%Y-%m-%d")
wd_full_local = build_temporal_features(wd_full_local, "Datetime")
impute_cols = ['MeanTemp', 'MinTemp', 'MaxTemp', 'DewPoint',
               'Percipitation', 'WindSpeed', 'MaxSustainedWind',
               'Gust', 'Rain', 'SnowDepth', 'SnowIce']

my_imputer = MyNaiveImputer(wd_full_local, time_steps=49+7)
imputers = {
    "Mean": my_imputer.impute_all(impute_cols, strategy="mean"),
    "Median": my_imputer.impute_all(impute_cols, strategy="median"),
    "Max": my_imputer.impute_all(impute_cols, strategy="max"),
    "Min": my_imputer.impute_all(impute_cols, strategy="min")
}
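
# MyNaiveImputer lives in utils; below is a minimal sketch of the idea as I
# understand it from its usage here (an assumption, not the actual class):
# fill `time_steps` days past the end of the data by aggregating each
# day-of-year (1-366) across all observed years.
def naive_doy_impute_sketch(df, col, time_steps, strategy="mean"):
    future_dates = pd.date_range(df["Datetime"].max() + pd.Timedelta(days=1),
                                 periods=time_steps, freq="D")
    by_doy = df.groupby(df["Datetime"].dt.dayofyear)[col].agg(strategy)
    return pd.DataFrame({
        "Datetime": future_dates,
        col: by_doy.reindex(future_dates.dayofyear).to_numpy(),
    })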

# Merged Data Variables
data_merged = pd.read_csv("data/data_merged_full.csv", index_col=0)
data_merged = create_datetime(data_merged, "Datetime", format="%Y-%m-%d")
data_merged["Day Of Week"] = data_merged["Datetime"].dt.day_name()
data_merged["Year String"] = data_merged["Year"].astype(str)
data_merged["Month String"] = data_merged["Datetime"].dt.month_name()
data_merged["Rain Bool"] = data_merged["Rain"].astype(bool)
data_merged["SnowIce Bool"] = data_merged["SnowIce"].astype(bool)
data_merged = data_merged.set_index("Datetime")
weather_full_df = data_merged.loc[data_merged["Year"] <= 2018].copy()
data_merged_eda = data_merged.loc[(data_merged["Year"] <= 2018) & (data_merged["Year"] >= 2016)]

# Feature Preprocessing
data_preprocess = data_merged.loc[(data_merged["Year"] >= 2016)].copy()
data_preprocess["Gust_lin"] = data_preprocess["Gust"].interpolate(method="linear")
data_preprocess["Gust_spline3"] = data_preprocess["Gust"].interpolate(method="spline", order=3)
data_preprocess["Gust_spline5"] = data_preprocess["Gust"].interpolate(method="spline", order=5)
data_preprocess["Gust_quad"] = data_preprocess["Gust"].interpolate(method="quadratic")
data_preprocess["Gust"] = data_preprocess["Gust"].interpolate(method="linear")
data_preprocess["DewPoint_old"] = data_preprocess["DewPoint"]
data_preprocess["DewPoint_diff7d"] = data_preprocess["DewPoint"] - data_preprocess["DewPoint"].shift(7)
data_preprocess["DewPoint"] = data_preprocess["DewPoint_diff7d"]
data_preprocess["MinTemp_old"] = data_preprocess["MinTemp"]
data_preprocess["MinTemp_log"] = data_preprocess["MinTemp"].apply(np.log1p)
data_preprocess["MinTemp_log_diff7d"] = data_preprocess["MinTemp_log"] - data_preprocess["MinTemp_log"].shift(7)
data_preprocess["MinTemp"] = data_preprocess["MinTemp_log_diff7d"] 


# Final Preprocessed Variables
data_final = pd.read_csv("data/data_final.csv")
data_final = create_datetime(data_final, "Datetime", format="%Y-%m-%d")
data_final = data_final.set_index("Datetime")
test = data_final[-7:]
dataset = data_final[:-7]
split_point = int(len(dataset) * 0.75)  # dataset == data_final[:-7]
train, val = dataset[:split_point], dataset[split_point:]
X_train, y_train = train.drop(columns="Target"), train["Target"]
X_val, y_val = val.drop(columns="Target"), val["Target"]
X_test, y_test = test.drop(columns="Target"), test["Target"]
forecast_model = xgb.XGBRegressor()
forecast_model.load_model("models/final_model.json")
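
# utils.predict_recurse produces the 7-day forecast used below. A hypothetical
# sketch of a recursive multi-step loop (lag column names like "Target_lag7"
# are my assumption, not necessarily the real feature names):
def recursive_forecast_sketch(model, X_future, history, lags=(7, 14, 21)):
    preds, hist = [], list(history)
    for i in range(len(X_future)):
        row = X_future.iloc[[i]].copy()
        for lag in lags:
            col = f"Target_lag{lag}"
            if col in row.columns:
                # refresh each lag feature from the growing history
                row[col] = hist[-lag]
        preds.append(float(model.predict(row)[0]))
        hist.append(preds[-1])  # feed the prediction back as new history
    return preds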

# Current Predictions
r2_train = 0.8691238468740025
mape_train = 0.04889510400934162
r2_val = 0.6072642783665692
mape_val = 0.6072642783665692  # FIXME: identical to r2_val above; likely a copy-paste slip
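
# The values above are hardcoded so the app starts fast. They can presumably be
# reproduced from the loaded model and the splits defined above:
def recompute_metrics():
    preds_train = forecast_model.predict(X_train)
    preds_val = forecast_model.predict(X_val)
    return {
        "r2_train": r2_score(y_train, preds_train),
        "mape_train": mean_absolute_percentage_error(y_train, preds_train),
        "r2_val": r2_score(y_val, preds_val),
        "mape_val": mean_absolute_percentage_error(y_val, preds_val),
    }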


# Initial Variables
reports = {
    "weather_2011-2018": BeautifulSoup(open("reports/weather_data_ts.html"), "html.parser"),
    "weather_2016-2018": BeautifulSoup(open("reports/weather_data_after2016_ts.html"), "html.parser"),
    "service_full": BeautifulSoup(open("reports/311_data_1.html"), "html.parser")
}

iframe_dp_weather, _ = find_variable_data(reports["weather_2011-2018"], "MeanTemp")
iframe_dp_service, _ = find_variable_data(reports["service_full"], "Created Date")

# Code Variables to show in app
load_code = """

# Load Weather Data in pandas

# No need for polars because data is sufficiently small

weather_data = pd.read_csv("data/weather_NY_2010_2018Nov.csv")



# Load Service data in polars for speed optimization

# Loading directly with polars leads to errors

# Load in pandas then convert to polars

service_data_pd = pd.read_csv("data/311-2016-2018.csv")

assert service_data_pd["Unique Key"].nunique() == len(service_data_pd)

# This casting is done just because of some errors when loading pl from pandas

service_data_pd["Incident Zip"] = service_data_pd["Incident Zip"].astype("string")

service_data_pd["BBL"] = service_data_pd["BBL"].astype("string")

service_data = pl.DataFrame(service_data_pd)



# Clear some ram

del service_data_pd

gc.collect()"""


map_code = """

lat_min = service_data["Latitude"].min()

lat_max = service_data["Latitude"].max()

long_min = service_data["Longitude"].min()

long_max = service_data["Longitude"].max()



mincon_lat = weather_data["Latitude"] >= lat_min

maxcon_lat = weather_data["Latitude"] <= lat_max

mincon_long = weather_data["Longitude"] >= long_min

maxcon_long = weather_data["Longitude"] <= long_max

wd_localized = weather_data.loc[mincon_lat & maxcon_lat & mincon_long & maxcon_long]

"""

Closed_Ticket_Code = """

# Fill null and Typos with mean time diff (13 days)

service_data = service_data.with_columns(

    Closed_Date_New = pl.when(pl.col("Created Date") - pl.col("Closed Date")  > pl.duration(days=1))

                        .then(pl.col("Created Date") + pl.duration(days=mean_diff))

                        .otherwise(pl.col("Closed Date")).fill_null(pl.col("Created Date") + pl.duration(days=mean_diff))

)



# Check for no null values

assert service_data["Closed_Date_New"].is_null().sum() == 0



# Pair wise GroupBy and Filter

closed_tickets = service_data.group_by(["Closed_Date_New", "Created Date"]) \

    .agg((pl.when(pl.col("Created Date") <= pl.col("Closed_Date_New")).then(1).otherwise(0)).sum().alias("count")) \ # FILTER Created Date < Closed Date Here

    .sort("Closed_Date_New") \ # Sort by new column Closed Date New

    .filter((pl.col("Closed_Date_New").dt.year() >= 2016) & (pl.col("Closed_Date_New").dt.year() < 2019)) \ # Filter for only Closed Dates in time window

    .group_by("Closed_Date_New").agg(pl.col("count").sum().alias("num_closed_tickets")) # Final Group By Closed date after filtering



ct_df = closed_tickets.with_columns(

    pl.col("num_closed_tickets") # Rename Column

)

"""

global topic_model
topic_model = BERTopic.load("models/BERTopic")

def plot_imputations(var, data, imputers=imputers):
    plt.close('all')
    fig = plt.figure(figsize=(15,5))
    plt.plot(data["Datetime"][-800:], data[var][-800:], label="Actual")
    plt.title(f"{var} Imputation")
    for method in imputers:
        plt.plot(imputers[method]["Datetime"], imputers[method][var], label=method)
    
    plt.legend()

    return gr.update(value=fig)


def plot_timeseries(data, var, data_name="My", all_vars=[], height=800, width=600):
    plt.close('all')
    if var == "":
        return gr.update()

    from utils import plot_timeseries
    fig = plot_timeseries(data, var, data_name, all_vars, height, width)

    return gr.update(value=fig)


def plot_bivariate(data, x, y, subset=None, trendline=True):
    plt.close('all')
    map_var = {
        "Year": "Year String",
        "Season": "Season",
        "Month": "Month String",
        "Day Of Week": "Day Of Week",
        "Weekend": "is_weekend",
        "Holiday": "is_holiday",
        "Rain": "Rain Bool",
        "SnowIce": "SnowIce Bool",
        "None": None,
        "": None,
    }
    subset = map_var[subset]
    
    from utils import plot_bivariate
    fig = plot_bivariate(data, x, y, subset, trendline)

    return gr.update(value=fig)


def plot_seasonality(data, x, y, show_box=True, show_outliers=False):
    plt.close('all')
    map_var = {
        "Year": "Year String",
        "Season": "Season",
        "Month": "Month String",
        "Day Of Week": "Day Of Week",
        "Weekend": "is_weekend",
        "Holiday": "is_holiday",
        "Rain": "Rain Bool",
        "SnowIce": "SnowIce Bool",
        "None": None,
    }
    x = map_var[x]

    from utils import plot_seasonality
    fig = plot_seasonality(data, x, y, show_box, show_outliers)

    return gr.update(value=fig)
    

def plot_correlations(data, covar, target="Target", lags=[0,1,2,3,4,5,6,7,8,13,14,15,21], method="pearson"):
    plt.close('all')
    from utils import plot_correlations
    fig = plot_correlations(data, covar, target, lags, method)
            
    return gr.update(value=fig)


def plot_autocorr(data, var, apply=None):
    plt.close('all')
    from utils import plot_acf, plot_pacf
    time_series = data.loc[:, var].to_frame().copy()
    if apply:
        time_series[var] = time_series[var].apply(apply)
    fig, ax = plt.subplots(2, 1, figsize=(12, 8))
    _ = plot_acf(time_series[var], lags=30, ax=ax[0])
    _ = plot_pacf(time_series[var], lags=30, method="ols-adjusted", ax=ax[1])
    _ = plt.suptitle(f"{var}", y=0.95)

    return gr.update(value=fig)


def plot_all_correlations(data, data_name="weather", method="pearson"):
    plt.close('all')
    from utils import plot_all_correlations
    fig = plot_all_correlations(data, data_name, method)

    return fig



def run_report(report_base, variable_name, report_category="full"):
    report_name = report_base + "_" + report_category
    iframe, _ = find_variable_data(reports[report_name], variable_name)
    return gr.update(value=iframe)


def test_stationary(data, var):
    from utils import test_stationary
    df = test_stationary(data, var)

    return df


def plot_interpolation(data):
    plt.close('all')
    from utils import plot_gust_interpolation
    fig = plot_gust_interpolation(data)

    return fig


def plot_model_feature_importance():

    plt.close('all')
    from utils import plot_final_feature_importance
    fig = plot_final_feature_importance(forecast_model)

    return fig


def plot_final_predictions():
    plt.close('all')
    from utils import predict_recurse
    next_7_day_prediction = predict_recurse(dataset, test, forecast_model)
    # plt.subplots returns (figure, axes); unpack both
    fig, ax = plt.subplots(figsize=(15, 5))
    data_final.loc[data_final.index[-7:], "Target"] = next_7_day_prediction
    data_final.loc[data_final.index[-96:-6], "Target"].plot(ax=ax, label="Real", title="311 Service Volume: 7 Day Prediction")
    data_final.loc[data_final.index[-7:], "Target"].plot(label="Forecast", ax=ax)
    ax.legend()

    plt.close(fig)

    return fig


def plot_train_split():
    plt.close('all')
    from utils import plot_train_split
    fig = plot_train_split(train, val)

    return fig


def plot_val_predicitons():
    # Predict on the validation split with the final model
    preds_val = forecast_model.predict(X_val)

    from utils import plot_predictions

    fig = plot_predictions(train, val, preds_val)

    return fig


curr_theme = gr.themes.Default(
    text_size=gr.themes.sizes.text_lg
)

with gr.Blocks(theme=curr_theme, js=dark_mode, css=open("custom.css", "r").read()) as app:
    title = gr.HTML("""<h1 align="center">Point72 Case Study</h1>""")
    with gr.Tabs() as pages:

        with gr.Tab("Overview") as toc_page:
            gr.Markdown("# My Point72 Case Study Results")
            gr.Markdown("""

            * Please follow the tabs sequentially left to right to get the full story of my work

            * There will be many interactive parts where you will be able to test and view different parameters

            * This app may also be built and run locally

            * This app is hosted and served from a cloud server VM Instance

            * Any questions please email me: davidna22@gmail.com

            """)


        with gr.Tab("Data Preprocessing") as data_preprocessing_page:

            with gr.Tab("Data Loading") as dp_overview:
                gr.HTML("<h1 style=\"text-align: center;\">Loading the Data</h1>")
                gr.Markdown("## Goal: Load the Data as efficiently as possible")
                gr.Markdown("""

                * Using Pandas alone is **slow and inefficient**.

                * With small datasets, pandas is great because the API is robust.

                * With medium datasets, a library like polars (a Rust-based DataFrame library, roughly 10x the speed of pandas) is much faster.

                * As data gets even larger, distributed frameworks like Spark are required.

                * For this dataset, I use pandas for the weather data and polars for the 311 data. After the aggregation and merge, I revert back to pandas for API compatibility.

                """)

                with gr.Accordion("Code", open=False):
                    gr.Code(load_code, language="python")


            with gr.Tab("Location Mapping") as dp_overview:
                src_doc = html.escape(open("figures/map1.html","r").read())
                iframe1 = f'<iframe width="500px" height="500px" srcdoc="{src_doc}" frameborder="0"></iframe>'
                src_doc = html.escape(open("figures/map2.html","r").read())
                iframe2 = f'<iframe width="500px" height="500px" srcdoc="{src_doc}" frameborder="0"></iframe>'
                src_doc = html.escape(open("figures/bounded_map.html","r").read())
                iframe3 = f'<iframe width="500px" height="500px" srcdoc="{src_doc}" frameborder="0"></iframe>'
                src_doc = html.escape(open("figures/final_map.html","r").read())
                iframe4 = f'<iframe width="500px" height="500px" srcdoc="{src_doc}" frameborder="0"></iframe>'

                gr.HTML("<h1 style=\"text-align: center;\">Location Mapping for Both Datasets</h1>")
                with gr.Row(elem_classes="map-legend"):
                    gr.Markdown("""

                    **Legend:**

                    * <span style=\"color: red\">Red:</span> Weather records

                    * <span style=\"color: #5989ff\">Blue:</span> 311 Service records

                    """, elem_classes="map-legend-text")

                with gr.Row():
                    with gr.Column():
                        gr.HTML("<h1 style=\"text-align: center; margin: 0px;\">Map of New York State</h1>")
                        map1 = gr.HTML(iframe1, elem_classes="map")
                    with gr.Column():
                        gr.HTML("<h1 style=\"text-align: center; margin: 0px;\">Map of New York City</h1>")
                        map2 = gr.HTML(iframe2, elem_classes="map")

                with gr.Row():
                    gr.Markdown("""

                    Juxtaposing these two maps and seeing the approximate distributions of data observations,

                    its easy to see the problem. The weather dataset encompasses a larger area than the 311 Service call dataset.

                    Once this problem was diagnosed the solution was simple. First you find the max coordinate (Lat, Long) bounds

                    from the 311 Service Dataset. Then, you just filter the weather dataset to only include points from within

                    these bounds. This was one of my initial discoveries when analyzing the dataset and crucial to ensure

                    congruity between the two. **Below you can see the bounding box I created and how the new weather data

                    observations fit in this bounding box.**

                    """)

                with gr.Row():
                    with gr.Column():
                        map3 = gr.HTML(iframe3, elem_classes="map")
                    with gr.Column():
                        map4 = gr.HTML(iframe4, elem_classes="map")

                with gr.Accordion("Code", open=False):
                    gr.Code(map_code, language="python")


            with gr.Tab("Variable Pruning") as var_pruning:
                gr.HTML("<h1 style=\"text-align: center;\">How I pruned the datasets</h1>")
                gr.Markdown("## Goal: Remove as many useless features as possible")
                gr.HTML("<h3 style=\"color: darkorange;\">Key Factors for Feature Removal</h3>")
                gr.Markdown("""

                * Percentage of missing data points

                * Distribution Imbalance

                * Irrelevance

                * Number of distinct categories

                * Another variable was chosen as replacement <br/><br/>

                NOTE: Look in the appendix for visualizations of individual variables

                """)
                dropped_var_df = pd.read_excel("data/drop_vars.xlsx")
                gr.Dataframe(
                    dropped_var_df,
                    wrap=True,
                    label="Dropped Variables & Justification (Weather on Bottom)"
                )


            with gr.Tab("Time Aggregation") as time_agg:
                gr.HTML("<h1 style=\"text-align: center;\">Aggregate Data by Date</h1>")
                gr.Markdown("## Goal: Aggregate data by Date")
                gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: 311 Service data is not inherently formatted to provide Created Ticket Counts</h3>")
                gr.HTML("""

                <ul style="font-size: 18px">

                    <li>Data must be aggregated by day to find ticket counts</li>

                    <li>Covariate features need a special transformation</li>

                    <li>Final Aggregations Mapping</li>

                    <ul style="padding-inline-start: 40px;">

                        <li>Created Date ==> groupby.count ==> Target (Created ticket count)</li>

                        <li>Closed Date ==> Agg* ==> Number of closed tickets (Agg* explained in next tabs)</li>

                        <li>Agency ==> Agg* ==> Number of tickets by Agency (Agg* explained in next tabs)</li>

                        <li>Borough ==> Agg* ==> Number of tickets by Boroguh (Agg* explained in next tabs)</li>

                        <li>Descriptor ==> Agg* ==> Number of tickets by Descriptor Group/Category (Agg* explained in next tabs)</li>

                    </ul>

                </ul>""")
                gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: Weather data is not aggregated by day</h3>")
                gr.HTML("""

                <ul style="font-size: 18px">

                    <li>To merge with 311 Service data, both datasets must be aggregated</li>

                    <li>Additional transformations may be applied only after time aggregation</li>

                    <li>Aggregation function needs to be handled feature by feature</li>

                    <li>Final Aggregation Mapping</li>

                    <ul style="padding-inline-start: 40px;">

                        <li>MaxTemp, MaxSustainedWind  ==> groupby.max ==> Variables have an inherent max feature</li>

                        <li>MinTemp ==> groupby.min ==> Variable has an inherent min feature</li>

                        <li>Rain, SnowIce ==> groupby.mean.round ==> Binary variables are first aggregated than rounded back to binary</li>

                        <li>All Other Variables ==> groupy.mean ==> Mean used by default as it is the least lossy pooling method</li>

                    </ul>

                </ul>""")


            with gr.Tab("Weather Data: Imputation") as wd_impute:
                gr.HTML("<h1 style=\"text-align: center;\">Data Imputation</h1>")
                gr.Markdown("## Goal: Impute missing values in Weather Data")
                gr.HTML("<h3 style=\"color: darkorange;\">Issue: Weather data is incomplete, 49 days are missing in 2018</h3>")
                gr.Markdown("#### Proposed Solution: Use a simple imputer to fill these missing days + 7 more days into the \"future\"")
                gr.HTML("""

                <ul style="font-size: 18px">

                    <li>Use a simple imputer rather than a robust imputation method to reduce model complexity</li>

                    <ul style="padding-inline-start: 40px;">

                        <li>Using a robust imputer = Conducting a multivariate forcast, Very complex & can be slow</li>

                        <li>Using a simple imputer = Low complexity, low latency</li>

                    </ul>

                    <li>Simple imputer applies an aggregate function using Day Of Year (1-366) as the interval</li>

                    <li>4 different Imputation Methods: Mean, Median, Min, Max</li>

                    <li>7 additional days are imputed so the weather data can be used as a future covariate in our model</li>

                    <li>Final Aggregation Mapping</li>

                    <ul style="padding-inline-start: 40px;">

                        <li>WindSpeed, MaxSustainedWind, Gust, SnowDepth => Use Mean => Noisy Variables, Non-Mean/Median methods are too biased, curve best fit with Mean</li>

                        <li>Rain => Use Max => Binary Variables with noise, min/mean/median imputes 0, which does not follow the trend</li>

                        <li>SnowIce => Use Min (impute 0) => Binary variables but mostly 0's, any other imputation is visually inaccurate</li>

                        <li>MeanTemp, MinTemp, MaxTemp, DewPoint, Percipitation => Use Min => Perhaps helping to remove non-stationarity (global warming), Winter is colder now than before, Curve best fits with min</li>

                    </ul>

                </ul>""")

                gr.Markdown("Use plots below to view the plots used to help justify above reasoning")
                with gr.Accordion("Show Plots", open=False):
                    impute_data = gr.State(wd_full_local)
                    impute_choices = ["None"]
                    impute_choices.extend(impute_cols)
                    wd_impute_col = gr.Dropdown(
                        choices=impute_choices,
                        value="None",
                        label="Choose a Variable to plot all imputation methods"
                    )

                    wd_impute_plot = gr.Plot()

                    wd_impute_col.change(
                        plot_imputations,
                        [wd_impute_col, impute_data],
                        [wd_impute_plot]
                    )
           

            with gr.Tab("311: Closed Ticket Counting") as ct_date:
                gr.HTML("<h1 style=\"text-align: center;\">Closed Ticket Feature</h1>")
                gr.Markdown("## The Closed Ticket Feature is built from the Closed Date column similarly to how Created Date was used to generate new 311 Call Volume")
                gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Data Error, Typos, and/or Null valuess</h3>")
                gr.HTML("""

                <ul style="font-size: 18px">

                    <li>Number of Null Values: </li>

                    <li>Number of Closed Dates where Closed Date > Created Date: </li>

                    <ul style="padding-inline-start: 40px;">

                        <li>These values were most likely typos/data recording errors</li>

                        <li>For instance, some of these values dated to 1900</li>

                    </ul>

                    <li>SOLUTION: For every data error, impute with the mean difference (recompute Closed Date based off Created)</li>

                    <li>Mean is calculated as the mean time differential between all valid Closed & Created Dates</li>

                    <li>Mean Time Differential: 13 Days</li>

                </ul>""")
                gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: Data Leakage - Future into Past</h3>")
                gr.HTML("""

                <ul style="font-size: 18px">

                    <li>Most of the Closed Date values are 13 days ahead relative to Created Date</li>

                    <li>GroupBy Closed Date only will lead to some closed ticket counts leaking into future created dates</li>

                    <li>SOLUTION: GroupBy [Closed Date, Created Date] pairwise, filter so Created Date < Closed Date</li>

                </ul>""")
                with gr.Accordion("Code", open=False):
                    gr.Code(Closed_Ticket_Code, language="python")  
            

            with gr.Tab("311: Categorical Grouping") as cat_groups:
                BERTopic = gr.State(topic_model)  # reuse the model loaded above instead of reloading it
                gr.HTML("<h1 style=\"text-align: center;\">Categorical Features</h1>")
                gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Categorical Features have too many categories</h3>")
                gr.Markdown("#### Create a mapping of categories into groups to reduce total number (Viewable at the bottom of the page)")
                gr.HTML("""

                <ul style="font-size: 18px">

                    <li>Borough:</li>

                    <ul style="padding-inline-start: 40px;">

                        <li>Only 9 Categories without grouping</li>

                        <li>Four Categories are either typos or just null => Group all into OTHER</li>

                    </ul>

                    <li>Agency:</li>

                    <ul style="padding-inline-start: 40px;">

                        <li>30 Agencies in total are listed</li>

                        <li>Manual Research to group each Agency by Category of what they typically do</li>

                        <li>30 Agencies down to 7 Agency Groupings, based on frequency and research</li>

                    </ul>

                    <li>Complaint Type: Removed because analysis showed complaints were too related to the agency</li>

                    <ul style="padding-inline-start: 40px;">

                        <li>299 unique pairs out of 271 unique complaints => only ~10% difference in distribution</li>

                    </ul>

                    <li>Descriptor: Over 1000+ unique categories. Only way to realistically group is to use NLP</li>

                    <ul style="padding-inline-start: 40px;">

                        <li>Pretrained a BERTopic model to extract topics from the text</li>

                        <li>BERTopic uses TF-IDF & Transformers to extract topics from text</li>

                        <li>BERTopic reduced 1000 categories into 8 groups</li>

                    </ul>

                </ul>""")
                    
                gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: How do we aggregate by day these features when there are multiple repeated categories per day</h3>")
                gr.Markdown("#### One Hot Encode and Sum per category")
                gr.HTML("""

                <ul style="font-size: 18px">

                    <li>Step 1: One hot encode all the features before aggregation</li>

                    <li>Step 2: GroupBy date and Sum for each encoding</li>

                    <ul style="padding-inline-start: 40px;">

                        <li>Example: A categorical group with 4 categories</li>

                        <li>One Sum column per category representing the frequency of that category per day</li>

                    </ul>

                    <li>Main Downside: Highly correlated with Created Ticket data; aggregation method was essentially the same</li>

                    <ul style="padding-inline-start: 40px;">

                        <li>Summing across the four feature categories in the example above would just equal the ticket count</li>

                    </ul>

                    <li>Solution: Leave some categories out of final vector to reduce bias (Shown in feature engineering stage)</li>

                </ul>""")

                with gr.Accordion("View Feature Groups", open=False):
                    with gr.Accordion("Borough", open=False):
                        gr.JSON(json.loads(open("code/Borough.json", "r").read()))

                    with gr.Accordion("Agency", open=False):
                        gr.JSON(open("code/Agency.json", "r").read())

                    with gr.Accordion("Descriptor", open=False):
                        gr.Dataframe(topic_model.get_topic_info().loc[:, ["Count", "Name", "Representation"]])
                        gr.Plot(topic_model.visualize_barchart(list(range(-1,6,1))))


            with gr.Tab("All Code") as code_preprocess:
                gr.Markdown("# View Full Code for building Weather Data")
                with gr.Accordion(open=False):
                    gr.Code(open("code/build_weather.py", "r").read())

                gr.Markdown("# View Full Code for building 311 Service Data")
                with gr.Accordion(open=False):
                    gr.Code(open("code/build_service.py", "r").read())

        
        with gr.Tab("Exploratory Data Analysis", id="eda_page") as eda_page:
            bivar_data = gr.State(data_merged_eda)
            with gr.Tab("Overview", id="eda_overview") as eda_overview:
                gr.Markdown("# The EDA Section is intended to be a set of interactive visualizations")
                gr.Markdown("The tabs are interactive plots and tables that were used to generate the key insights below.")
                gr.HTML("<h3 style=\"color: darkorange;\">Key Insights</h3>")
                gr.HTML("""

                <ul style="font-size: 18px">

                    <li>Missing Values:</li>

                    <ul style="padding-inline-start: 40px; font-size: 18px;">

                        <li>Gust if used may need interpolation to fill missing values</li>

                    </ul>

                    <li>Stationarity</li>

                    <ul style="padding-inline-start: 40px; font-size: 18px;">

                        <li>Weather variables exhibit various levels of non-stationarity (mostly based on trend but some constant)</li>

                        <ul style="padding-inline-start: 60px; font-size: 18px;">

                            <li>Trends are clear for some like Temperature and DewPoint</li>

                            <li>Possible cause of constant non-stationarity are factors such as global warming</li>

                        </ul>

                        <li>311 Calls may exhibit some forms of weekly non-stationarity</li>

                        <ul style="padding-inline-start: 60px; font-size: 18px;">

                            <li>Potentially weekly and monthly non-stationarity</li>

                            <li>Affected by Holidays and Weekends</li>

                            <li>More robust tests needed</li>

                        </ul>

                        <li>Action Item: Test for stationarity and remove</li>

                    </ul>

                    <li>Bivariate Interactions:</li>

                    <ul style="padding-inline-start: 40px; font-size: 18px;">

                        <li>311 Calls have stronger relationships with certain Agency, Borough and Descriptor categories</li>

                        <li>311 calls exhibit weak overal linear relationships with weather</li>

                        <ul style="padding-inline-start: 60px; font-size: 18px;">

                            <li>Monthly and Seasonal relationship is strongest in winter months</li>

                            <li>Month Of January: strongest linear relationship between MinTemp, DewPoint</li>

                        </ul>

                    </ul>

                    <li>Seasonality:</li>

                    <ul style="padding-inline-start: 40px; font-size: 18px;">

                        <li>Weather variables exhibit a strong Yearly and Seasonal seasonality</li>

                        <li>311 Service Variables exhibit Weekly Seasonality</li>

                        <li>311 Variables affected strongly by holidays and weekends (less 311 calls on weekends and holidays)</li>

                    </ul>

                    <li>Correlation:</li>

                    <ul style="padding-inline-start: 40px; font-size: 18px;">

                        <li>Heavy Collinearity among weather variables (especially Min, Mean, MaxTemp)</li>

                        <li>Varying degrees of correlation among 311 covariates and 311 volume</li>

                    </ul>

                    <li>Lags & Autocorrelation:</li>

                    <ul style="padding-inline-start: 40px; font-size: 18px;">

                        <li>311 Service Calls have highest correlation with 7,14,21 weekly lags</li>

                        <li>6,8 day lag intervals second strongest relationship. 8 day exhibits some negative correlation</li>

                        <li>1 day lag exhibits similar correlation with 6,7 day lags</li>

                    </ul>

                </ul>""")


            with gr.Tab("Univariate", id="eda_univar") as eda_univar:

                with gr.Tab("Weather Data") as eda_uni_weather:
                    eda_univar_weatherdf = gr.State(weather_full_df)
                    gr.Markdown("# Use the Interactive plot below")
                    eda_uni_weather_name = gr.State("Weather")
                    weather_vars = [
                        "", 'MeanTemp', 'DewPoint', 'Percipitation', 'WindSpeed', 'Gust', 'SnowDepth',
                        'MinTemp', 'MaxTemp', 'MaxSustainedWind'
                    ]
                    select_weather_var = gr.Dropdown(
                        choices=weather_vars,
                        value="",
                        label="Select a Variable to View"
                    )

                    weather_uniplot = gr.Plot()

                    select_weather_var.change(
                        plot_timeseries,
                        inputs=[
                            eda_univar_weatherdf,
                            select_weather_var,
                            eda_uni_weather_name
                        ],
                        outputs=[
                            weather_uniplot
                        ]
                    )

                with gr.Tab("311 Service Data") as eda_uni_weather:
                    eda_univar_servicedf = gr.State(data_merged_eda)
                    gr.Markdown("# Use the Interactive plot below")
                    gr.Markdown("**NOTE: Target is the count of 311 service records**")
                    eda_uni_service_name = gr.State("Weather")
                    service_vars = [
                        "", 'Target', 'num_closed_tickets',
                        # Agency Group Counts
                        'AG_Buildings', 'AG_Environment & Sanitation', 'AG_Health',
                        'AG_Parks', 'AG_Security', 'AG_Transportation',
                        'AG_Other',
                        # Borough Counts
                        'Borough_BRONX', 'Borough_BROOKLYN', 'Borough_MANHATTAN',
                        'Borough_QUEENS', 'Borough_STATEN ISLAND',
                        'Borough_OTHER', 
                        # Descriptor Group Counts
                        'DG_damaged_sign_sidewalk_missing',
                        'DG_english_emergency_spanish_chinese',
                        'DG_exemption_commercial_tax_business',
                        'DG_license_complaint_illegal_violation', 'DG_noise_animal_truck_dead',
                        'DG_odor_food_air_smoke', 'DG_order_property_inspection_condition',
                        'DG_water_basin_litter_missed'
                    ]
                    select_service_var = gr.Dropdown(
                        choices=service_vars,
                        value="",
                        label="Select a Variable to View"
                    )

                    service_uniplot = gr.Plot()

                    select_service_var.change(
                        plot_timeseries,
                        inputs=[
                            eda_univar_servicedf,
                            select_service_var,
                            eda_uni_service_name
                        ],
                        outputs=[
                            service_uniplot
                        ]
                    )


            with gr.Tab("Bivariate", id="eda_bivar") as eda_bivar:
                gr.Markdown("# Use the Interactive plot below")
                gr.Markdown("Use this tab to view relationships between the Target variable (number of tickets created daily) and a Covariate")
                with gr.Column():
                    with gr.Row() as bivar_params:
                        bivar_dist_target = gr.Dropdown(
                            choices=["Target"],
                            value="Target",
                            label="Target Variable (One option)"
                        )

                        all_bivars = ['num_closed_tickets', "Agency", "Borough", "Descriptor"]
                        all_bivars.extend(weather_vars)
                        all_bivars = sorted(all_bivars)
                        all_bivars = all_bivars[1:]  # drop the leading "" placeholder carried over from weather_vars
                        bivar_dist_cov = gr.Dropdown(
                            choices=all_bivars,
                            value="MeanTemp",
                            label="Select Covariate"
                        )
                        bivar_trendline = gr.Dropdown(
                            choices=[True, False],
                            value=True,
                            label="Graph with OLS Trendline"
                        )

                    with gr.Accordion("Add Seasonality", open=False):
                        bivar_subset = gr.Dropdown(
                            choices=["None", "Year", "Season", "Month", "Day Of Week", "Weekend", "Holiday"],
                            value="None",
                            label="Seasonality Options (Disabled for Agency, Borough and Descriptor)"
                        )
                
                bivar_submit = gr.Button("Run")
                bivar_plot = gr.Plot()
                bivar_submit.click(
                    plot_bivariate,
                    [bivar_data, bivar_dist_cov, bivar_dist_target, bivar_subset, bivar_trendline],
                    bivar_plot
                )


            with gr.Tab("Seasonality") as bivar_season:
                gr.Markdown("## Exploring the affect of Seasonality")
                
                with gr.Row() as bivar_season_params:
                    bivar_season_var = gr.Dropdown(
                        choices=["Target", 'MeanTemp', 'DewPoint',
                        'Percipitation', 'WindSpeed', 'Gust', 'SnowDepth',
                        'MinTemp', 'MaxTemp', 'MaxSustainedWind'],
                        value="Target",
                        label="Variable"
                    )

                    bivar_season_cov = gr.Dropdown(
                        choices=["Year", "Season", "Month", "Day Of Week", "Weekend", "Holiday", "Rain", "SnowIce"],
                        value="Year",
                        label="Seasonality"
                    )

                    with gr.Column():
                        season_boxplot = gr.Checkbox(value=True, label="Show Boxplot")
                        season_outlier = gr.Checkbox(value=False, label="Show Outliers")

                bivar_season_btn = gr.Button("Run")

                bivar_season_plot = gr.Plot()

                bivar_season_btn.click(
                    plot_seasonality,
                    [bivar_data, bivar_season_cov, bivar_season_var, season_boxplot, season_outlier],
                    [bivar_season_plot]
                )


            with gr.Tab("Correlation") as corr:
                
                with gr.Tab("Weather Correlations") as corr_weather:
                    gr.Plot(plot_all_correlations(data_merged_eda, "weather", method="pearson"))


                with gr.Tab("311 Service Correlations") as corr_service:
                    gr.Plot(plot_all_correlations(data_merged_eda, "service", method="pearson"))


                with gr.Tab("Lag Correlations") as corr_dynamic:
                    gr.Markdown("## Use this to dynamically view correlations based on Lag")
                    gr.Markdown("By Default, we will analyze lags of [0,1,2,3,4,5,6,7,8,13,14,15,21] days for chosen variable")
                    gr.Markdown("Scroll Down For AutoCorrelation Graphs")
                    with gr.Row():
                        corr_vars = [
                            "None", 'Target', 'num_closed_tickets',
                            # Weather Variables
                            'MeanTemp', 'DewPoint', 'Percipitation',
                            'WindSpeed', 'Gust', 'SnowDepth',
                            'MinTemp', 'MaxTemp', 'MaxSustainedWind',
                            # Agency Group Counts
                            'AG_Buildings', 'AG_Environment & Sanitation', 'AG_Health',
                            'AG_Parks', 'AG_Security', 'AG_Transportation',
                            'AG_Other',
                            # Borough Counts
                            'Borough_BRONX', 'Borough_BROOKLYN', 'Borough_MANHATTAN',
                            'Borough_QUEENS', 'Borough_STATEN ISLAND',
                            'Borough_OTHER', 
                            # Descriptor Group Counts
                            'DG_damaged_sign_sidewalk_missing',
                            'DG_english_emergency_spanish_chinese',
                            'DG_exemption_commercial_tax_business',
                            'DG_license_complaint_illegal_violation', 'DG_noise_animal_truck_dead',
                            'DG_odor_food_air_smoke', 'DG_order_property_inspection_condition',
                            'DG_water_basin_litter_missed'
                        ]
                        corr_vars = gr.Dropdown(
                            choices=corr_vars,
                            value="Target",
                            label="Variable"
                        )
                    
                    corr_btn = gr.Button("Run")
                    corr_plot = gr.Plot()
                    autocorr_plot = gr.Plot()

                    corr_btn.click(
                        plot_correlations,
                        [bivar_data, corr_vars],
                        [corr_plot]
                    )

                    corr_btn.click(
                        plot_autocorr,
                        [bivar_data, corr_vars],
                        [autocorr_plot]
                    )


        with gr.Tab("Feature Engineering") as feature_engineer_page:

            
            with gr.Tab("Feature Selection") as feature_select:
                gr.HTML("<h1 style=\"text-align: center;\">Select Features Based on EDA</h1>")
                gr.Markdown("### Below is the logic used in our model feature selection")
                gr.HTML("""

                <ul style="font-size: 18px">

                    <li>Weather Covariates</li>

                    <ul style="padding-inline-start: 30px; font-size: 18px;">

                        <li>Weather variables exhibit various levels of non-stationarity (mostly based on trend but some constant)</li>

                        <li>MeanTemp, MaxTemp: High collinearity with MinTemp. MinTemp has highest correlation of 3 => REMOVE</li>

                        <ul style="padding-inline-start: 50px; font-size: 18px;">

                            <li>Possible Reason: High temps, people stay indoors. A/C doesn't break nowadays. Lower Temps lead to building/tech failure more often</li>

                        </ul>

                        <li>Percipitation: Bivariate plot shows weak relationship, outliers no effect on 311 => REMOVE</li>

                        <li>SnowDepth: High number missing values, low correlation => REMOVE</li>

                        <li>Rain, SnowIce: Binary, plots (look in Seasonality Tab) show weak relationship, SnowIce heavily imbalanced (99% 0's) => REMOVE</li>

                    </ul>

                    <li>311 Service Covariates:</li>

                    <ul style="padding-inline-start: 30px; font-size: 18px;">

                        <li>LOO (Leave One - or many - Out) Encoding:</li>

                        <ul style="padding-inline-start: 50px; font-size: 18px;">

                            <li>Remove weakest features from our categorical covariates</li>

                            <li>Reduces bias and removes multicollinearity inherent to One-Hot Encoding</li>

                            <li>Candidates For Removal:</li>

                            <ul style="padding-inline-start: 70px; font-size: 18px;">

                                <li>AG_Health, AG_Other: Lowest Correlation, lowest counts => REMOVE</li>

                                <li>AG_Parks: Lowest Correlation, but low multi-collinearity => KEEP</li>

                                <li>Borough_OTHER: Weakest Correlation, lowest count => REMOVE</li>

                                <li>DG_english_emergency, DG_exemption_commercial: Weakest Correlation, lowest counts => REMOVE</li>

                                <li>DG_odor_food_air_smoke: Lowest Count, but high correlation => KEEP</li>

                            </ul>

                        </ul>

                    </ul>

                </ul>""")
                
                with gr.Accordion("Show Final Variable List", open=False):
                    gr.JSON(json.loads(open("code/all_vars.json","r").read()))

            
            with gr.Tab("Feature Preprocessing") as feature_prep:
                data_feature_prep = gr.State(data_preprocess)
                gr.HTML("<h1 style=\"text-align: center;\">Preprocess Features</h1>")
                gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Missing Values</h3>")
                gr.HTML("""

                <ul style="font-size: 18px">

                    <li>Only One value has missing values to impute: Gust</li>

                        <ul style="padding-inline-start: 30px; font-size: 18px;">

                            <li>Various interpolation methods were tested</li>

                            <li>Methods like Spline and Polynomial over-estimated some values, breaking inherent data ranges</li>

                            <li>Turns out Simple Linear interpolation was best</li>

                        </ul>

                    <li>SOLUTION: Interpolate Gust with Linear method</li>

                </ul>""")

                with gr.Accordion("Show Interpolation Plots", open=False):
                    gr.Plot(plot_interpolation(data_preprocess))

                gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: Remove Non-Stationarity</h3>")
                gr.HTML("""

                <ul style="font-size: 18px">

                    <li>Variables that are non-stationary change over time, they have a trend</li>

                    <li>Ideal to transform non-stationarity variables for modeling</li>

                    <li>Ignore Categorical Variables (simply to keep model complexity low)</li>

                    <li>Numerical Variables were tested for Non-Stationarity using two methods: ADF and KPSS</li>

                    <ul style="padding-inline-start: 30px; font-size: 18px;">

                        <li>Using ADF and KPSS together can reveal what kind of trend exists in the data</li>

                        <li>Only 1 Case Met: Pass KPSS, Fail ADF = Trend Stationary (most likely by season)</li>

                    </ul>

                    <li>Only Two Variables failed the tests: DewPoint & MinTemp</li>

                    <li>SOLUTION: Use Differencing (7d lag) + Log for MinTemp and Differencing (7d lag) for DewPoint (Log caused many NaNs)</li>

                </ul>""")

                with gr.Accordion("View Results Below", open=False):
                    gr.Markdown("### MinTemp (Log) Tests Before and After Transformation")
                    with gr.Row():
                        with gr.Column():
                            gr.Dataframe(test_stationary(data_preprocess, "MinTemp_old"), label="MinTemp No Augments")
                        with gr.Column():
                            gr.Dataframe(test_stationary(data_preprocess, "MinTemp"), label="Log + 7 Day Lag Differencing")

                    gr.Markdown("### DewPoint Tests Before and After Transformation")
                    with gr.Row():
                        with gr.Column():
                            gr.Dataframe(test_stationary(data_preprocess, "DewPoint_old"), label="DewPoint No Augments")
                        with gr.Column():
                            gr.Dataframe(test_stationary(data_preprocess, "DewPoint"), label="7 Day Lag Differencing")
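                # A hedged sketch of the two stationarity tests described above, using
                # statsmodels' adfuller and kpss; `series` is assumed to be a pandas
                # Series (the app's own check lives in test_stationary()).
                def _stationarity_tests_sketch(series):
                    from statsmodels.tsa.stattools import adfuller, kpss
                    clean = series.dropna()
                    adf_p = adfuller(clean)[1]  # ADF null hypothesis: unit root (non-stationary)
                    kpss_p = kpss(clean, regression="c", nlags="auto")[1]  # KPSS null: stationary
                    # Pass KPSS (p > 0.05) but fail ADF (p > 0.05) => trend-stationary
                    return {"ADF p-value": adf_p, "KPSS p-value": kpss_p}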
                

            with gr.Tab("Feature Engineering") as feature_eng:


                with gr.Tab("Past Covariates") as fe_past:
                    gr.HTML("<h1 style=\"text-align: center;\">Past Covariate Features</h1>")
                    gr.Markdown("""

                    * Past covariates are variables whose values are only known for past time steps

                    * For instance, using past sales of product B to predict future sales of product A

                    * There are two ways to use past covariates

                    * *Option 1:* Build a multi-variate forecast to predict these variables simultaneously

                    * *Option 2:* Use a sliding window and lags to provide past data (especially for multi-step forecasts)

                    """)
                    gr.Markdown("**I will use Option 2 to avoid building a very complex multi-variate model**")
                    gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Leaking Future Data into the past</h3>")
                    gr.Markdown("""

                    * By using lags, I can shift my data so that no future information leaks into the features

                    * For predicting 7 days into the future, I must lag my data by at least 7 days

                    * Use a rolling window that resets over time

                    """)
                    gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: Curse of Dimensionality</h3>")
                    gr.Markdown("""

                    * Many features can be generated from variations of lags, rolling windows, and differences

                    * Too many features lead to the curse of dimensionality, i.e. overfitting

                    * Thus, I keep my feature set as simple as possible

                    """)
                    gr.Markdown("""

                    ### Feature Set

                    * Lags: 7D, 14D, 21D

                    * Rolling (shifted 7 days forward): mean of 14D (14 because mean(Closed Date - Created Date) is about 13 days)

                    * Differencing (7D difference = 7D lag - 14D lag): 7D

                    """)
                    
                    
                    with gr.Accordion("Open to view implementation code", open=False):
                        gr.Code(open("code/past_features.py","r").read())


                with gr.Tab("Future Covariates") as fe_past:
                    gr.HTML("<h1 style=\"text-align: center;\">Past Covariate Features</h1>")
                    gr.Markdown("""

                    * Future Covariates are data that I have about the future

                    * For Instance, I can use the projected revenue of Company A to predict daily sales

                    * Future covariates do not need to be shifted to avoid leakage; I still provide lags of up to 2 days

                    * I also add rolling- and expanding-window features

                    * I use mean and min aggregations, following the EDA finding that minimum temperatures appear more impactful on 311 volume

                    """)
                    gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Curse of Dimensionality</h3>")
                    gr.Markdown("""

                    * Similar to the past covariates, I keep my feature set as simple and as small as possible

                    * The more features, the more we may overfit

                    """)
                    gr.Markdown("""

                    ### Feature Set

                    * Lags: 0D, 1D, 2D

                    * Rolling: Mean & Min of last 14D

                    * Expanding Window: Max, Min (min-length of 14)

                    * Differencing already performed to remove trends

                    """)
                    
                    with gr.Accordion("Open to view implementation code", open=False):
                        gr.Code(open("code/future_features.py","r").read())


                with gr.Tab("Target Variable") as fe_past:
                    gr.HTML("<h1 style=\"text-align: center;\">311 Service Calls Features</h1>")
                    gr.Markdown("""

                    * For feature transformations of the target, we can follow a similar process to the one above

                    * Main difference: lags shorter than the prediction window must be recomputed at each iteration

                    * So, to predict at time (t+1), we need the predicted value at time (t)

                    * For a recursive prediction model, this means the model cannot make batch predictions without iterating

                    """)
                    gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: More variables increase complexity for prediction</h3>")
                    gr.Markdown("""

                    * The more features, the more overfitting & more computation

                    * As I will use a recursive model, these values must be recomputed at each step t+1

                    * In favor of a less complex model, I choose as few features as possible (excluding rolling features, as they are prone to error when recalculated)

                    """)
                    gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Leaking Future Data into the past</h3>")
                    gr.Markdown("""

                    * Must be careful about how these features are computed

                    * For instance, for the rolling mean, I shift the data by 1 lag first, then compute the rolling mean

                    * For differencing, a 7D difference is really lag 1D - lag 8D (at time t, the diff is value(t-1) - value(t-8), not value(t) - value(t-7))

                    """)
                    gr.Markdown("""

                    ### Feature Set

                    * Lags: 1D, 6D, 7D, 8D, 14D, 21D (based on highest correlations and weekly seasonality)

                    * Differencing: 7D, 14D

                    """)

                    with gr.Accordion("Open to view implementation code", open=False):
                        gr.Code(open("code/target_features.py","r").read())

        
        with gr.Tab("Forecast Model") as model_select_train_page:


            with gr.Tab("Splitting the data") as model_data_split:
                gr.HTML("<h1 style=\"text-align: center;\">Splitting Time-Series Data</h1>")
                gr.HTML("""

                <ul style="font-size: 18px">

                    <li>Splitting time-series data is different from splitting other data</li>

                    <li>Rather than splitting on random samples, you split the data by time, keeping the order intact</li>

                    <li>I took a 75% approach: the data is split at the date that sits at 75% of its length</li>

                </ul>""")
                gr.Markdown("#### As an example, I provide a graph showing exactly how I split my data")
                gr.Plot(plot_train_split())
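                # A minimal sketch of the order-preserving 75% split described above,
                # assuming a date-sorted pandas DataFrame; the cut lands on the row
                # sitting at 75% of the data's length.
                def _time_split_sketch(df, frac=0.75):
                    cut = int(len(df) * frac)
                    return df.iloc[:cut], df.iloc[cut:]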


            with gr.Tab("Model Selection") as model_data_split:
                gr.HTML("<h1 style=\"text-align: center;\">Choosing the Right Model</h1>")
                gr.Markdown("### Types of Forecast Models for Multi-Step Prediction")
                gr.HTML("""

                <ul style="font-size: 18px">

                    <li>Parallel Models: Train a model for each prediction (one for 1 day ahead, another for 2, etc.)</li>

                    <li>Recursive Models: Model makes a forecast, fills any values it needs for the next prediction, predicts again</li>

                    <ul style="padding-inline-start: 40px; font-size: 18px;">

                        <li>One of the assumptions was to build a model that was reasonable for production</li>

                        <li>Parallel models are hard to maintain as the number of prediction steps grows</li>

                    </ul>

                    <li>Decision: Recursive Model</li>

                </ul>""")
                gr.Markdown("### My Model Choice: XGBoost")
                gr.HTML("""

                <ul style="font-size: 18px">

                    <li>Reasons for choosing:</li>

                    <ul style="padding-inline-start: 40px; font-size: 18px;">

                        <li>Industry standard for regression</li>

                        <li>Lightweight and relatively fast</li>

                        <li>Many parameters to tune, such as tree depth and regularization</li>

                        <li>Scale invariant - Data does not have to be scaled</li>

                        <li>Allows NaN values and categorical features without encodings (unused in my implementation)</li>

                        <li>Provides key explainability in its feature importance metrics</li>

                    </ul>

                    <li>Decision: Use XGBoost</li>

                </ul>""")


            with gr.Tab("Model Training") as model_data_split:
                gr.HTML("<h1 style=\"text-align: center;\">Training the Model</h1>")
                gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Overfitting</h3>")
                gr.HTML("""

                <ul style="font-size: 18px">

                    <li>Main Cause: High number of variables and XGBoost's tendency to overfit without tuning</li>

                    <li>While training, care was taken to watch the relative performance of the training and validation sets</li>

                    <li>Steps Taken to avoid Overfitting</li>

                    <ul style="padding-inline-start: 40px; font-size: 18px;">

                        <li>Low Learning Rate</li>

                        <li>Low Tree Depth</li>

                        <li>Keeping Val score relatively close to Training score</li>

                        <li>Increased l2-lambda parameter, boosting regularization</li>

                        <li>Many trials to get best set of parameters</li>

                        <li>Implementing Early Stopping</li>

                    </ul>

                </ul>""")
                gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: Choosing a Metric</h3>")
                gr.HTML("""

                <ul style="font-size: 18px">

                    <li>Three metrics I considered: MAPE, MAE and MSE</li>

                    <li>MAPE seemed to show the most consistent and visually accurate results</li>

                    <li>Decision: MAPE</li>

                    <li>Justification: 311 service volume is quite noisy, and MAPE better captures fit to a very noisy curve than the others</li>

                </ul>""")


            with gr.Tab("Model Prediction") as model_data_split:
                gr.HTML("<h1 style=\"text-align: center;\">Recursive Model Prediction</h1>")
                gr.Markdown("""

                * Below is the code I wrote to implement the recursive prediction explained in the previous tabs

                * Predictions are made one step at a time, where prediction t depends on prediction t-1

                * To view the final predictions made by the model, see below

                """)
                gr.Code(open("code/recurse_predict.py","r").read())
                with gr.Accordion("View 7 Day Model Forecast", open=False):
                    gr.Plot(plot_final_predictions())
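                # A hedged sketch of the recursive loop; the real code is shown above
                # from code/recurse_predict.py. `build_features` is a hypothetical
                # helper standing in for the target-feature computation, and `history`
                # is assumed to be a plain list of past daily call counts.
                def _recursive_forecast_sketch(model, history, horizon=7):
                    preds = []
                    for _ in range(horizon):
                        x_next = build_features(history)          # hypothetical: 2D feature row for t+1
                        y_next = float(model.predict(x_next)[0])  # one-step forecast
                        preds.append(y_next)
                        history = history + [y_next]              # feed prediction t into step t+1
                    return preds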


            with gr.Tab("Model Evaluation") as model_eval_page:
                gr.HTML("<h1 style=\"text-align: center;\">Forecast Results</h1>")
                gr.Markdown("Overall, the model seemed to have performed pretty well. The MAPE is also <10% for both Validation and Training sets.")
                gr.Markdown("The model did suffer from a low validation R2, but this was difficult to resolve without compromising overall performance of the model.")
                gr.Markdown("The predictions seem to visually pass most backtests, which can be viewed in the graph below.")
                with gr.Accordion("Model Prediction Scores", open=False):
                    gr.JSON({"Train R2": r2_train, "Train MAPE": mape_train, "Validation R2": r2_val, "Validation MAPE": mape_val})
                gr.Image("figures/model_performance.png", show_download_button=False)


            with gr.Tab("Feature Importance") as model_eval_page:
                gr.HTML("<h1 style=\"text-align: center;\">Feature Importance</h1>")
                gr.Markdown("""

                * Below you can view the feature importance metrics from the XGBoost model

                * The weather variables appear to have a significant impact on 311 service call volume

                * Interestingly, some categories also appear to be more impactful than others

                """)
                gr.Plot(plot_model_feature_importance())
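                # A hedged sketch: a fitted XGBoost model exposes importances directly
                # via feature_importances_; the plot above is generated by the app's
                # own plot_model_feature_importance().
                def _importance_sketch(model, feature_names):
                    # Pair each feature with its importance score, highest first
                    return sorted(
                        zip(feature_names, model.feature_importances_),
                        key=lambda kv: kv[1],
                        reverse=True,
                    )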
                    

        with gr.Tab("Future Work & Limitations") as future_limitations_page:
            gr.Markdown("# Future Work")
            gr.Markdown("""

            * **Multi-Variate Time Series Forecasting** rather than imputing values naively

            * Testing more kinds of models such as LightGBM

            * Robustly testing parameters of current model using GridSearchCV

            * Comparing performance of my forecast model to others

            * More Data! Having more 311 Call data may help find other indicators

            """)
            gr.Markdown("# Future Deployments")
            gr.Markdown("""

            * Containerize the model and load onto an API for ingestion

            * Containerize data preprocessing and load into a Spark Cluster

            * Create triggers and view tables to verify data preprocessing

            * Create functions to monitor model performance

            """)

        with gr.Tab("Appendix") as future_limitations_page:
            
            with gr.Tab("Weather Data Analysis") as dp_weather:
                dp_weather_state = gr.State("weather")
                with gr.Column():
                    with gr.Row():
                        dp_weather_category = gr.Dropdown(
                            choices=["2011-2018", "2016-2018"],
                            value="2011-2018",
                            label="Time Range"
                        )

                        dp_weather_var = gr.Dropdown(
                            choices = ["MeanTemp", "MinTemp", "MaxTemp", "DewPoint", "Percipitation", "WindSpeed", "MaxSustainedWind", "Gust", "Rain", "SnowDepth", "SnowIce"],
                            value = "MeanTemp",
                            label = "Variable"
                        )
                        
                    dp_weather_btn = gr.Button("Run")
            
                dp_weather_report = gr.HTML(value=iframe_dp_weather)

                dp_weather_btn.click(
                    run_report,
                    [dp_weather_state, dp_weather_var, dp_weather_category],
                    dp_weather_report,
                )

            with gr.Tab("Service Data Analysis") as dp_service:
                dp_service_state = gr.State("service")
                dp_service_category = gr.State("full")
                with gr.Column():
                    dp_service_var = gr.Dropdown(
                        choices = [
                            "Created Date", "Closed Date", "Agency", "Agency Name",
                            "Complaint Type", "Descriptor", "Location Type", "Landmark",
                            "Facility Type", "Status", "Community Board", "Borough",
                            "Open Data Channel Type", "Park Facility Name", "Park Borough",
                            "Vehicle Type", "Taxi Company Borough", "Taxi Pick Up Location",
                            "Bridge Highway Name", "Bridge Highway Direction", "Road ramp",
                            "Bridge Highway Segment"
                        ],
                        value = "Created Date",
                        label = "Select Variable and Run"
                    )
                    dp_service_btn = gr.Button("Run")
            
                dp_service_report = gr.HTML(value=iframe_dp_service)

                dp_service_btn.click(
                    run_report,
                    [dp_service_state, dp_service_var, dp_service_category],
                    dp_service_report,
                )

def main():
    app.launch(share=False)
    return app


if __name__ == "__main__":
    main()