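"""Build the aggregated NYC daily weather dataset used alongside the 311 service data.

Pipeline: read raw station-level weather, restrict stations to the service
data's bounding box, aggregate per day, add temporal features, and impute
the missing days plus a 7-day "forecast" window.
"""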
import numpy as np
import pandas as pd

# create_datetime, build_temporal_features, and impute_missing_weather are
# project helpers assumed to be imported from elsewhere in this repo.

# Build all weather data from file
def build_weather_data(filename):
    # Use pandas to read the raw station-level weather file
    weather_data = pd.read_csv(filename)
    # Aggregate Year, Month, Day into a datetime column, because the 311 data uses datetime
    weather_data["Datetime"] = weather_data["Year"].astype("str") + "-" + weather_data["Month"].astype("str") + "-" + weather_data["Day"].astype("str")
    weather_data = create_datetime(weather_data, "Datetime", format="%Y-%m-%d")
    # LOCALIZE
    # Pre-recorded min/max values from the service data (so we don't need to recompute them)
    lat_min = 40.49804421521046
    lat_max = 40.91294056699566
    long_min = -74.25521082506387
    long_max = -73.70038354802529
    # Create the conditions for location matching
    mincon_lat = weather_data["Latitude"] >= lat_min
    maxcon_lat = weather_data["Latitude"] <= lat_max
    mincon_long = weather_data["Longitude"] >= long_min
    maxcon_long = weather_data["Longitude"] <= long_max
    # Localize our data to match the service data
    wd_localized = weather_data.loc[mincon_lat & maxcon_lat & mincon_long & maxcon_long]
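    # The bounds above roughly trace New York City's bounding box, so only
    # stations within the city's extent survive this filter.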
    # Station metadata and coordinates are no longer needed after localization
    drop_cols = [
        "USAF",
        "WBAN",
        "StationName",
        "State",
        "Latitude",
        "Longitude"
    ]
    wd_localized = wd_localized.drop(columns=drop_cols)
    # AGGREGATE
    # Map columns to their aggregation method
    mean_cols = [
        'MeanTemp',
        'DewPoint',
        'Percipitation',  # (sic) spelling matches the source data's column name
        'WindSpeed',
        'Gust',
        'SnowDepth',
    ]
    min_cols = [
        'MinTemp'
    ]
    max_cols = [
        'MaxTemp',
        'MaxSustainedWind'
    ]
    round_cols = [
        'Rain',
        'SnowIce'
    ]
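    # Continuous measurements are averaged across stations; extremes keep the
    # city-wide min/max; the flag-like columns (Rain, SnowIce) are mean-then-rounded,
    # i.e. a majority vote across stations. The groupby results share the same
    # Datetime index, so the axis=1 concat below aligns them row-for-row.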
    # Perform aggregation
    mean_df = wd_localized.groupby("Datetime")[mean_cols].mean()
    min_df = wd_localized.groupby("Datetime")[min_cols].min()
    max_df = wd_localized.groupby("Datetime")[max_cols].max()
    round_df = wd_localized.groupby("Datetime")[round_cols].mean().round().astype(np.int8)
    wd_full = pd.concat([mean_df, min_df, max_df, round_df], axis=1)
    # Add seasonal features
    wd_full = build_temporal_features(wd_full, "Datetime")
    wd_full["Season"] = wd_full["Season"].astype("category")
    wd_full = wd_full.set_index("Datetime")
    # Impute the 49 missing days plus the 7 days after 12/31/2018;
    # the extra week acts as our "weather forecast"
    time_steps = 49 + 7
    # Columns to impute
    impute_cols = [
        'MeanTemp', 'MinTemp', 'MaxTemp', 'DewPoint',
        'Percipitation', 'WindSpeed', 'MaxSustainedWind',
        'Gust', 'Rain', 'SnowDepth', 'SnowIce',
    ]
    # Split the columns by imputation strategy
    mean_vars = ["WindSpeed", "MaxSustainedWind", "Gust", "SnowDepth"]
    min_vars = ["SnowIce", "MeanTemp", "MinTemp", "MaxTemp", "DewPoint", "Percipitation"]
    max_vars = ["Rain"]
    # Use the imported function to create the imputed data
    preds_mean = impute_missing_weather(wd_full, strategy="mean", time_steps=time_steps, impute_cols=mean_vars)
    preds_min = impute_missing_weather(wd_full, strategy="min", time_steps=time_steps, impute_cols=min_vars)
    preds_max = impute_missing_weather(wd_full, strategy="max", time_steps=time_steps, impute_cols=max_vars)
    all_preds = pd.concat([preds_mean, preds_min, preds_max], axis=1)
    all_preds = build_temporal_features(all_preds.loc[:, impute_cols], "Datetime")
    all_preds = all_preds.set_index("Datetime")
    # Keep the modeling window (2016 onward) and append the imputed days
    wd_curr = wd_full.loc[wd_full["Year"] >= 2016]
    wd_df = pd.concat([wd_curr, all_preds], axis=0, join="outer")
    return wd_df
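

# A minimal usage sketch; "weather_nyc.csv" is a hypothetical filename, and the
# helper functions referenced above must be importable for this to run.
if __name__ == "__main__":
    wd_df = build_weather_data("weather_nyc.csv")
    print(wd_df.shape)
    print(wd_df.tail(7))  # the final rows cover the imputed "forecast" week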