Spaces:

davidna22
/

dna-casestudy

Sleeping

File size: 3,597 Bytes

dad00c5

# Build all weather data from file
def build_weather_data(filename):
    """Build the daily weather table, with imputed rows appended.

    The pipeline reads the weather CSV, combines its Year/Month/Day columns
    into a single ``Datetime`` column, keeps only the stations that fall
    inside a fixed latitude/longitude bounding box, aggregates the station
    readings per ``Datetime``, adds temporal features and finally appends
    the rows produced by ``impute_missing_weather``.

    Args:
        filename: Anything ``pd.read_csv`` accepts (path or file-like).
            The file must provide the columns referenced below (Year,
            Month, Day, Latitude, Longitude, the station identifier
            columns and the weather measurements).

    Returns:
        A DataFrame indexed by ``Datetime``: the aggregated observations
        followed by the imputed rows (outer join on columns).
    """
    weather_data = pd.read_csv(filename)

    # Combine Year, Month, Day into one "Y-M-D" string column and hand it to
    # create_datetime; the 311 service data is keyed by datetime as well.
    # NOTE(review): create_datetime is defined elsewhere; presumably it
    # parses the column with the given format - confirm.
    weather_data["Datetime"] = (
        weather_data["Year"].astype("str")
        + "-" + weather_data["Month"].astype("str")
        + "-" + weather_data["Day"].astype("str")
    )
    weather_data = create_datetime(weather_data, "Datetime", format="%Y-%m-%d")

    # LOCALIZE
    # Pre-recorded min/max coordinates from the service data, so they do not
    # have to be recomputed here.
    lat_min = 40.49804421521046
    lat_max = 40.91294056699566
    long_min = -74.25521082506387
    long_max = -73.70038354802529

    # Series.between is inclusive on both ends by default, i.e. the same as
    # (value >= min) & (value <= max).
    in_lat_range = weather_data["Latitude"].between(lat_min, lat_max)
    in_long_range = weather_data["Longitude"].between(long_min, long_max)

    # Keep only the rows inside the bounding box, then drop the station
    # metadata that is no longer needed once the rows are selected.
    drop_cols = [
        "USAF",
        "WBAN",
        "StationName",
        "State",
        "Latitude",
        "Longitude",
    ]
    wd_localized = weather_data.loc[in_lat_range & in_long_range]
    wd_localized = wd_localized.drop(columns=drop_cols)

    # AGGREGATE
    # Each group of columns is reduced per Datetime with its own method.
    mean_cols = [
        'MeanTemp',
        'DewPoint',
        'Percipitation',
        'WindSpeed',
        'Gust',
        'SnowDepth',
    ]
    min_cols = [
        'MinTemp',
    ]
    max_cols = [
        'MaxTemp',
        'MaxSustainedWind',
    ]
    round_cols = [
        'Rain',
        'SnowIce',
    ]

    # Build the grouping once and reuse it for every reduction.
    by_day = wd_localized.groupby("Datetime")
    mean_df = by_day[mean_cols].mean()
    min_df = by_day[min_cols].min()
    max_df = by_day[max_cols].max()
    # Rain / SnowIce: take the mean across stations and round it to an
    # integer value stored as int8.
    round_df = by_day[round_cols].mean().round().astype(np.int8)
    wd_full = pd.concat([mean_df, min_df, max_df, round_df], axis=1)

    # Add seasonal features. The result is expected to expose "Season" and
    # "Datetime" columns, since both are used right below.
    wd_full = build_temporal_features(wd_full, "Datetime")
    wd_full["Season"] = wd_full["Season"].astype("category")
    wd_full = wd_full.set_index("Datetime")

    # We impute the 7 days after 12/31/2018 along with the 49 missing days;
    # this acts as our "weather forecast".
    time_steps = 49 + 7

    # Column order applied to the combined imputation result.
    impute_cols = [
        'MeanTemp', 'MinTemp', 'MaxTemp', 'DewPoint',
        'Percipitation', 'WindSpeed', 'MaxSustainedWind',
        'Gust', 'Rain', 'SnowDepth', 'SnowIce',
    ]

    # Columns grouped by the imputation strategy passed for them.
    mean_vars = ["WindSpeed", "MaxSustainedWind", "Gust", "SnowDepth"]
    min_vars = ["SnowIce", "MeanTemp", "MinTemp", "MaxTemp", "DewPoint", "Percipitation"]
    max_vars = ["Rain"]

    # Use the imported function to create the imputed data, one call per
    # strategy, then stitch the pieces together column-wise.
    preds_mean = impute_missing_weather(wd_full, strategy="mean", time_steps=time_steps, impute_cols=mean_vars)
    preds_min = impute_missing_weather(wd_full, strategy="min", time_steps=time_steps, impute_cols=min_vars)
    preds_max = impute_missing_weather(wd_full, strategy="max", time_steps=time_steps, impute_cols=max_vars)
    all_preds = pd.concat([preds_mean, preds_min, preds_max], axis=1)
    all_preds = build_temporal_features(all_preds.loc[:, impute_cols], "Datetime")
    all_preds = all_preds.set_index("Datetime")

    # NOTE(review): the complete aggregated history is returned; no year
    # filter is applied to wd_full here - confirm that is the intent.
    wd_df = pd.concat([wd_full, all_preds], axis=0, join="outer")

    return wd_df