import numpy as np
import pandas as pd

# Columns that process_data discards when they are present in the input.
_REDUNDANT_COLUMNS = (
    "Unnamed: 0",
    "Minimum Temperature",
    "Maximum Temperature",
    "Snow Depth",
    "Heat Index",
    "Precipitation Cover",
    "Wind Gust",
    "Wind Chill",
    "Info",
    "Latitude",
    "Longitude",
    "Address",
    "Resolved Address",
    "Name",
)

# Ordinal scores for the labels found in the 'Weather Type' column.
_RAIN_VALUES = {
    'Heavy Rain': 7,
    'Snow And Rain Showers': 6,
    'Rain Showers': 5,
    'Rain': 4,
    'Light Rain': 3,
    'Light Drizzle': 2,
    'Drizzle': 1,
}
_STORM_VALUES = {
    'Dust storm': 1,
    'Lightning Without Thunder': 2,
    'Thunderstorm Without Precipitation': 3,
    'Thunderstorm': 4,
}
# Ordinal scores for the labels found in the 'Conditions' column.
_OVERVIEW_VALUES = {
    'Clear': 1,
    'Partially cloudy': 2,
    'Rain': 2,
    'Overcast': 3,
}


def map_weather_to_numerical(weather, ordinal_values):
    """Return the highest ordinal score among comma-separated weather labels.

    Args:
        weather: Comma-separated labels, e.g. ``"Rain, Thunderstorm"``.
            Whitespace around each label is ignored. A non-string value
            (``None``, NaN) is treated as "no labels".
        ordinal_values: Mapping from label to its numeric score.

    Returns:
        The largest score of any label present in ``ordinal_values``, or 0
        when no label matches. The result is never below 0.
    """
    # Missing cells arrive as None/NaN, which have no .split().
    if not isinstance(weather, str):
        return 0

    val = 0
    for label in weather.split(","):
        label = label.strip()
        if label in ordinal_values:
            val = max(ordinal_values[label], val)
    return val


def standardizeX(X, mean, std):
    """Standardize the leading features of a 3-D array in place.

    For every index ``i`` of ``mean``, ``X[:, :, i]`` is replaced by
    ``(X[:, :, i] - mean[i]) / std[i]``. Features beyond ``len(mean)`` are
    left untouched.

    Args:
        X: Array indexed as ``X[:, :, i]``; it is modified in place.
            NOTE(review): presumably a float array -- an integer array
            cannot hold the scaled values; confirm against callers.
        mean: Per-feature means, indexable by position.
        std: Per-feature standard deviations, indexable by position.

    Returns:
        The same ``X`` object, after modification.
    """
    for i, feature_mean in enumerate(mean):
        X[:, :, i] = (X[:, :, i] - feature_mean) / std[i]
    return X


def process_data(df):
    """Turn a raw weather table into a purely numeric feature table.

    Steps: drop redundant columns, linearly interpolate gaps in numeric
    columns, rename 'Relative Humidity' to 'Humidity', encode
    'Weather Type' and 'Conditions' as the ordinal columns 'Rain', 'Storm'
    and 'Overview', and replace 'Date time' with sine/cosine encodings of
    the time of day and the time of year.

    The input frame is not modified; a new frame is returned.

    Args:
        df: DataFrame containing at least the columns 'Date time',
            'Conditions' and 'Weather Type'.

    Returns:
        A new DataFrame with the remaining input columns followed by
        'Rain', 'Storm', 'Overview', 'sin(day)', 'cos(day)', 'sin(year)'
        and 'cos(year)'.

    Raises:
        KeyError: If 'Date time', 'Conditions' or 'Weather Type' is missing.
    """
    # Work on a private copy so the caller's frame is never altered.
    present = set(df.columns)
    df = df.drop(
        columns=[col for col in _REDUNDANT_COLUMNS if col in present]
    ).copy()

    # Interpolate numeric columns only: text columns cannot be interpolated,
    # and their gaps are filled with explicit defaults below.
    numeric_cols = df.select_dtypes(include="number").columns
    if len(numeric_cols) > 0:
        df[numeric_cols] = df[numeric_cols].interpolate()

    # A missing 'Conditions' cell counts as clear weather; a missing
    # 'Weather Type' cell counts as no reported weather events.
    conditions = df["Conditions"].fillna("Clear")
    weather_type = df["Weather Type"].fillna("")
    timestamps = pd.to_datetime(df["Date time"])

    df = df.rename(columns={"Relative Humidity": "Humidity"})
    df = df.drop(columns=["Date time", "Weather Type", "Conditions"])

    # Ordinal encodings of the textual weather descriptions.
    df["Rain"] = weather_type.apply(
        map_weather_to_numerical, args=(_RAIN_VALUES,)
    )
    df["Storm"] = weather_type.apply(
        map_weather_to_numerical, args=(_STORM_VALUES,)
    )
    df["Overview"] = conditions.apply(
        map_weather_to_numerical, args=(_OVERVIEW_VALUES,)
    )

    # Encode the timestamp as a position on the daily and yearly cycles so
    # that, e.g., 23:59 and 00:00 end up close together.
    seconds = timestamps.map(pd.Timestamp.timestamp)
    day_in_seconds = 24 * 3600
    year_in_seconds = day_in_seconds * 365.2425
    day_angle = (seconds * (2 * np.pi)) / day_in_seconds
    year_angle = (seconds * (2 * np.pi)) / year_in_seconds

    df["sin(day)"] = np.sin(day_angle)
    df["cos(day)"] = np.cos(day_angle)
    df["sin(year)"] = np.sin(year_angle)
    df["cos(year)"] = np.cos(year_angle)

    return df