# kolkata-weather-forecaster / data_processing.py
# Arijit-hazra's picture
# Initial commit
# a3d05c5
import numpy as np
import pandas as pd
# function to map weather values to numerical values
def map_weather_to_numerical(weather, ordinal_values):
    """Return the highest ordinal value among the labels listed in *weather*.

    Args:
        weather: Comma-separated weather labels, e.g. ``"Rain, Overcast"``.
            Whitespace around each label is ignored. A non-string value
            (such as ``None`` or a NaN from a missing cell) is treated as
            carrying no labels.
        ordinal_values: Mapping from label to its numeric rank.

    Returns:
        The largest rank among the labels found in ``ordinal_values``, never
        less than 0; 0 when no label is recognised.
    """
    if not isinstance(weather, str):
        # Missing cells arrive as None/NaN; calling .split on them would raise.
        return 0

    labels = (label.strip() for label in weather.split(","))
    ranks = [ordinal_values[label] for label in labels if label in ordinal_values]
    # The trailing 0 is the floor used when nothing matches.
    return max(ranks + [0])
def standardizeX(X, mean, std):
    """Standardize the leading features of ``X`` in place and return ``X``.

    For every index ``k`` below ``len(mean)``, the slice ``X[:, :, k]`` is
    replaced by ``(X[:, :, k] - mean[k]) / std[k]``. Slices at indices
    ``len(mean)`` and above are left untouched.

    Args:
        X: Array indexed as ``X[:, :, k]``; it is modified in place.
        mean: Per-feature offsets; its length decides how many slices change.
        std: Per-feature divisors, indexed in step with ``mean``.

    Returns:
        The same ``X`` object that was passed in.
    """
    feature_count = len(mean)
    for k in range(feature_count):
        centered = X[:, :, k] - mean[k]
        X[:, :, k] = centered / std[k]
    return X
def process_data(df):
    """Turn a raw weather table into a purely numeric feature frame.

    Steps, in order:
      1. Drop columns listed as redundant (only those actually present).
      2. Linearly interpolate missing values in the numeric columns.
      3. Fill missing 'Conditions' with "Clear" and 'Weather Type' with "".
      4. Rename 'Relative Humidity' to 'Humidity'.
      5. Encode 'Weather Type' as ordinal 'Rain' and 'Storm' columns and
         'Conditions' as an ordinal 'Overview' column.
      6. Replace 'Date time' with sin/cos encodings of the daily and yearly
         cycles.

    Args:
        df: DataFrame containing at least the 'Date time', 'Conditions' and
            'Weather Type' columns. It is not modified; every step works on
            a new frame.

    Returns:
        A new DataFrame with the remaining input columns followed by
        'Rain', 'Storm', 'Overview', 'sin(day)', 'cos(day)', 'sin(year)'
        and 'cos(year)'.

    Raises:
        KeyError: If 'Date time', 'Conditions' or 'Weather Type' is missing.
    """
    redundant_cols = [
        "Unnamed: 0", "Minimum Temperature", "Maximum Temperature",
        "Snow Depth", "Heat Index", "Precipitation Cover", "Wind Gust",
        "Wind Chill", "Info", "Latitude", "Longitude", "Address",
        "Resolved Address", "Name",
    ]
    # drop() without inplace returns a new frame, so the caller's DataFrame
    # is left exactly as it was passed in.
    df = df.drop(columns=[col for col in redundant_cols if col in df.columns])

    # Interpolate only numeric columns: interpolating a frame that still
    # holds text columns is deprecated in recent pandas releases.
    numeric_cols = df.select_dtypes(include="number").columns
    if len(numeric_cols) > 0:
        df[numeric_cols] = df[numeric_cols].interpolate()

    # Assign the filled Series back instead of using a chained
    # `df[col].fillna(..., inplace=True)`, which is not guaranteed to write
    # through to the frame (and does not under Copy-on-Write).
    df["Conditions"] = df["Conditions"].fillna("Clear")
    df["Weather Type"] = df["Weather Type"].fillna("")

    df = df.rename(columns={"Relative Humidity": "Humidity"})

    # Ordinal encodings: when several labels appear in one cell, the
    # highest-ranked one wins (see map_weather_to_numerical).
    rain_values = {
        'Heavy Rain': 7, 'Snow And Rain Showers': 6, 'Rain Showers': 5,
        'Rain': 4, 'Light Rain': 3, 'Light Drizzle': 2, 'Drizzle': 1,
    }
    storm_values = {
        'Dust storm': 1, 'Lightning Without Thunder': 2,
        'Thunderstorm Without Precipitation': 3, 'Thunderstorm': 4,
    }
    # NOTE(review): 'Rain' shares rank 2 with 'Partially cloudy'; kept as-is
    # because downstream consumers may depend on this encoding - confirm.
    overview = {'Clear': 1, 'Partially cloudy': 2, 'Rain': 2, 'Overcast': 3}

    df["Rain"] = df["Weather Type"].apply(
        lambda s: map_weather_to_numerical(s, rain_values))
    df["Storm"] = df["Weather Type"].apply(
        lambda s: map_weather_to_numerical(s, storm_values))
    df["Overview"] = df["Conditions"].apply(
        lambda s: map_weather_to_numerical(s, overview))

    # Seconds since the epoch, kept as a local Series rather than as
    # temporary columns that would have to be dropped again.
    seconds = pd.to_datetime(df["Date time"]).map(pd.Timestamp.timestamp)

    df = df.drop(columns=["Date time", "Weather Type", "Conditions"])

    # Encode time on the unit circle so the end of a day/year sits next to
    # its beginning.
    day_in_seconds = 24 * 3600
    year_in_seconds = day_in_seconds * 365.2425
    for label, period in (("day", day_in_seconds), ("year", year_in_seconds)):
        angle = (seconds * (2 * np.pi)) / period
        df[f"sin({label})"] = np.sin(angle)
        df[f"cos({label})"] = np.cos(angle)

    return df