#!/usr/bin/env python
# coding: utf-8
# ## Data Loading
# Importing the necessary libraries, like pandas, numpy and some plotting libraries such as matplotlib and seaborn
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline (IPython / Jupyter only).
get_ipython().run_line_magic('matplotlib', 'inline')
# Plot defaults: font size, figure size and a fully transparent figure background.
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',
})
# Load the raw data into the pandas dataframe named **df**.
df = pd.read_csv('Walmart.csv')
# **About Data:**
# * Store - the store number
# * Date - the week of sales
# * Weekly_Sales - sales for the given store
# * Holiday_Flag - whether the week is a special holiday week 1 – Holiday week 0 – Non-holiday week
# * Temperature - Temperature on the day of sale
# * Fuel_Price - Cost of fuel in the region
# * CPI – Prevailing consumer price index
# * Unemployment - Prevailing unemployment rate
# **Insights:**
# * Here the target column is Weekly_Sales.
# * The data relates to Walmart stores in the United States of America, where **Store** and **Holiday_Flag** are categorical in nature.
# * The data is collected over 45 stores, and Weekly_Sales gives the sales of the corresponding store.
# ## Data Exploration and Modification
# In[ ]: # it gives the information (like count and data type) of the dataset
# Here the Date column is of type **object** and the remaining columns are **integer or float** in nature. Using pandas, I change the Date column's datatype (i.e. object) into a pandas datetime.
# Using the Date column I create three separate columns for weekday, month and year and add them to the existing dataset.
# In[ ]:
# Parse the Date strings into datetimes first: the ``.dt`` accessor used below
# raises AttributeError on a plain object (string) column. The call is a no-op
# if the column is already datetime.
# NOTE(review): dayfirst=True assumes the CSV stores dates as DD-MM-YYYY -
# TODO confirm against Walmart.csv and drop the flag if dates are month-first.
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
# Derive the calendar features from the parsed date.
df['weekday'] = df['Date'].dt.weekday  # pandas convention: Monday=0 ... Sunday=6
df['month'] = df['Date'].dt.month
df['year'] = df['Date'].dt.year
# The raw Date column is not used again once its parts are extracted.
df.drop(['Date'], axis=1, inplace=True)
# Hence the modified dataset is look like:
# Explored the unique values of the weekday, month and year columns as follows:
# Show which distinct values each derived calendar column contains.
for caption, column in (
    ('years unique value', 'year'),
    ('months unique value', 'month'),
    ('weekday unique value', 'weekday'),
):
    print(caption, df[column].unique())
# Months and weekdays are as usual, but the data is taken from the years 2010, 2011 and 2012 only.
# Now, to get an idea of the distribution of the dataset, I used the describe function, which gives a table of various statistical values for all the columns.
# **Insights:**
# * Temperature - has values ranging from -2 to 100.1 degrees Fahrenheit.
# * CPI - ranges from 126 to 227 with a standard deviation of 39.35
# * Unemployment - ranges from 3.87 to 14.31 with a standard deviation of 1.87
# Keep an untouched copy so the row count before de-duplication can be compared.
original_df = df.copy()
# Checking for duplicate rows:
counter = 0  # NOTE(review): never read in the visible file; kept in case a later cell uses it
rs, cs = original_df.shape
# Actually drop the duplicates; without this step the shape comparison below
# compares the frame with its own copy and can never detect anything.
df = df.drop_duplicates()
if df.shape == (rs, cs):
    print('The dataset doesn\'t have any duplicates')
else:
    # f-string so the number of removed rows is interpolated instead of
    # printing the literal "{rs-df.shape[0]}" text.
    print(f'Number of duplicates dropped/fixed ---> {rs-df.shape[0]}')
# Checking of missing values :
# Dataset doesn't have null values
# ## Data Visualization
# Here we have:
# **Numerical columns:** Weekly_sales, temperature, fuel_price, cpi, unemployment
# **Categorical columns:** Holiday_flag, Weekday, month, year
# Now plotted the count plot to get the distribution or frequency of the columns
# 2x2 grid of count plots, one per categorical column, filled row by row.
fig, axes = plt.subplots(2, 2, figsize=(16, 8))
for axis, column in zip(axes.ravel(), ['Holiday_Flag', 'weekday', 'month', 'year']):
    sns.countplot(x=column, data=df, ax=axis)
# **Insights:**
# * In Holiday_Flag, most of the weeks are non-holiday weeks.
# * In the weekday column, observations are mostly related to day 4.
# * Most of the observations in the data are from the month of April.
# * Most of the observations in the data are from the year 2011.
# To get an idea of how many observations there are in the dataset corresponding to each store, I again plot a count plot.
# One wide figure with the number of rows recorded for each store.
plt.figure(figsize=(18, 8))
sns.countplot(data=df, x='Store', ax=plt.gca())
# All the store have equal number of data in the set
# To analyze the distribution of the data, I plotted the histogram and boxplot for Temperature, Unemployment, Fuel_Price, CPI.
# One grid row per numeric column: histogram on the left, box plot on the right.
fig, axes = plt.subplots(4, 2, figsize=(16, 16))
numeric_columns = ['Temperature', 'Unemployment', 'Fuel_Price', 'CPI']
for (hist_axis, box_axis), column in zip(axes, numeric_columns):
    sns.histplot(x=column, data=df, ax=hist_axis)
    sns.boxplot(x=column, data=df, ax=box_axis)
# **Insights:**
# * Temperature: There are outliers at the lower temperatures.
# * Unemployment: Outliers are present at both the higher and the lower values.
# * CPI: It is either very low or very high.
# Removing the outlier from Temperature column
# Drop rows outside the 1.5*IQR fences, first for Temperature and then for
# Unemployment. The Unemployment quartiles are computed on the frame that has
# already been filtered on Temperature, exactly as in a step-by-step run.
for column in ('Temperature', 'Unemployment'):
    Q1, Q3 = df[column].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    # Both fences are inclusive, matching the <= / >= comparisons.
    df = df[df[column].between(Q1-(1.5*IQR), Q3+(1.5*IQR))]
# In the process of removing outliers, **484 data** points are removed from the dataset
# ## Encoding
# Encoding is the process of converting categorical columns into numerical columns, as it is not good practice to train a model with categorical inputs.
# Categorical columns that get one-hot encoded.
cat_cols = ['Store', 'Holiday_Flag', 'weekday', 'month', 'year']
df[cat_cols].nunique()  # number of unique values per categorical column (notebook display)
# Imported OneHotEncoder to perform the encoding
from sklearn.preprocessing import OneHotEncoder
# Create the encoder. The output format is left at its default (sparse) and
# densified with .toarray() below: the ``sparse=`` keyword was renamed to
# ``sparse_output`` and later removed from scikit-learn, so passing either name
# ties the script to a version range.
encoder = OneHotEncoder(handle_unknown='ignore')
# Fit the encoder on the categorical columns; transform() cannot run before this.
encoder.fit(df[cat_cols])
# Names of the generated indicator columns (``get_feature_names`` was removed
# from scikit-learn; ``get_feature_names_out`` is its replacement).
encoded_cols = list(encoder.get_feature_names_out(cat_cols))
# df is a filtered slice at this point; take an owned copy so adding columns
# does not trigger pandas' SettingWithCopyWarning.
df = df.copy()
# Add the indicator columns next to the existing ones (the original categorical
# columns are kept in the frame, as before).
df[encoded_cols] = encoder.transform(df[cat_cols]).toarray()
# ## Standardization
# To scale all the column values to the range 0 - 1, I used the min-max scaler (MinMaxScaler). Scaling is important so that all the columns carry equal weight.
# In[ ]:
# Importing a MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
# Creating the scaler object
scaler = MinMaxScaler()
# Fit the scaler on the dataset and transform it in one step; calling
# transform() on a scaler that was never fitted raises NotFittedError.
# NOTE(review): the scaler sees every row (and the Weekly_Sales target) before
# the train/test split further down - confirm this is intended.
scaled_df = scaler.fit_transform(df)
# The scaler returns a plain array; wrap it back into a dataframe with the
# original column names.
scaled_df = pd.DataFrame(data=scaled_df, columns=df.columns)
# Checking the output dataframe
# ## Train-Test-Split
# Split the dataset into the two part:
# 1. Training dataset (used to train the model)
# 2. Testing dataset (used to test the model)
# Drop the sales columns to get the input features
from sklearn.model_selection import train_test_split

# Target: the (scaled) weekly sales; inputs: every other column.
y = scaled_df['Weekly_Sales']
X = scaled_df.drop(columns=['Weekly_Sales'])
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Shapes of the four resulting pieces (notebook display).
X_train.shape, X_test.shape, y_train.shape, y_test.shape
# ## Feature Selection
# Not all of the 78 features are important, so we have to choose the important ones out of all the features.
# import a linear regerssion model
from sklearn.linear_model import LinearRegression
# import a Random Forest Regressor model
from sklearn.ensemble import RandomForestRegressor
# import a mean squared error for model evaluation
from sklearn.metrics import mean_squared_error
# import a r2 score for model evaluation
from sklearn.metrics import r2_score
# import a RFE model for feature selection
from sklearn.feature_selection import RFE
# Creatint a list to store training and test error
# Train / test RMSE for every feature count tried by the sweep below.
Trr = []
Tss = []
# NOTE(review): n, order, Trd and Tsd are not read anywhere in the visible
# file; they are kept unchanged in case a later cell depends on them.
n = 3
order = ['ord-'+str(i) for i in range(2, n)]
Trd = pd.DataFrame(np.zeros((10, n-2)), columns=order)
Tsd = pd.DataFrame(np.zeros((10, n-2)), columns=order)
# Number of elimination steps to try. ``m`` was used without being defined in
# the visible file; this value lets the sweep go from all features down to two.
# NOTE(review): confirm against the original notebook.
m = X_train.shape[1] - 1
for i in range(m):
    # Linear model used by RFE to rank the features.
    lm = LinearRegression()
    # Keep all but ``i`` features.
    rfe = RFE(lm, n_features_to_select=X_train.shape[1]-i)
    # Fit the RFE selector on the training data.
    rfe = rfe.fit(X_train, y_train)
    # Fresh linear model trained on the selected features only.
    LR = LinearRegression()
    LR.fit(X_train.loc[:, rfe.support_], y_train)
    # Predictions on the training and the test data.
    pred1 = LR.predict(X_train.loc[:, rfe.support_])
    pred2 = LR.predict(X_test.loc[:, rfe.support_])
    # Store the RMSE (square root of the MSE) for train and test.
    Trr.append(np.sqrt(mean_squared_error(y_train, pred1)))
    Tss.append(np.sqrt(mean_squared_error(y_test, pred2)))
# x axis = number of features eliminated.
plt.plot(Trr, label='Train RMSE')
plt.plot(Tss, label='Test RMSE')
# Without a legend the labels passed above are never shown.
plt.legend()
# If we recursively eliminate at most **ten** features, the score is at its best.
# Eliminating 10 features and using the Linear Regression model, the errors are printed as follows, which is the best possible score.
# creating a linear regression model object
# Linear model used by RFE to rank the features.
lm = LinearRegression()
# NOTE(review): the surrounding text speaks of eliminating 10 features, but
# this keeps all but 9 - confirm which count is intended.
rfe = RFE(lm, n_features_to_select=X_train.shape[1]-9)
# Fit the RFE selector on the training data.
rfe = rfe.fit(X_train, y_train)
# Fresh linear model trained on the selected features only.
LR = LinearRegression()
LR.fit(X_train.loc[:, rfe.support_], y_train)
# Predictions on the training and the test data.
pred1 = LR.predict(X_train.loc[:, rfe.support_])
pred2 = LR.predict(X_test.loc[:, rfe.support_])
# The printed values are square roots of the MSE, so they are labelled RMSE.
print("RMSE train", np.sqrt(mean_squared_error(y_train, pred1)))
print("RMSE test", np.sqrt(mean_squared_error(y_test, pred2)))
print("r2_score train - {}".format(r2_score(y_train, pred1)))
print("r2_score test - {}".format(r2_score(y_test, pred2)))
# From here on, train and test sets only contain the features RFE kept.
X_train = X_train.loc[:, rfe.support_]
X_test = X_test.loc[:, rfe.support_]
# Now onwards I am going to use various models
# ## Linear Regression
# Fit ordinary least squares on the selected training features.
lr = LinearRegression().fit(X_train, y_train)
pred1 = lr.predict(X_train)
pred2 = lr.predict(X_test)
# RMSE is the square root of the MSE; np.mean of a scalar MSE just returned the
# MSE, so the train figure was not comparable with the test figure.
print("Root Mean Squared Error train {}".format(np.sqrt(mean_squared_error(y_train, pred1))))
print("Root Mean Squared Error test {}".format(np.sqrt(mean_squared_error(y_test, pred2))))
print("r2_score train {}".format(r2_score(y_train, pred1)))
print("r2_score test {}".format(r2_score(y_test, pred2)))
# **Ridge Regression**
from sklearn.linear_model import Ridge
# Fit ridge regression (default regularisation strength) on the training data.
rr = Ridge().fit(X_train, y_train)
predrr1 = rr.predict(X_train)
predrr2 = rr.predict(X_test)
# RMSE is the square root of the MSE; np.mean of a scalar MSE just returned the
# MSE, so the train figure was not comparable with the test figure.
print("Root Mean Squared Error train {}".format(np.sqrt(mean_squared_error(y_train, predrr1))))
print("Root Mean Squared Error test {}".format(np.sqrt(mean_squared_error(y_test, predrr2))))
print("r2_score train {}".format(r2_score(y_train, predrr1)))
print("r2_score test {}".format(r2_score(y_test, predrr2)))
# **Lasso Regression**
from sklearn.linear_model import Lasso
# Fit lasso regression (default regularisation strength) on the training data.
# Note: this rebinds ``lr``, which earlier held the LinearRegression model.
lr = Lasso().fit(X_train, y_train)
predlr1 = lr.predict(X_train)
predlr2 = lr.predict(X_test)
# RMSE is the square root of the MSE; np.mean of a scalar MSE just returned the
# MSE, so the train figure was not comparable with the test figure.
print("Root Mean Squared Error train {}".format(np.sqrt(mean_squared_error(y_train, predlr1))))
print("Root Mean Squared Error test {}".format(np.sqrt(mean_squared_error(y_test, predlr2))))
print("r2_score train {}".format(r2_score(y_train, predlr1)))
print("r2_score test {}".format(r2_score(y_test, predlr2)))
# **ElasticNet Regression**
# In[ ]:
from sklearn.linear_model import ElasticNet
# Fit elastic-net regression (default hyper-parameters) on the training data.
en = ElasticNet().fit(X_train, y_train)
# Note: these reuse the predlr1 / predlr2 names from the Lasso cell.
predlr1 = en.predict(X_train)
predlr2 = en.predict(X_test)
# RMSE is the square root of the MSE; np.mean of a scalar MSE just returned the
# MSE, so the train figure was not comparable with the test figure.
print("Root Mean Squared Error train {}".format(np.sqrt(mean_squared_error(y_train, predlr1))))
print("Root Mean Squared Error test {}".format(np.sqrt(mean_squared_error(y_test, predlr2))))
print("r2_score train {}".format(r2_score(y_train, predlr1)))
print("r2_score test {}".format(r2_score(y_test, predlr2)))
# **Polynomial Regression**
from sklearn.preprocessing import PolynomialFeatures
# Train / test RMSE per polynomial degree (degrees 2 and 3).
Trr = []
Tss = []
for i in range(2, 4):
    poly_reg = PolynomialFeatures(degree=i)
    # Learn the feature expansion on the training data only and reuse it for
    # the test data, instead of refitting the transformer on the test set.
    pl_X_train = poly_reg.fit_transform(X_train)
    pl_X_test = poly_reg.transform(X_test)
    # Linear regression on the expanded features.
    lr = LinearRegression().fit(pl_X_train, y_train)
    pred_poly_train = lr.predict(pl_X_train)
    Trr.append(np.sqrt(mean_squared_error(y_train, pred_poly_train)))
    pred_poly_test = lr.predict(pl_X_test)
    Tss.append(np.sqrt(mean_squared_error(y_test, pred_poly_test)))
plt.plot(range(2, 4), Trr, label='Training')
plt.plot(range(2, 4), Tss, label='Testing')
plt.title('Polynomial Feature on training data')
# Without a legend the labels passed above are never shown.
plt.legend()
# It is clear that, among the polynomial degrees tried, degree 2 gives the better bias-variance tradeoff.
# Degree-2 polynomial expansion, learned on the training data and reused
# unchanged for the test data.
poly_reg = PolynomialFeatures(degree=2)
pl_X_train = poly_reg.fit_transform(X_train)
pl_X_test = poly_reg.transform(X_test)
# Linear regression on the expanded features.
lr = LinearRegression().fit(pl_X_train, y_train)
pred_poly_train = lr.predict(pl_X_train)
# r2_score expects (y_true, y_pred); it is not symmetric, so passing the
# predictions first reports a different (wrong) score.
print("r2_score train {}".format(r2_score(y_train, pred_poly_train)))
pred_poly_test = lr.predict(pl_X_test)
print("r2_score test {}".format(r2_score(y_test, pred_poly_test)))
# RMSE is the square root of the MSE; np.mean of a scalar MSE just returned the
# MSE, so the train figure was not comparable with the test figure.
print("Root Mean Squared Error train {}".format(np.sqrt(mean_squared_error(y_train, pred_poly_train))))
print("Root Mean Squared Error test {}".format(np.sqrt(mean_squared_error(y_test, pred_poly_test))))
#creating a tabel
# Summary of the linear-family results, one list entry per model in the order
# given by the index below. The numbers are hard-coded from an earlier run.
# NOTE(review): the 'Train RMSE' figures were printed with
# np.mean(mean_squared_error(...)) in this script, so they are presumably MSE
# values rather than RMSE - re-run the models before relying on them.
tabel = {
    'Train R2': [0.9324387485162124, 0.9323641360074176, 0.0, 0.0, 0.9563932198334125],
    'Test R2': [0.9223162582948724, 0.9219331606995953, -0.00014816618161050954, -0.00014816618161050954, -0.0005599911350040454],
    'Train RMSE': [0.0016695395619648289, 0.0016713833486400986, 0.024711495499242828, 0.024711495499242828, 0.0010346077251656776],
    'Test RMSE': [0.04569350618906344, 0.04580603645234492, 0.16395383804559885, 0.16395383804559885, 730742413.004261],
}  # closing brace was missing, which made the whole file a syntax error
df_new = pd.DataFrame(tabel)
df_new.index = ['Linear Regression', 'Ridge Regression', 'Lasso Regression', 'ElasticNet Regression', 'Polynomial Regression']
# It is clear that Linear Regression is the best model for this dataset so far, with a test accuracy (R2 score) of approximately 92%.
# To improve the accuracy further we can apply other regressors, e.g. Random Forest or gradient boosting.
# Now I am going to try to improve the accuracy to 98% - 99%. For this I use Decision Tree, Random Forest, etc.
# **Decision Tree Regressor**
# In[ ]:
from sklearn.tree import DecisionTreeRegressor
# Baseline: decision tree with default hyper-parameters.
dt = DecisionTreeRegressor().fit(X_train, y_train)
pred_dt1 = dt.predict(X_train)
pred_dt2 = dt.predict(X_test)
print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_dt1))))
print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_dt2))))
# For scikit-learn regressors .score() is the R^2 coefficient of determination.
print('Accuracy Score train: ', dt.score(X_train, y_train))
print('Accuracy Score test: ', dt.score(X_test, y_test))
# Sweep the maximum tree depth from 1 to 39 and report the metrics per depth.
max_depth_range = np.arange(1, 40, 1)
for x in max_depth_range:
    dt = DecisionTreeRegressor(max_depth=x).fit(X_train, y_train)
    pred_dt1 = dt.predict(X_train)
    pred_dt2 = dt.predict(X_test)
    print('for max_depth: ', x)
    print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_dt1))))
    print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_dt2))))
    print('Accuracy Score train: ', dt.score(X_train, y_train))
    print('Accuracy Score test: ', dt.score(X_test, y_test))
# The Decision Tree has its maximum accuracy at a **maximum depth of 39**
# **Random Forest Regressor**
from sklearn.ensemble import RandomForestRegressor
# Baseline: random forest with default hyper-parameters.
rfc = RandomForestRegressor().fit(X_train, y_train)
pred_rfc1 = rfc.predict(X_train)
pred_rfc2 = rfc.predict(X_test)
print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_rfc1))))
print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_rfc2))))
# Score the forest itself; scoring ``dt`` here reported the decision tree left
# over from the previous section instead of this model.
print('Accuracy Score train: ', rfc.score(X_train, y_train))
print('Accuracy Score test: ', rfc.score(X_test, y_test))
# Sweep the maximum tree depth from 1 to 39 and report the metrics per depth.
# Note: the loop keeps the ``dt`` / ``pred_xg*`` names used by the original
# script even though the model is a random forest.
max_depth_range = np.arange(1, 40, 1)
for x in max_depth_range:
    dt = RandomForestRegressor(max_depth=x).fit(X_train, y_train)
    pred_xg1 = dt.predict(X_train)
    pred_xg2 = dt.predict(X_test)
    # Printed once per depth (the line was duplicated before).
    print('for max_depth: ', x)
    print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_xg1))))
    print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_xg2))))
    print('Accuracy Score train: ', dt.score(X_train, y_train))
    print('Accuracy Score test: ', dt.score(X_test, y_test))
# At a depth of **36** the **Random Forest Regressor** has its maximum accuracy.
# Refit the random forest at the depth chosen from the sweep above.
rfc = RandomForestRegressor(max_depth=36).fit(X_train, y_train)
pred_rfc1 = rfc.predict(X_train)
pred_rfc2 = rfc.predict(X_test)
print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_rfc1))))
print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_rfc2))))
# For scikit-learn regressors .score() is the R^2 coefficient of determination.
print('Accuracy Score train: ', rfc.score(X_train, y_train))
print('Accuracy Score test: ', rfc.score(X_test, y_test))
# **XG Boost Regressor**
from xgboost import XGBRegressor
# Baseline: gradient-boosted trees with default hyper-parameters.
xg = XGBRegressor().fit(X_train, y_train)
pred_xg1 = xg.predict(X_train)
pred_xg2 = xg.predict(X_test)
print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_xg1))))
print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_xg2))))
# Sweep the maximum tree depth from 1 to 14 and report the metrics per depth.
# Note: the loop keeps the ``dt`` name used by the original script even though
# the model is an XGBRegressor.
max_depth_range = np.arange(1, 15, 1)
for x in max_depth_range:
    dt = XGBRegressor(max_depth=x).fit(X_train, y_train)
    pred_xg1 = dt.predict(X_train)
    pred_xg2 = dt.predict(X_test)
    print('for max_depth: ', x)
    print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_xg1))))
    print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_xg2))))
    print('Accuracy Score train: ', dt.score(X_train, y_train))
    print('Accuracy Score test: ', dt.score(X_test, y_test))
# This means a **maximum depth of 9** gives the best accuracy
# Refit XGBoost at the depth chosen from the sweep above.
xg = XGBRegressor(max_depth=9).fit(X_train, y_train)
pred_xg1 = xg.predict(X_train)
pred_xg2 = xg.predict(X_test)
print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_xg1))))
print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_xg2))))
print('Accuracy Score train: ', xg.score(X_train, y_train))
print('Accuracy Score test: ', xg.score(X_test, y_test))
# Summary of the tree-based results, one list entry per model in the order
# given by the index below. The numbers are hard-coded from an earlier run.
tabel1 = {
    'Train Score': [0.9679040861170889, 0.9637587048322853, 0.9601543222728802],
    'Test Score': [0.8808466556220073, 0.9028060343874318, 0.9115195955339979],
    'Train RMSE': [0.02816270639447925, 0.02992618589836856, 0.03137907401148098],
    'Test RMSE': [0.05659037012899937, 0.051110374979016944, 0.04876553192516943],
}  # closing brace was missing, which made the whole file a syntax error
df1 = pd.DataFrame(tabel1)
df1.index = ['Decision Tree', 'Random Forest', 'XGBoost']
# Among these methods, XGBoost is the best for this dataset.
# Comparing Linear Regression and XGBoost, we can conclude that Linear Regression is best suited to the above dataset.
