#!/usr/bin/env python
# coding: utf-8

# ## Data Loading

# Import the necessary libraries: pandas and numpy for data handling, and matplotlib and seaborn for plotting.

# In[ ]:

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')

# Set the default font size, figure size, and grid style for the plots.

# In[ ]:

sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10, 6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'

# Read the data into a pandas DataFrame named **df**.

# In[ ]:

df = pd.read_csv('Walmart.csv')

# In[ ]:

df

# **About the data:**
# * Store - the store number
# * Date - the week of sales
# * Weekly_Sales - sales for the given store in that week
# * Holiday_Flag - whether the week is a special holiday week (1 = holiday week, 0 = non-holiday week)
# * Temperature - temperature on the day of sale
# * Fuel_Price - cost of fuel in the region
# * CPI - prevailing consumer price index
# * Unemployment - prevailing unemployment rate

# **Insights:**
#
# * The target column is Weekly_Sales.
# * The data comes from Walmart stores in the United States of America; **Store** and **Holiday_Flag** are categorical in nature.
# * The data is collected over 45 stores, and Weekly_Sales gives the sales of the corresponding store.

# ## Data Exploration and Modification

# In[ ]:

df.info()  # shows the column names, non-null counts, and data type of each column

# The Date column is an **object**, while the remaining columns are **integer or float**. Using pandas, I convert the Date column from object to a pandas datetime.

# In[ ]:

# Note: pd.to_datetime parses ambiguous dates month-first by default;
# if this file stores day-first dates, pass dayfirst=True (see the check below).
df.Date = pd.to_datetime(df.Date)

# Using the Date column, I create three separate columns - weekday, month, and year - and add them to the existing dataset.

# In[ ]:

df['weekday'] = df.Date.dt.weekday
df['month'] = df.Date.dt.month
df['year'] = df.Date.dt.year

# Now I drop the Date column, as it is no longer needed.

# In[ ]:

df.drop(['Date'], axis=1, inplace=True)

# The modified dataset looks like:

# In[ ]:

df.head(3)

# Explore the unique values of the weekday, month, and year columns:

# In[ ]:

print('years unique values', df.year.unique())
print('months unique values', df.month.unique())
print('weekday unique values', df.weekday.unique())

# Months and weekdays are as expected, and the data covers the years 2010, 2011, and 2012 only.

# To get an idea of the distribution of the dataset, I use the describe function, which gives a table of summary statistics for all the columns.

# In[ ]:

df.describe()

# **Insights:**
# * Temperature - ranges from -2 to 100.1 degrees Fahrenheit.
# * CPI - ranges from 126 to 227 with a standard deviation of 39.35.
# * Unemployment - ranges from 3.87 to 14.31 with a standard deviation of 1.87.
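# A side note on the Date parsing above: pd.to_datetime assumes month-first
# dates for ambiguous strings. Whether this CSV is day-first is an assumption
# worth testing rather than trusting; since the data is weekly, every date
# should fall on the same weekday, which gives a quick sanity check (a sketch,
# re-reading the raw Date column since it has already been dropped from df):

# In[ ]:

raw_dates = pd.read_csv('Walmart.csv')['Date']
for day_first in (False, True):
    parsed = pd.to_datetime(raw_dates, dayfirst=day_first)
    # The setting that yields a single unique day name is the correct one.
    print('dayfirst =', day_first, '->', parsed.dt.day_name().unique())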
# In[ ]:

original_df = df.copy()  # keep a copy of the DataFrame to check for duplicate rows

# Checking for duplicate values:

# In[ ]:

rs, cs = original_df.shape
df.drop_duplicates(inplace=True)
if df.shape == (rs, cs):
    print("The dataset doesn't have any duplicates")
else:
    # f-string so the count is actually interpolated
    print(f'Number of duplicates dropped ---> {rs - df.shape[0]}')

# Checking for missing values:

# In[ ]:

df.isnull().sum()  # the dataset doesn't have any null values

# ## Data Visualization

# In[ ]:

df.head(3)

# Here we have:
#
# **Numerical columns:** Weekly_Sales, Temperature, Fuel_Price, CPI, Unemployment
#
# **Categorical columns:** Holiday_Flag, weekday, month, year
#
# Now I plot count plots to see the frequency distribution of the categorical columns.

# In[ ]:

fig, axes = plt.subplots(2, 2, figsize=(16, 8))
sns.countplot(x='Holiday_Flag', data=df, ax=axes[0, 0])
sns.countplot(x='weekday', data=df, ax=axes[0, 1])
sns.countplot(x='month', data=df, ax=axes[1, 0])
sns.countplot(x='year', data=df, ax=axes[1, 1]);

# **Insights:**
#
# * Holiday_Flag: most weeks are non-holiday weeks.
# * weekday: observations fall almost entirely on day 4 (Friday), as expected for weekly snapshots.
# * month: April contributes the most observations.
# * year: 2011 contributes the most observations.

# To see how many observations the dataset holds for each store, I plot another count plot.

# In[ ]:

plt.figure(figsize=(18, 8))
sns.countplot(x='Store', data=df)
plt.show()

# All the stores have an equal number of observations in the dataset.

# In[ ]:

df.head(1)

# To analyze the distribution of the data, I plot a histogram and a boxplot for Temperature, Unemployment, Fuel_Price, and CPI.

# In[ ]:

fig, axes = plt.subplots(4, 2, figsize=(16, 16))
sns.histplot(x='Temperature', data=df, ax=axes[0, 0])
sns.boxplot(x='Temperature', data=df, ax=axes[0, 1])
sns.histplot(x='Unemployment', data=df, ax=axes[1, 0])
sns.boxplot(x='Unemployment', data=df, ax=axes[1, 1])
sns.histplot(x='Fuel_Price', data=df, ax=axes[2, 0])
sns.boxplot(x='Fuel_Price', data=df, ax=axes[2, 1])
sns.histplot(x='CPI', data=df, ax=axes[3, 0])
sns.boxplot(x='CPI', data=df, ax=axes[3, 1]);

# **Insights:**
#
# * Temperature: there are outliers at the low end of the range.
# * Unemployment: outliers are present at both the high and low ends.
# * CPI: the distribution is bimodal - values cluster at the low and high ends.
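# Before dropping anything, it helps to quantify what the boxplots show. A small
# sketch counting points outside the 1.5 * IQR whiskers for each numerical column
# (the helper name is my own, not from the original notebook):

# In[ ]:

def iqr_outlier_count(series):
    # Count values outside the standard 1.5 * IQR boxplot whiskers.
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    return ((series < lower) | (series > upper)).sum()

for col in ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']:
    print(col, '->', iqr_outlier_count(df[col]), 'outliers')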
# In[ ]:

# Removing the outliers from the Temperature column using the 1.5 * IQR rule
Q1 = df['Temperature'].quantile(0.25)
Q3 = df['Temperature'].quantile(0.75)
IQR = Q3 - Q1
df = df[df['Temperature'] <= (Q3 + (1.5 * IQR))]
df = df[df['Temperature'] >= (Q1 - (1.5 * IQR))]

# In[ ]:

# Removing the outliers from the Unemployment column the same way
Q1 = df['Unemployment'].quantile(0.25)
Q3 = df['Unemployment'].quantile(0.75)
IQR = Q3 - Q1
df = df[df['Unemployment'] <= (Q3 + (1.5 * IQR))]
df = df[df['Unemployment'] >= (Q1 - (1.5 * IQR))]

# In[ ]:

df.shape

# In the process of removing outliers, **484 data points** were dropped from the dataset.

# ## Encoding

# Encoding converts the categorical columns into numerical ones, since scikit-learn models cannot be trained on categorical inputs directly.

# In[ ]:

cat_cols = ['Store', 'Holiday_Flag', 'weekday', 'month', 'year']  # the categorical columns

# In[ ]:

df[cat_cols].nunique()  # count the unique values in each categorical column

# In[ ]:

# Import OneHotEncoder to perform the encoding
from sklearn.preprocessing import OneHotEncoder

# Create the encoder object.
# Note: on scikit-learn >= 1.2 the `sparse` argument is renamed `sparse_output`.
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Fit the encoder to the columns I want to convert into numerical form
encoder.fit(df[cat_cols])

# In[ ]:

# Create a list of the encoded column names.
# Note: on scikit-learn >= 1.2 this method is called get_feature_names_out.
encoded_cols = list(encoder.get_feature_names(cat_cols))
print(encoded_cols)

# In[ ]:

# Add the one-hot encoded columns to the dataset
df[encoded_cols] = encoder.transform(df[cat_cols])

# In[ ]:

df.shape

# ## Standardization

# To scale all the column values to the range 0-1, I use MinMaxScaler (min-max normalization rather than standardization proper). This matters so that no column dominates the others purely because of its units.

# In[ ]:

# Import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Create the scaler object
scaler = MinMaxScaler()

# Fit the scaler to the dataset
scaler.fit(df)

# Transform the dataset using the fitted scaler
scaled_df = scaler.transform(df)

# In[ ]:

# Convert the scaled output (a numpy array) back into a pandas DataFrame
scaled_df = pd.DataFrame(data=scaled_df, columns=df.columns)

# In[ ]:

# Check the resulting DataFrame
scaled_df.head(3)

# ## Train-Test-Split

# Split the dataset into two parts:
# 1. Training dataset (used to train the model)
# 2. Testing dataset (used to test the model)
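# One caveat before splitting: the MinMaxScaler above was fitted on the full
# dataset, so the test rows influence the scaling parameters (mild data leakage).
# A leakage-free variant (a sketch, not what this notebook does; scaler_lf is my
# own name) would split first and fit the scaler on the training portion only:

# In[ ]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
scaler_lf = MinMaxScaler().fit(train_df)    # scaling statistics from train only
train_scaled = scaler_lf.transform(train_df)
test_scaled = scaler_lf.transform(test_df)  # no test information used in the fit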
# In[ ]:

# Drop the sales column to get the input features
X = scaled_df.drop('Weekly_Sales', axis=1)
# Use the sales column as the target
y = scaled_df['Weekly_Sales']

# In[ ]:

# Import train_test_split
from sklearn.model_selection import train_test_split

# Divide the dataset into train and test parts, each with input features and a target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# In[ ]:

# Print the shapes of all four pieces
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# ## Feature Selection

# Not all of the 78 features are important; we have to choose the useful ones. I use Recursive Feature Elimination (RFE), eliminating one more feature on each pass and tracking the train and test error.

# In[ ]:

# Import a linear regression model
from sklearn.linear_model import LinearRegression
# Import a Random Forest regressor (used later)
from sklearn.ensemble import RandomForestRegressor
# Import mean squared error for model evaluation
from sklearn.metrics import mean_squared_error
# Import r2 score for model evaluation
from sklearn.metrics import r2_score
# Import RFE for feature selection
from sklearn.feature_selection import RFE

# In[ ]:

# Lists to store the training and test RMSE at each elimination step
Trr = []
Tss = []

m = df.shape[1] - 2
for i in range(m):
    # Create a linear regression estimator for RFE
    lm = LinearRegression()
    # Create an RFE selector that keeps all but i features
    rfe = RFE(lm, n_features_to_select=X_train.shape[1] - i)
    # Fit the RFE selector on the training dataset
    rfe = rfe.fit(X_train, y_train)
    # Fit a fresh linear regression on the selected features
    LR = LinearRegression()
    LR.fit(X_train.loc[:, rfe.support_], y_train)
    # Predict on the training and the test set
    pred1 = LR.predict(X_train.loc[:, rfe.support_])
    pred2 = LR.predict(X_test.loc[:, rfe.support_])
    # Record the train and test RMSE
    Trr.append(np.sqrt(mean_squared_error(y_train, pred1)))
    Tss.append(np.sqrt(mean_squared_error(y_test, pred2)))

# In[ ]:

plt.plot(Trr, label='Train RMSE')
plt.plot(Tss, label='Test RMSE')
plt.xlabel('Number of features eliminated')
plt.ylabel('RMSE')
plt.legend()
plt.show()

# The test RMSE is lowest when about ten features are eliminated.

# In[ ]:

# Eliminate the lowest-ranked features (keeping X_train.shape[1] - 9 of them)
# and report the scores of a linear regression on the selected set.
lm = LinearRegression()
rfe = RFE(lm, n_features_to_select=X_train.shape[1] - 9)
# Fit the RFE selector on the training dataset
rfe = rfe.fit(X_train, y_train)
# Fit a linear regression using the selected features
LR = LinearRegression()
LR.fit(X_train.loc[:, rfe.support_], y_train)
# Predict on the training and the test set
pred1 = LR.predict(X_train.loc[:, rfe.support_])
pred2 = LR.predict(X_test.loc[:, rfe.support_])
# Print the results as RMSE and r2_score
print("RMSE train", np.sqrt(mean_squared_error(y_train, pred1)))
print("RMSE test", np.sqrt(mean_squared_error(y_test, pred2)))
print("r2_score train - {}".format(r2_score(y_train, pred1)))
print("r2_score test - {}".format(r2_score(y_test, pred2)))
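# It is also worth looking at which features survived the elimination.
# rfe.support_ is a boolean mask over the columns, so the kept names can be
# read off directly (a short sketch, my own addition):

# In[ ]:

# List the features RFE kept and the ones it eliminated
kept = X_train.columns[rfe.support_]
print('Selected features:', list(kept))
print('Eliminated features:', list(X_train.columns[~rfe.support_]))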
print("MSE train",np.sqrt(mean_squared_error(y_train, pred1))) print("MSE test",np.sqrt(mean_squared_error(y_test, pred2))) print("r2_score train - {}".format(r2_score(y_train, pred1))) print("r2_score test - {}".format(r2_score(y_test, pred2))) # Now Removing the 10 features and create the New training and test dataset # In[ ]: X_train = X_train.loc[:,rfe.support_] X_test = X_test.loc[:,rfe.support_] # Now onwards I am going to use various models # ## Linear Regression # In[ ]: lr =LinearRegression() lr.fit(X_train, y_train) pred1 = lr.predict(X_train) pred2 = lr.predict(X_test) print("Root Mean Squared Error train {}".format(np.mean(mean_squared_error(y_train, pred1)))) print("Root Mean Squared Error test {}".format(np.sqrt(mean_squared_error(y_test, pred2)))) print("r2_score train {}".format(r2_score(y_train, pred1))) print("r2_score test {}".format(r2_score(y_test, pred2))) # **Ridge Regression** # In[ ]: from sklearn.linear_model import Ridge rr = Ridge() rr.fit(X_train, y_train) predrr1 = rr.predict(X_train) predrr2 = rr.predict(X_test) print("Root Mean Squared Error train {}".format(np.mean(mean_squared_error(y_train, predrr1)))) print("Root Mean Squared Error test {}".format(np.sqrt(mean_squared_error(y_test, predrr2)))) print("r2_score train {}".format(r2_score(y_train, predrr1))) print("r2_score test {}".format(r2_score(y_test, predrr2))) # **Lasso Regression** # In[ ]: from sklearn.linear_model import Lasso lr = Lasso() lr.fit(X_train, y_train) predlr1 = lr.predict(X_train) predlr2 = lr.predict(X_test) print("Root Mean Squared Error train {}".format(np.mean(mean_squared_error(y_train, predlr1)))) print("Root Mean Squared Error test {}".format(np.sqrt(mean_squared_error(y_test, predlr2)))) print("r2_score train {}".format(r2_score(y_train, predlr1))) print("r2_score test {}".format(r2_score(y_test, predlr2))) # **ElasticNet Regression** # In[ ]: from sklearn.linear_model import ElasticNet en = ElasticNet() en.fit(X_train, y_train) predlr1 = en.predict(X_train) predlr2 = en.predict(X_test) print("Root Mean Squared Error train {}".format(np.mean(mean_squared_error(y_train, predlr1)))) print("Root Mean Squared Error test {}".format(np.sqrt(mean_squared_error(y_test, predlr2)))) print("r2_score train {}".format(r2_score(y_train, predlr1))) print("r2_score test {}".format(r2_score(y_test, predlr2))) # **Polynomial Regression** # In[ ]: from sklearn.preprocessing import PolynomialFeatures # In[ ]: Trr = [] Tss = [] for i in range(2,4): poly_reg = PolynomialFeatures(degree = i) pl_X_train = poly_reg.fit_transform(X_train) pl_X_test = poly_reg.fit_transform(X_test) lr = LinearRegression() lr.fit(pl_X_train, y_train) pred_poly_train = lr.predict(pl_X_train) Trr.append(np.sqrt(mean_squared_error(y_train, pred_poly_train))) pred_poly_test = lr.predict(pl_X_test) Tss.append(np.sqrt(mean_squared_error(y_test, pred_poly_test))) # In[ ]: plt.figure(figsize=[15,6]) plt.subplot(1,2,1) plt.plot(range(2,4), Trr, label= 'Training') plt.plot(range(2,4), Tss, label= 'Testing') plt.title('Polynomial Feature on training data') plt.xlabel('Degree') plt.ylabel('RMSE') plt.legend() # It is clear that in between 2-4 degree polynomial regression 2 has Bais-variance tradeoff # In[ ]: poly_reg = PolynomialFeatures(degree = 2) pl_X_train = poly_reg.fit_transform(X_train) pl_X_test = poly_reg.fit_transform(X_test) lr = LinearRegression() lr.fit(pl_X_train, y_train) pred_poly_train = lr.predict(pl_X_train) print("r2_score train {}".format(r2_score(pred_poly_train, y_train))) pred_poly_test = lr.predict(pl_X_test) 
print("r2_score test {}".format(r2_score(pred_poly_test, y_test))) print("Root Mean Squared Error train {}".format(np.mean(mean_squared_error(y_train, pred_poly_train)))) print("Root Mean Squared Error test {}".format(np.sqrt(mean_squared_error(y_test, pred_poly_test)))) # In[ ]: #creating a tabel tabel = { 'Train R2': [0.9324387485162124, 0.9323641360074176, 0.0, 0.0, 0.9563932198334125], 'Test R2' : [0.9223162582948724, 0.9219331606995953, -0.00014816618161050954, -0.00014816618161050954, -0.0005599911350040454], 'Train RMSE' : [0.0016695395619648289, 0.0016713833486400986, 0.024711495499242828, 0.024711495499242828, 0.0010346077251656776 ], 'Test RMSE' : [0.04569350618906344, 0.04580603645234492, 0.16395383804559885, 0.16395383804559885, 730742413.004261 ] } # In[ ]: df_new = pd.DataFrame(tabel) # In[ ]: df_new.index = ['Linear Regression', 'Ridge Regression', 'Lasso Regression', 'ElasticNet Regression', 'Polynomial Regression'] # In[ ]: df_new # It is clear that Linear Regression is the Best Model in the dataset, with test accuracy of 92%(approx). # # To improve the accuracy further we can apply other regressor i.e. Random Forest, G # Now I am going to imporve the accuracy till 98% - 99%. For this I have to use Decision Tree or Random Forest etc. # **Decision Tree Regressor** # In[ ]: from sklearn.tree import DecisionTreeRegressor dt = DecisionTreeRegressor() dt.fit(X_train, y_train) # In[ ]: pred_dt1 = dt.predict(X_train) pred_dt2 = dt.predict(X_test) print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_dt1)))) print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_dt2)))) print('Accuracy Score train: ', dt.score(X_train, y_train)) print('Accuracy Score test: ', dt.score(X_test, y_test)) # In[ ]: max_depth_range = np.arange(1,40,1) for x in max_depth_range: dt = DecisionTreeRegressor(max_depth= x) dt.fit(X_train, y_train) pred_dt1 = dt.predict(X_train) pred_dt2 = dt.predict(X_test) print('for max_depth: ', x) print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_dt1)))) print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_dt2)))) print('Accuracy Score train: ', dt.score(X_train, y_train)) print('Accuracy Score test: ', dt.score(X_test, y_test)) print() # Decision Tree has maximum accuracy at **maximum depth 39** # **Random Forest Regressor** # In[ ]: from sklearn.ensemble import RandomForestRegressor rfc = RandomForestRegressor() rfc.fit(X_train, y_train) # In[ ]: pred_rfc1 = rfc.predict(X_train) pred_rfc2 = rfc.predict(X_test) print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_rfc1)))) print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_rfc2)))) print('Accuracy Score train: ', dt.score(X_train, y_train)) print('Accuracy Score test: ', dt.score(X_test, y_test)) # In[ ]: max_depth_range = np.arange(1,40,1) for x in max_depth_range: dt = RandomForestRegressor(max_depth= x) dt.fit(X_train, y_train) pred_xg1 = dt.predict(X_train) pred_xg2 = dt.predict(X_test) print('for max_depth: ', x) print('for max_depth: ', x) print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_xg1)))) print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_xg2)))) print('Accuracy Score train: ', dt.score(X_train, y_train)) print('Accuracy Score test: ', dt.score(X_test, y_test)) print() # In the depth of **36** the** Random Forest Regressor** has its maximum value of accuracy. 
# In[ ]:

rfc = RandomForestRegressor(max_depth=36)
rfc.fit(X_train, y_train)
pred_rfc1 = rfc.predict(X_train)
pred_rfc2 = rfc.predict(X_test)
print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_rfc1))))
print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_rfc2))))
print('R2 score train: ', rfc.score(X_train, y_train))
print('R2 score test: ', rfc.score(X_test, y_test))

# **XGBoost Regressor**

# In[ ]:

from xgboost import XGBRegressor
xg = XGBRegressor()
xg.fit(X_train, y_train)

# In[ ]:

pred_xg1 = xg.predict(X_train)
pred_xg2 = xg.predict(X_test)
print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_xg1))))
print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_xg2))))

# In[ ]:

max_depth_range = np.arange(1, 15, 1)
for x in max_depth_range:
    xgr = XGBRegressor(max_depth=x)
    xgr.fit(X_train, y_train)
    pred_xg1 = xgr.predict(X_train)
    pred_xg2 = xgr.predict(X_test)
    print('for max_depth: ', x)
    print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_xg1))))
    print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_xg2))))
    print('R2 score train: ', xgr.score(X_train, y_train))
    print('R2 score test: ', xgr.score(X_test, y_test))
    print()

# **max_depth 9** gives XGBoost its best test score.

# In[ ]:

xg = XGBRegressor(max_depth=9)
xg.fit(X_train, y_train)
pred_xg1 = xg.predict(X_train)
pred_xg2 = xg.predict(X_test)
print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_xg1))))
print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_xg2))))
print('R2 score train: ', xg.score(X_train, y_train))
print('R2 score test: ', xg.score(X_test, y_test))

# In[ ]:

# Collect the tree-based model scores into a second summary table
table1 = {
    'Train Score': [0.9679040861170889, 0.9637587048322853, 0.9601543222728802],
    'Test Score': [0.8808466556220073, 0.9028060343874318, 0.9115195955339979],
    'Train RMSE': [0.02816270639447925, 0.02992618589836856, 0.03137907401148098],
    'Test RMSE': [0.05659037012899937, 0.051110374979016944, 0.04876553192516943]
}

# In[ ]:

df1 = pd.DataFrame(table1)
df1.index = ['Decision Tree', 'Random Forest', 'XGBoost']

# In[ ]:

df1

# Among the tree-based methods, XGBoost performs best on this dataset (test R2 about 0.91). Comparing it with the earlier results, however, plain Linear Regression (test R2 about 0.92) remains the best-suited model overall.
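# If the linear model is the one to keep, it can be persisted together with the
# fitted scaler and encoder, since all three are needed to score new data the
# same way the training data was prepared. A sketch using joblib (the file name
# and the best_model variable are my own, not from the original notebook):

# In[ ]:

import joblib

# Refit the winning model on the selected features and save everything needed
# to reproduce the preprocessing at prediction time.
best_model = LinearRegression().fit(X_train, y_train)
joblib.dump({'model': best_model, 'scaler': scaler, 'encoder': encoder},
            'walmart_sales_model.joblib')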