compiler-ai committed
Commit 6fba49f
1 Parent(s): c913229

Upload app.py

Files changed (1)
  1. app.py +833 -0
app.py ADDED
#!/usr/bin/env python
# coding: utf-8

# ## Data Loading

# Import the necessary libraries: pandas and numpy for data handling, and matplotlib and seaborn for plotting.

# In[ ]:

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')

# Set the default font size, figure size and grid style for the plots.

# In[ ]:

sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10, 6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'

# Read the data into a pandas DataFrame named **df**.

# In[ ]:

df = pd.read_csv('Walmart.csv')

# In[ ]:

df

# **About the data:**
# * Store - the store number
# * Date - the week of sales
# * Weekly_Sales - sales for the given store in that week
# * Holiday_Flag - whether the week contains a special holiday (1 = holiday week, 0 = non-holiday week)
# * Temperature - temperature on the day of sale
# * Fuel_Price - cost of fuel in the region
# * CPI - prevailing consumer price index
# * Unemployment - prevailing unemployment rate

# **Insights:**
#
# * The target column is Weekly_Sales.
# * The data relates to Walmart stores in the United States; **Store** and **Holiday_Flag** are categorical in nature.
# * The data covers 45 stores, and Weekly_Sales gives the sales of the corresponding store for each week.

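# A quick sanity check of the figures above (a sketch; it assumes df was loaded
# from Walmart.csv as shown):

print('Number of distinct stores:', df['Store'].nunique())
print('Weeks of data per store:')
print(df['Store'].value_counts().head())
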
# ## Data Exploration and Modification

# In[ ]:

df.info()  # gives basic information (row counts, non-null counts and dtypes) about the dataset

# Here the Date column is an **object** while the remaining columns are **integer or float**. Using pandas, I convert the Date column from object to a pandas datetime.

# In[ ]:

df.Date = pd.to_datetime(df.Date)

# Using the Date column, I create three separate columns for weekday, month and year and add them to the existing dataset.

# In[ ]:

df['weekday'] = df.Date.dt.weekday
df['month'] = df.Date.dt.month
df['year'] = df.Date.dt.year

# Now I drop the Date column because it is no longer needed.

# In[ ]:

df.drop(['Date'], axis=1, inplace=True)

# The modified dataset now looks like this:

# In[ ]:

df.head(3)

# Explore the unique values of the weekday, month and year columns:

# In[ ]:

print('years unique values', df.year.unique())
print('months unique values', df.month.unique())
print('weekday unique values', df.weekday.unique())

# Months and weekdays cover the usual range, but the data comes from the years 2010, 2011 and 2012 only.

# To get an idea of the distribution of the dataset, I use the describe function, which gives a table of summary statistics for all the columns.

# In[ ]:

df.describe()

# **Insights:**
# * Temperature - ranges from -2 to 100.1 degrees Fahrenheit.
# * CPI - ranges from 126 to 227 with a standard deviation of 39.35.
# * Unemployment - ranges from 3.87 to 14.31 with a standard deviation of 1.87.

# In[ ]:

original_df = df.copy()  # make a copy of the dataframe to check for duplicate rows in the dataset

# Checking for duplicate rows:

# In[ ]:

rs, cs = original_df.shape

df.drop_duplicates(inplace=True)

if df.shape == (rs, cs):
    print("The dataset doesn't have any duplicates")
else:
    print(f'Number of duplicates dropped ---> {rs - df.shape[0]}')

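# The same duplicate check can also be done without dropping rows first (a sketch,
# assuming original_df as defined above):

print('Duplicate rows found:', original_df.duplicated().sum())
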
# Checking for missing values:

# In[ ]:

df.isnull().sum()

# The dataset doesn't have any null values.

# ## Data Visualization

# In[ ]:

df.head(3)

# Here we have:
#
# **Numerical columns:** Weekly_Sales, Temperature, Fuel_Price, CPI, Unemployment
#
# **Categorical columns:** Holiday_Flag, weekday, month, year
#
# Now plot count plots to see the frequency distribution of the categorical columns.

# In[ ]:

fig, axes = plt.subplots(2, 2, figsize=(16, 8))

# Holiday count plot
sns.countplot(x='Holiday_Flag', data=df, ax=axes[0, 0])

# Weekday count plot
sns.countplot(x='weekday', data=df, ax=axes[0, 1])

# Month count plot
sns.countplot(x='month', data=df, ax=axes[1, 0])

# Year count plot
sns.countplot(x='year', data=df, ax=axes[1, 1]);

# **Insights:**
#
# * For Holiday_Flag, most weeks are non-holiday weeks.
# * In the weekday column, most observations fall on weekday 4 (Friday, since pandas counts Monday as 0).
# * Most observations in the data come from the month of April.
# * Most observations in the data come from the year 2011.

# To see how many observations there are for each store, I plot another count plot.

# In[ ]:

plt.figure(figsize=(18, 8))
sns.countplot(x='Store', data=df)
plt.show()

# All the stores have an equal number of records in the dataset.

# In[ ]:

df.head(1)

# To analyze the distribution of the data, I plot a histogram and a boxplot for Temperature, Unemployment, Fuel_Price and CPI.

# In[ ]:

fig, axes = plt.subplots(4, 2, figsize=(16, 16))

# Temperature
sns.histplot(x='Temperature', data=df, ax=axes[0, 0])
sns.boxplot(x='Temperature', data=df, ax=axes[0, 1])

# Unemployment
sns.histplot(x='Unemployment', data=df, ax=axes[1, 0])
sns.boxplot(x='Unemployment', data=df, ax=axes[1, 1])

# Fuel_Price
sns.histplot(x='Fuel_Price', data=df, ax=axes[2, 0])
sns.boxplot(x='Fuel_Price', data=df, ax=axes[2, 1])

# CPI
sns.histplot(x='CPI', data=df, ax=axes[3, 0])
sns.boxplot(x='CPI', data=df, ax=axes[3, 1]);

# **Insights:**
#
# * Temperature: there are outliers at the low end of the temperature range.
# * Unemployment: outliers are present at both the high and the low end.
# * CPI: the values cluster at either the low or the high end of the range.

# In[ ]:

# Removing outliers from the Temperature column using the IQR rule

Q1 = df['Temperature'].quantile(0.25)
Q3 = df['Temperature'].quantile(0.75)
IQR = Q3 - Q1
df = df[df['Temperature'] <= (Q3 + (1.5 * IQR))]
df = df[df['Temperature'] >= (Q1 - (1.5 * IQR))]

# In[ ]:

# Removing outliers from the Unemployment column using the IQR rule

Q1 = df['Unemployment'].quantile(0.25)
Q3 = df['Unemployment'].quantile(0.75)
IQR = Q3 - Q1
df = df[df['Unemployment'] <= (Q3 + (1.5 * IQR))]
df = df[df['Unemployment'] >= (Q1 - (1.5 * IQR))]

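# The same IQR filter is applied to two columns above; a small helper (a sketch,
# assuming df as above) avoids repeating the boilerplate:

def drop_iqr_outliers(frame, column, factor=1.5):
    """Return frame without rows whose column value lies outside [Q1 - factor*IQR, Q3 + factor*IQR]."""
    q1 = frame[column].quantile(0.25)
    q3 = frame[column].quantile(0.75)
    iqr = q3 - q1
    return frame[(frame[column] >= q1 - factor * iqr) & (frame[column] <= q3 + factor * iqr)]

# Equivalent to the two cells above:
# df = drop_iqr_outliers(df, 'Temperature')
# df = drop_iqr_outliers(df, 'Unemployment')
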
# In[ ]:

df.shape

# In the process of removing outliers, **484** rows were removed from the dataset.

# ## Encoding

# Encoding converts the categorical columns into numerical columns, since it is not good practice to train a model on raw categorical inputs.

# In[ ]:

cat_cols = ['Store', 'Holiday_Flag', 'weekday', 'month', 'year']  # these are the categorical columns

# In[ ]:

df[cat_cols].nunique()  # count the unique values in each of the categorical columns

# In[ ]:

# Import OneHotEncoder to perform the encoding
from sklearn.preprocessing import OneHotEncoder
# Create an encoder object; sparse=False returns a dense array
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
# Fit the encoder to the categorical columns that need to be converted to numerical form
encoder.fit(df[cat_cols])

# In[ ]:

# Create a list of the encoded column names
encoded_cols = list(encoder.get_feature_names(cat_cols))
print(encoded_cols)

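# Note: recent scikit-learn versions renamed the `sparse` argument to `sparse_output`
# and `get_feature_names` to `get_feature_names_out`. An equivalent sketch for those
# versions (assuming cat_cols and df as above) would be:
#
# encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# encoder.fit(df[cat_cols])
# encoded_cols = list(encoder.get_feature_names_out(cat_cols))
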
# In[ ]:

# Add the encoded columns to the dataset by transforming the categorical columns.
df[encoded_cols] = encoder.transform(df[cat_cols])

# In[ ]:

df.shape

# ## Standardization

# To scale all the column values to the range 0 - 1, I use a MinMaxScaler. This is important so that all columns get equal weight.

# In[ ]:

# Import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
# Create the scaler object
scaler = MinMaxScaler()
# Fit the scaler to the dataset
scaler.fit(df)
# Transform the dataset using the fitted scaler
scaled_df = scaler.transform(df)

# In[ ]:

# Convert the scaled output back into a pandas DataFrame
scaled_df = pd.DataFrame(data=scaled_df, columns=df.columns)

# In[ ]:

# Check the output dataframe
scaled_df.head(3)

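# Note: the scaler above is fitted on the full dataset before the train/test split.
# A common alternative (a sketch, not what is done below) is to fit the scaler on the
# training portion only, so that no information from the test rows leaks into the scaling:
#
# from sklearn.model_selection import train_test_split
# train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
# scaler = MinMaxScaler().fit(train_df)
# train_scaled = pd.DataFrame(scaler.transform(train_df), columns=df.columns)
# test_scaled = pd.DataFrame(scaler.transform(test_df), columns=df.columns)
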
# ## Train-Test-Split

# Split the dataset into two parts:
# 1. Training dataset (used to train the model)
# 2. Testing dataset (used to test the model)

# In[ ]:

# Drop the sales column to get the input features
X = scaled_df.drop('Weekly_Sales', axis=1)
# Use the sales column as the target
y = scaled_df['Weekly_Sales']

# In[ ]:

# Import train_test_split
from sklearn.model_selection import train_test_split
# Divide the dataset into train and test parts, each with input features and a target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# In[ ]:

# Print the shapes of all the resulting datasets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# ## Feature Selection

# Not all of the 78 features are important, so we have to choose the most useful ones.

# In[ ]:

# Import the linear regression model
from sklearn.linear_model import LinearRegression
# Import the random forest regressor
from sklearn.ensemble import RandomForestRegressor
# Import mean squared error for model evaluation
from sklearn.metrics import mean_squared_error
# Import r2 score for model evaluation
from sklearn.metrics import r2_score
# Import RFE for feature selection
from sklearn.feature_selection import RFE

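# RMSE is computed repeatedly below as np.sqrt(mean_squared_error(...)); a small helper
# (a sketch added for readability, using only the imports above) keeps that in one place:

def rmse(y_true, y_pred):
    """Root mean squared error."""
    return np.sqrt(mean_squared_error(y_true, y_pred))
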
# In[ ]:

# Lists to store the training and test error
Trr = []
Tss = []

m = df.shape[1] - 2
for i in range(m):
    # create a linear regression model object
    lm = LinearRegression()
    # create an RFE object around the linear model, keeping X_train.shape[1] - i features
    rfe = RFE(lm, n_features_to_select=X_train.shape[1] - i)
    # fit the RFE model to the training dataset
    rfe = rfe.fit(X_train, y_train)
    # create a linear regression model object for prediction
    LR = LinearRegression()
    # fit the model using only the selected features
    LR.fit(X_train.loc[:, rfe.support_], y_train)
    # make predictions with the fitted model
    pred1 = LR.predict(X_train.loc[:, rfe.support_])  # predictions on the training dataset
    pred2 = LR.predict(X_test.loc[:, rfe.support_])   # predictions on the test dataset
    # append the RMSE for train and test respectively
    Trr.append(np.sqrt(mean_squared_error(y_train, pred1)))
    Tss.append(np.sqrt(mean_squared_error(y_test, pred2)))

# In[ ]:

plt.plot(Trr, label='Train RMSE')
plt.plot(Tss, label='Test RMSE')
plt.legend()
plt.show()

# If we recursively eliminate at most **ten** features, the score is at its maximum.

# In[2]:

# Eliminating features with RFE and refitting the linear regression model gives the best score, printed below.

# create a linear regression model object
lm = LinearRegression()
# create an RFE object that keeps X_train.shape[1] - 9 features
rfe = RFE(lm, n_features_to_select=X_train.shape[1] - 9)
# fit the RFE model to the training dataset
rfe = rfe.fit(X_train, y_train)
# create a linear regression model object for prediction
LR = LinearRegression()
# fit the model using only the selected features
LR.fit(X_train.loc[:, rfe.support_], y_train)
# make predictions with the fitted model
pred1 = LR.predict(X_train.loc[:, rfe.support_])
pred2 = LR.predict(X_test.loc[:, rfe.support_])
# Print the results as RMSE and r2_score.
print("RMSE train", np.sqrt(mean_squared_error(y_train, pred1)))
print("RMSE test", np.sqrt(mean_squared_error(y_test, pred2)))
print("r2_score train - {}".format(r2_score(y_train, pred1)))
print("r2_score test - {}".format(r2_score(y_test, pred2)))

# Now keep only the selected features and create the new training and test sets.

# In[ ]:

X_train = X_train.loc[:, rfe.support_]
X_test = X_test.loc[:, rfe.support_]

# From here on I am going to try various models.

# ## Linear Regression

# In[ ]:

lr = LinearRegression()
lr.fit(X_train, y_train)
pred1 = lr.predict(X_train)
pred2 = lr.predict(X_test)

print("Root Mean Squared Error train {}".format(np.sqrt(mean_squared_error(y_train, pred1))))
print("Root Mean Squared Error test {}".format(np.sqrt(mean_squared_error(y_test, pred2))))
print("r2_score train {}".format(r2_score(y_train, pred1)))
print("r2_score test {}".format(r2_score(y_test, pred2)))

# **Ridge Regression**

# In[ ]:

from sklearn.linear_model import Ridge
rr = Ridge()
rr.fit(X_train, y_train)
predrr1 = rr.predict(X_train)
predrr2 = rr.predict(X_test)
print("Root Mean Squared Error train {}".format(np.sqrt(mean_squared_error(y_train, predrr1))))
print("Root Mean Squared Error test {}".format(np.sqrt(mean_squared_error(y_test, predrr2))))
print("r2_score train {}".format(r2_score(y_train, predrr1)))
print("r2_score test {}".format(r2_score(y_test, predrr2)))

# **Lasso Regression**

# In[ ]:

from sklearn.linear_model import Lasso
la = Lasso()
la.fit(X_train, y_train)
predla1 = la.predict(X_train)
predla2 = la.predict(X_test)
print("Root Mean Squared Error train {}".format(np.sqrt(mean_squared_error(y_train, predla1))))
print("Root Mean Squared Error test {}".format(np.sqrt(mean_squared_error(y_test, predla2))))
print("r2_score train {}".format(r2_score(y_train, predla1)))
print("r2_score test {}".format(r2_score(y_test, predla2)))

# **ElasticNet Regression**

# In[ ]:

from sklearn.linear_model import ElasticNet
en = ElasticNet()
en.fit(X_train, y_train)
preden1 = en.predict(X_train)
preden2 = en.predict(X_test)
print("Root Mean Squared Error train {}".format(np.sqrt(mean_squared_error(y_train, preden1))))
print("Root Mean Squared Error test {}".format(np.sqrt(mean_squared_error(y_test, preden2))))
print("r2_score train {}".format(r2_score(y_train, preden1)))
print("r2_score test {}".format(r2_score(y_test, preden2)))

# **Polynomial Regression**

# In[ ]:

from sklearn.preprocessing import PolynomialFeatures

# In[ ]:

Trr = []
Tss = []
for i in range(2, 4):
    poly_reg = PolynomialFeatures(degree=i)
    pl_X_train = poly_reg.fit_transform(X_train)
    pl_X_test = poly_reg.transform(X_test)
    lr = LinearRegression()
    lr.fit(pl_X_train, y_train)
    pred_poly_train = lr.predict(pl_X_train)
    Trr.append(np.sqrt(mean_squared_error(y_train, pred_poly_train)))
    pred_poly_test = lr.predict(pl_X_test)
    Tss.append(np.sqrt(mean_squared_error(y_test, pred_poly_test)))

# In[ ]:

plt.figure(figsize=[15, 6])
plt.subplot(1, 2, 1)
plt.plot(range(2, 4), Trr, label='Training')
plt.plot(range(2, 4), Tss, label='Testing')
plt.title('Polynomial features: RMSE vs degree')
plt.xlabel('Degree')
plt.ylabel('RMSE')
plt.legend()

# Among the degrees tried (2 to 3), degree 2 gives the best bias-variance tradeoff.

# In[ ]:

poly_reg = PolynomialFeatures(degree=2)
pl_X_train = poly_reg.fit_transform(X_train)
pl_X_test = poly_reg.transform(X_test)
lr = LinearRegression()
lr.fit(pl_X_train, y_train)
pred_poly_train = lr.predict(pl_X_train)
print("r2_score train {}".format(r2_score(y_train, pred_poly_train)))
pred_poly_test = lr.predict(pl_X_test)
print("r2_score test {}".format(r2_score(y_test, pred_poly_test)))
print("Root Mean Squared Error train {}".format(np.sqrt(mean_squared_error(y_train, pred_poly_train))))
print("Root Mean Squared Error test {}".format(np.sqrt(mean_squared_error(y_test, pred_poly_test))))

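# The polynomial model above chains PolynomialFeatures and LinearRegression by hand.
# scikit-learn's Pipeline can express the same degree-2 model more compactly (a sketch,
# equivalent in spirit to the cell above):

from sklearn.pipeline import make_pipeline

poly_model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
poly_model.fit(X_train, y_train)
print("Pipeline r2_score test {}".format(r2_score(y_test, poly_model.predict(X_test))))
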
# In[ ]:

# Collect the results in a table
table = {
    'Train R2': [0.9324387485162124, 0.9323641360074176, 0.0, 0.0, 0.9563932198334125],
    'Test R2': [0.9223162582948724, 0.9219331606995953, -0.00014816618161050954, -0.00014816618161050954, -0.0005599911350040454],
    'Train RMSE': [0.0016695395619648289, 0.0016713833486400986, 0.024711495499242828, 0.024711495499242828, 0.0010346077251656776],
    'Test RMSE': [0.04569350618906344, 0.04580603645234492, 0.16395383804559885, 0.16395383804559885, 730742413.004261]
}

# In[ ]:

df_new = pd.DataFrame(table)

# In[ ]:

df_new.index = ['Linear Regression', 'Ridge Regression', 'Lasso Regression', 'ElasticNet Regression', 'Polynomial Regression']

# In[ ]:

df_new

# Linear Regression is the best of these models on this dataset, with a test R2 of roughly 0.92.
#
# To improve the accuracy further we can apply other regressors, e.g. Random Forest or gradient boosting.

# Next I aim to improve the accuracy to around 98% - 99%. For this I use tree-based models such as Decision Tree and Random Forest.

# **Decision Tree Regressor**

# In[ ]:

from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)

# In[ ]:

pred_dt1 = dt.predict(X_train)
pred_dt2 = dt.predict(X_test)
print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_dt1))))
print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_dt2))))
print('R2 score train: ', dt.score(X_train, y_train))
print('R2 score test: ', dt.score(X_test, y_test))

# In[ ]:

max_depth_range = np.arange(1, 40, 1)
for x in max_depth_range:
    dt = DecisionTreeRegressor(max_depth=x)
    dt.fit(X_train, y_train)
    pred_dt1 = dt.predict(X_train)
    pred_dt2 = dt.predict(X_test)
    print('for max_depth: ', x)
    print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_dt1))))
    print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_dt2))))
    print('R2 score train: ', dt.score(X_train, y_train))
    print('R2 score test: ', dt.score(X_test, y_test))
    print()

# The decision tree reaches its best score at **max_depth = 39**.

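# Choosing max_depth by inspecting test-set scores, as in the loop above, tunes the
# hyperparameter on the test data. A cross-validated search (a sketch, not the procedure
# used in this notebook) keeps the test set untouched:

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(DecisionTreeRegressor(),
                    param_grid={'max_depth': list(range(1, 40))},
                    cv=5, scoring='r2')
grid.fit(X_train, y_train)
print('Best max_depth:', grid.best_params_['max_depth'])
print('Best cross-validated R2:', grid.best_score_)
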
# **Random Forest Regressor**

# In[ ]:

from sklearn.ensemble import RandomForestRegressor
rfc = RandomForestRegressor()
rfc.fit(X_train, y_train)

# In[ ]:

pred_rfc1 = rfc.predict(X_train)
pred_rfc2 = rfc.predict(X_test)
print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_rfc1))))
print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_rfc2))))
print('R2 score train: ', rfc.score(X_train, y_train))
print('R2 score test: ', rfc.score(X_test, y_test))

# In[ ]:

max_depth_range = np.arange(1, 40, 1)
for x in max_depth_range:
    rf = RandomForestRegressor(max_depth=x)
    rf.fit(X_train, y_train)
    pred_rf1 = rf.predict(X_train)
    pred_rf2 = rf.predict(X_test)
    print('for max_depth: ', x)
    print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_rf1))))
    print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_rf2))))
    print('R2 score train: ', rf.score(X_train, y_train))
    print('R2 score test: ', rf.score(X_test, y_test))
    print()

# The **Random Forest Regressor** reaches its best score at a depth of **36**.

# In[ ]:

rfc = RandomForestRegressor(max_depth=36)
rfc.fit(X_train, y_train)
pred_rfc1 = rfc.predict(X_train)
pred_rfc2 = rfc.predict(X_test)
print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_rfc1))))
print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_rfc2))))
print('R2 score train: ', rfc.score(X_train, y_train))
print('R2 score test: ', rfc.score(X_test, y_test))

# **XGBoost Regressor**

# In[ ]:

from xgboost import XGBRegressor
xg = XGBRegressor()
xg.fit(X_train, y_train)

# In[ ]:

pred_xg1 = xg.predict(X_train)
pred_xg2 = xg.predict(X_test)
print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_xg1))))
print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_xg2))))

# In[ ]:

max_depth_range = np.arange(1, 15, 1)
for x in max_depth_range:
    xgb = XGBRegressor(max_depth=x)
    xgb.fit(X_train, y_train)
    pred_xg1 = xgb.predict(X_train)
    pred_xg2 = xgb.predict(X_test)
    print('for max_depth: ', x)
    print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_xg1))))
    print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_xg2))))
    print('R2 score train: ', xgb.score(X_train, y_train))
    print('R2 score test: ', xgb.score(X_test, y_test))
    print()

# A **max_depth of 9** gives the best score for XGBoost.

# In[ ]:

xg = XGBRegressor(max_depth=9)
xg.fit(X_train, y_train)
pred_xg1 = xg.predict(X_train)
pred_xg2 = xg.predict(X_test)
print("RMSE for train {}".format(np.sqrt(mean_squared_error(y_train, pred_xg1))))
print("RMSE for test {}".format(np.sqrt(mean_squared_error(y_test, pred_xg2))))
print('R2 score train: ', xg.score(X_train, y_train))
print('R2 score test: ', xg.score(X_test, y_test))

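# Scores from a single train/test split can be noisy; cross-validation (a sketch, not part
# of the comparison table below) gives a more stable estimate for the tuned XGBoost model:

from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(XGBRegressor(max_depth=9), X_train, y_train, cv=5, scoring='r2')
print('Cross-validated R2: mean {:.4f}, std {:.4f}'.format(cv_scores.mean(), cv_scores.std()))
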
# In[ ]:

# Collect the tree-based results in a table
table1 = {
    'Train Score': [0.9679040861170889, 0.9637587048322853, 0.9601543222728802],
    'Test Score': [0.8808466556220073, 0.9028060343874318, 0.9115195955339979],
    'Train RMSE': [0.02816270639447925, 0.02992618589836856, 0.03137907401148098],
    'Test RMSE': [0.05659037012899937, 0.051110374979016944, 0.04876553192516943]
}

# In[ ]:

df1 = pd.DataFrame(table1)

# In[ ]:

df1

# In[ ]:

df1.index = ['Decision Tree', 'Random Forest', 'XGBoost']

# In[ ]:

df1

# Among the tree-based methods, XGBoost is the best for this dataset.

# Comparing Linear Regression and XGBoost, Linear Regression remains the best-suited model for this dataset.

# In[ ]: