PreyPatel committed on
Commit
6949d38
1 Parent(s): c6c21ad
Files changed (1)
  1. A3.py +141 -0
A3.py ADDED
@@ -0,0 +1,141 @@
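+ # A3.py: a Streamlit app comparing ensemble regressors (bagging, AdaBoost,
+ # and gradient boosting) over selectable base models on synthetic
+ # make_regression data. Run with: streamlit run A3.py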
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ from sklearn.datasets import make_regression
+ from sklearn.model_selection import train_test_split, KFold
+ from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor, AdaBoostRegressor
+ from sklearn.linear_model import LinearRegression, Lasso, Ridge
+ from sklearn.svm import SVR
+ from sklearn.metrics import mean_squared_error
+ import matplotlib.pyplot as plt
+
+
+ st.title('Boosting in Regression')
+
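+ # NOTE: DATE_COLUMN, DATA_URL and load_data() below come from the Streamlit
+ # demo template and are never used anywhere else in this app.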
+ DATE_COLUMN = 'date/time'
+ DATA_URL = ('https://s3-us-west-2.amazonaws.com/'
+             'streamlit-demo-data/uber-raw-data-sep14.csv.gz')
+
+ @st.cache_data
+ def load_data(nrows):
+     data = pd.read_csv(DATA_URL, nrows=nrows)
+     lowercase = lambda x: str(x).lower()
+     data.rename(lowercase, axis='columns', inplace=True)
+     data[DATE_COLUMN] = pd.to_datetime(data[DATE_COLUMN])
+     return data
+
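+ # Cached generator for the synthetic regression datasets; the leading token
+ # of the selectbox label ("100", "150", "200") selects the configuration.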
+ @st.cache_data
+ def make_data(dataset_option):
+     opt = dataset_option.split()[0]
+     if opt == "100":
+         X, y = make_regression(n_samples=100,
+                                n_features=10, n_informative=2,
+                                random_state=2)
+     elif opt == "200":
+         X, y = make_regression(n_samples=200,
+                                n_features=5, n_informative=2,
+                                random_state=4)
+     elif opt == "150":
+         X, y = make_regression(n_samples=150,
+                                n_features=7, n_informative=2,
+                                random_state=2)
+     else:
+         X, y = make_regression(random_state=10)
+     return X, y
+
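+ # Maps the model-type selectbox label to a scikit-learn base estimator.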
+ def estimator_model(estimator_type):
+     if estimator_type == "Linear regressor":
+         model = LinearRegression()
+     elif estimator_type == "Ridge regressor":
+         model = Ridge()
+     elif estimator_type == "Lasso regressor":
+         model = Lasso()
+     elif estimator_type == "SVR":
+         model = SVR()
+     else:
+         model = LinearRegression()
+     return model
+
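+ # Dataset selection and a preview scatter of the first feature vs. the target.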
+ options = ['100 samples with 10 features and 1 target',
+            '200 samples with 5 features and 1 target',
+            '150 samples with 7 features and 1 target']
+ dataset_option = st.selectbox('Select dataset size:', options)
+ X, y = make_data(dataset_option)
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=4)
+ fig = plt.figure()
+ plt.xlabel("feature 0")
+ plt.ylabel("y")
+ plt.title("Dataset")
+ plt.scatter(X[:, 0], y)
+ st.pyplot(fig)
+
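+ # Controls: base model, ensemble strategy, and maximum ensemble size.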
+ options = ['Linear regressor', 'Ridge regressor', 'Lasso regressor', 'SVR']
+ model_type = st.selectbox('Select model type to use:', options)
+ options = ['boosting', 'bagging', 'gradient boosting']
+ ensemble_type = st.selectbox('Select the ensemble type:', options)
+ estimator_number = st.slider('n_estimators', 1, 20, 4)
+
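+ # For each ensemble size 1..n_estimators, fit the chosen ensemble and plot the
+ # test MSE (plus train MSE for bagging) against the number of estimators.
+ # The selected base model is passed via the `estimator=` keyword, which
+ # assumes scikit-learn >= 1.2 (older releases call it `base_estimator=`).
+ # GradientBoostingRegressor always boosts regression trees, so the base-model
+ # choice applies only to the bagging and AdaBoost branches.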
+ fig = plt.figure()
+ if ensemble_type == "bagging":
+     estimator_ = estimator_model(model_type)
+     test_loss = []
+     train_loss = []
+     for i in range(1, estimator_number + 1):
+         model = BaggingRegressor(estimator=estimator_, n_estimators=i, random_state=45)
+         model.fit(X_train, y_train)
+         y_pred = model.predict(X_test)
+         test_loss.append(mean_squared_error(y_test, y_pred))
+         y_pred = model.predict(X_train)
+         train_loss.append(mean_squared_error(y_train, y_pred))
+     plt.plot(range(1, estimator_number + 1), test_loss, label="test loss")
+     plt.plot(range(1, estimator_number + 1), train_loss, label="train loss")
+ elif ensemble_type == "gradient boosting":
+     test_loss = []
+     for i in range(1, estimator_number + 1):
+         model = GradientBoostingRegressor(n_estimators=i, learning_rate=0.1, random_state=45)
+         model.fit(X_train, y_train)
+         y_pred = model.predict(X_test)
+         test_loss.append(mean_squared_error(y_test, y_pred))
+     plt.plot(range(1, estimator_number + 1), test_loss, label="test loss")
+ elif ensemble_type == "boosting":
+     test_loss = []
+     estimator_ = estimator_model(model_type)
+     for i in range(1, estimator_number + 1):
+         model = AdaBoostRegressor(estimator=estimator_, n_estimators=i, random_state=45)
+         model.fit(X_train, y_train)
+         y_pred = model.predict(X_test)
+         test_loss.append(mean_squared_error(y_test, y_pred))
+     plt.plot(range(1, estimator_number + 1), test_loss, label="test loss")
+
+ plt.legend()
+ plt.title("Loss plot")
+ plt.xlabel("n_estimators")
+ plt.ylabel("MSE")
+ st.pyplot(fig)
+
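+ # 'Magic' button: choose the best base model by 5-fold cross-validated MSE on
+ # the training split, refit it on the full training set, and plot predicted
+ # vs. actual targets on the test split (first feature on the x-axis).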
+ if st.button('Magic'):
+     loss = []
+     n_splits = 5
+     opts = ['Linear regressor', 'Ridge regressor', 'Lasso regressor', 'SVR']
+     for opt in opts:
+         kf = KFold(n_splits=n_splits, shuffle=True, random_state=32)
+         cv_scores = []
+         for train_index, val_index in kf.split(X_train):
+             model = estimator_model(opt)
+             X_train_cv, X_val_cv = X_train[train_index], X_train[val_index]
+             y_train_cv, y_val_cv = y_train[train_index], y_train[val_index]
+             model.fit(X_train_cv, y_train_cv)
+             y_val_pred = model.predict(X_val_cv)
+             cv_scores.append(mean_squared_error(y_val_cv, y_val_pred))
+         loss.append(np.mean(cv_scores))
+     best_model = estimator_model(opts[np.argmin(loss)])
+     best_model.fit(X_train, y_train)
+     y_pred = best_model.predict(X_test)
+     fig = plt.figure()
+     plt.title(f"Best model: {opts[np.argmin(loss)]}")
+     plt.scatter(X_test[:, 0], y_pred, label="predicted")
+     plt.scatter(X_test[:, 0], y_test, label="actual")
+     plt.legend()
+     st.pyplot(fig)