import streamlit as st
import pandas as pd
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
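# Streamlit demo app: build a synthetic regression dataset, compare ensemble
# regressors (bagging, AdaBoost, gradient boosting) as n_estimators grows, and
# cross-validate four base regressors to plot the best one's test-set predictions.
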
st.title('Boosting in Regression')
# Leftover Uber-pickups demo constants and loader; load_data is not called
# anywhere in this app.
DATE_COLUMN = 'date/time'
DATA_URL = ('https://s3-us-west-2.amazonaws.com/'
            'streamlit-demo-data/uber-raw-data-sep14.csv.gz')

@st.cache_data
def load_data(nrows):
    data = pd.read_csv(DATA_URL, nrows=nrows)
    lowercase = lambda x: str(x).lower()
    data.rename(lowercase, axis='columns', inplace=True)
    data[DATE_COLUMN] = pd.to_datetime(data[DATE_COLUMN])
    return data

@st.cache_data
def make_data(dataset_option):
    # The leading token of the option string ("100", "200", "150") picks the
    # synthetic regression dataset.
    opt = dataset_option.split()[0]
    if opt == "100":
        X, y = make_regression(n_samples=100, n_features=10, n_informative=2,
                               random_state=2)
    elif opt == "200":
        X, y = make_regression(n_samples=200, n_features=5, n_informative=2,
                               random_state=4)
    elif opt == "150":
        X, y = make_regression(n_samples=150, n_features=7, n_informative=2,
                               random_state=2)
    else:
        X, y = make_regression(random_state=10)
    return X, y

def estimator_model(estimator_type):
    # Map the UI choice to a scikit-learn regressor; fall back to LinearRegression.
    if estimator_type == "Linear regressor":
        model = LinearRegression()
    elif estimator_type == "Ridge regressor":
        model = Ridge()
    elif estimator_type == "Lasso regressor":
        model = Lasso()
    elif estimator_type == "SVR":
        model = SVR()
    else:
        model = LinearRegression()
    return model

options = ['100 samples with 10 features and 1 target', '200 samples with 5 features and 1 target', '150 samples with 7 features and 1 target']
dataset_option = st.selectbox('Select dataset size:', options)
X, y = make_data(dataset_option)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=4)
fig = plt.figure()
plt.xlabel("x")
plt.ylabel("y")
plt.title("Dataset")
plt.scatter(X[:,0], y)
st.pyplot(fig)
options = ['Linear regressor', 'Ridge regressor', 'Lasso regressor', 'SVR']
model_type = st.selectbox('Select model type to use:', options)
options = ['AdaBoost', 'bagging', 'gradient boosting']
ensemble_type = st.selectbox('Select the ensemble type:', options)
estimator_number = st.slider('n_estimators', 1, 20, 4)
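# For the chosen ensemble, fit models with 1..n_estimators members and plot the
# test-set MSE (plus training MSE for bagging) against the ensemble size.
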
fig = plt.figure()
if ensemble_type == "bagging":
    estimator_ = estimator_model(model_type)
    test_loss = []
    train_loss = []
    for i in range(1, estimator_number + 1):
        # estimator= needs scikit-learn >= 1.2 (earlier releases call it base_estimator)
        model = BaggingRegressor(estimator=estimator_, n_estimators=i, random_state=45)
        model.fit(X_train, y_train)
        test_loss.append(mean_squared_error(y_test, model.predict(X_test)))
        train_loss.append(mean_squared_error(y_train, model.predict(X_train)))
    plt.plot(range(1, estimator_number + 1), test_loss, label="test loss")
    plt.plot(range(1, estimator_number + 1), train_loss, label="train loss")
elif ensemble_type == "gradient boosting":
    # GradientBoostingRegressor always boosts regression trees, so the base-model
    # selection above does not apply to this branch.
    test_loss = []
    for i in range(1, estimator_number + 1):
        model = GradientBoostingRegressor(n_estimators=i, learning_rate=0.1, random_state=45)
        model.fit(X_train, y_train)
        test_loss.append(mean_squared_error(y_test, model.predict(X_test)))
    plt.plot(range(1, estimator_number + 1), test_loss, label="test loss")
elif ensemble_type == "AdaBoost":
    test_loss = []
    estimator_ = estimator_model(model_type)
    for i in range(1, estimator_number + 1):
        # estimator= needs scikit-learn >= 1.2; random_state matches the other branches
        model = AdaBoostRegressor(estimator=estimator_, n_estimators=i, random_state=45)
        model.fit(X_train, y_train)
        test_loss.append(mean_squared_error(y_test, model.predict(X_test)))
    plt.plot(range(1, estimator_number + 1), test_loss, label="test loss")
plt.legend()
plt.title("loss plot")
plt.xlabel("n_estimators")
plt.ylabel("loss")
st.pyplot(fig)
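
# "Magic" button: run 5-fold cross-validation of each base regressor on the
# training split, refit the one with the lowest mean CV MSE on all training data,
# and plot its test-set predictions against the true targets.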
if st.button('Magic'):
    loss = []
    n_splits = 5
    opts = ['Linear regressor', 'Ridge regressor', 'Lasso regressor', 'SVR']
    for opt in opts:
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=32)
        cv_scores = []
        for train_index, val_index in kf.split(X_train):
            model = estimator_model(opt)
            X_train_cv, X_val_cv = X_train[train_index], X_train[val_index]
            y_train_cv, y_val_cv = y_train[train_index], y_train[val_index]
            model.fit(X_train_cv, y_train_cv)
            y_val_pred = model.predict(X_val_cv)
            cv_scores.append(mean_squared_error(y_val_cv, y_val_pred))
        loss.append(np.mean(cv_scores))
    # Refit the winner on the full training split and plot its test predictions.
    best_model = estimator_model(opts[np.argmin(loss)])
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)
    fig = plt.figure()
    plt.title(f"Best model: {opts[np.argmin(loss)]}")
    plt.scatter(X_test[:, 0], y_pred, label="predicted")
    plt.scatter(X_test[:, 0], y_test, label="actual")
    plt.legend()
    st.pyplot(fig)
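    # Optional extra (not in the original app): show the mean CV MSE for each
    # candidate model, so the basis for the "best" pick is visible in the UI.
    st.dataframe(pd.DataFrame({"model": opts, "mean CV MSE": loss}))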