Spaces:
Runtime error
Runtime error
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
from sklearn.datasets import make_regression | |
from sklearn.model_selection import train_test_split, KFold | |
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor, AdaBoostRegressor | |
from sklearn.linear_model import LinearRegression, Lasso, Ridge | |
from sklearn.svm import SVR | |
from sklearn.metrics import mean_squared_error | |
import matplotlib.pyplot as plt | |
# Column label and source archive consumed by load_data().
DATE_COLUMN = 'date/time'
DATA_URL = 'https://s3-us-west-2.amazonaws.com/streamlit-demo-data/uber-raw-data-sep14.csv.gz'

# Page heading rendered at the top of the Streamlit app.
st.title('Boosting in Regression')
def load_data(nrows):
    """Read the first ``nrows`` rows of the CSV located at ``DATA_URL``.

    Column labels are lower-cased, then the ``DATE_COLUMN`` column is
    converted with ``pd.to_datetime`` before the frame is returned.

    NOTE(review): nothing in the visible part of this file calls this
    function -- confirm whether it is still needed.
    """
    frame = pd.read_csv(DATA_URL, nrows=nrows)
    # Lower-case every label (stringifying it first) so that the lookup of
    # DATE_COLUMN below does not depend on the header's capitalisation.
    frame.rename(columns=lambda label: str(label).lower(), inplace=True)
    frame[DATE_COLUMN] = pd.to_datetime(frame[DATE_COLUMN])
    return frame
def make_data(dataset_option):
    """Generate a synthetic regression dataset for a selectbox label.

    Only the first whitespace-separated token of ``dataset_option`` is
    inspected ("100", "200" or "150"); any other token falls back to
    ``make_regression(random_state=10)`` with the library's default sizes.

    Returns the ``(X, y)`` pair produced by ``make_regression``.
    Raises ``IndexError`` when ``dataset_option`` contains no token at all.
    """
    presets = {
        "100": {"n_samples": 100, "n_features": 10, "n_informative": 2, "random_state": 2},
        "200": {"n_samples": 200, "n_features": 5, "n_informative": 2, "random_state": 4},
        "150": {"n_samples": 150, "n_features": 7, "n_informative": 2, "random_state": 2},
    }
    size_token = dataset_option.split()[0]
    settings = presets.get(size_token, {"random_state": 10})
    features, target = make_regression(**settings)
    return features, target
def estimator_model(estimator_type):
    """Return a new, unfitted regressor for the given display label.

    Recognised labels are "Linear regressor", "Ridge regressor",
    "Lasso regressor" and "SVR"; each is built with its default
    constructor arguments. Any other value yields ``LinearRegression()``.
    """
    choices = (
        ("Linear regressor", LinearRegression),
        ("Ridge regressor", Ridge),
        ("Lasso regressor", Lasso),
        ("SVR", SVR),
    )
    for label, factory in choices:
        if estimator_type == label:
            return factory()
    # Unrecognised labels fall back to ordinary least squares.
    return LinearRegression()
# --- Dataset selection and preview ------------------------------------------
# make_data() only interprets the leading sample count of each label.
options = [
    '100 samples with 10 features and 1 target',
    '200 samples with 5 features and 1 target',
    '150 samples with 7 features and 1 target',
]
dataset_option = st.selectbox('Select dataset size:', options)
X, y = make_data(dataset_option)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=4
)

# Preview: the target plotted against the first feature column only.
fig, ax = plt.subplots()
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.set_title("Dataset")
ax.scatter(X[:, 0], y)
st.pyplot(fig)
# Streamlit re-executes this script on every widget interaction; close the
# figure so pyplot's global figure registry does not keep growing.
plt.close(fig)
# --- Ensemble configuration --------------------------------------------------
options = ['Linear regressor', 'Ridge regressor', 'Lasso regressor', 'SVR']
model_type = st.selectbox('Select model type to use:', options)
options = ['boosting', 'bagging', 'gradient descent']
ensemble_type = st.selectbox('Select the ensemble type:', options)
estimator_number = st.slider('n_estimators', 1, 20, 4)


def _fit_models(build_model, sizes, features, targets):
    """Fit ``build_model(n)`` on (features, targets) for every n in ``sizes``."""
    return [build_model(n).fit(features, targets) for n in sizes]


def _mse_curve(models, features, targets):
    """Return the mean squared error of each fitted model on (features, targets)."""
    return [mean_squared_error(targets, m.predict(features)) for m in models]


# Base learner chosen in the "model type" selectbox. It is handed to the
# ensembles as the first positional argument because the keyword for that
# parameter was renamed (base_estimator -> estimator) across scikit-learn
# releases, while its position stayed the same.
estimator_ = estimator_model(model_type)

# One factory per ensemble type, each taking the number of estimators.
# GradientBoostingRegressor builds its own regression trees and has no
# base-learner parameter, so the selected model type does not apply to it.
builders = {
    "bagging": lambda n: BaggingRegressor(
        estimator_, n_estimators=n, random_state=45
    ),
    "gradient descent": lambda n: GradientBoostingRegressor(
        n_estimators=n, learning_rate=0.1, random_state=45
    ),
    # Seeded like the other two ensembles so reruns draw the same curve.
    "boosting": lambda n: AdaBoostRegressor(
        estimator_, n_estimators=n, random_state=45
    ),
}

# Include the value picked on the slider itself; stopping one short would
# leave the curve empty when the slider is set to 1.
sizes = range(1, estimator_number + 1)

fig, ax = plt.subplots()
build_model = builders.get(ensemble_type)
if build_model is not None:
    fitted_models = _fit_models(build_model, sizes, X_train, y_train)
    test_loss = _mse_curve(fitted_models, X_test, y_test)
    ax.plot(sizes, test_loss, label="test loss")
    if ensemble_type == "bagging":
        # The training curve is only shown for bagging.
        train_loss = _mse_curve(fitted_models, X_train, y_train)
        ax.plot(sizes, train_loss, label="train loss")
    ax.legend()
ax.set_title("loss plot")
ax.set_xlabel("n_estimators")
ax.set_ylabel("loss")
st.pyplot(fig)
# The script is re-run on every interaction; release the figure afterwards.
plt.close(fig)
# --- "Magic": pick the base regressor with the lowest cross-validated MSE ----
if st.button('Magic'):
    n_splits = 5
    opts = ['Linear regressor', 'Ridge regressor', 'Lasso regressor', 'SVR']
    # A single splitter is enough: with an integer random_state every call to
    # split() yields the same shuffled folds, so all candidates are scored on
    # identical train/validation partitions.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=32)

    loss = []
    for opt in opts:
        cv_scores = []
        for train_index, val_index in kf.split(X_train):
            # Fresh, unfitted model for every fold.
            model = estimator_model(opt)
            model.fit(X_train[train_index], y_train[train_index])
            y_val_pred = model.predict(X_train[val_index])
            cv_scores.append(mean_squared_error(y_train[val_index], y_val_pred))
        loss.append(np.mean(cv_scores))

    # Resolve the winner once and reuse it for both the refit and the title.
    best_name = opts[int(np.argmin(loss))]
    best_model = estimator_model(best_name)
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)

    # Predictions and true targets on the held-out split, both plotted
    # against the first feature column.
    fig, ax = plt.subplots()
    ax.set_title(f"Best model fit is of {best_name}")
    ax.scatter(X_test[:, 0], y_pred, label="predicted")
    ax.scatter(X_test[:, 0], y_test, label="actual")
    ax.legend()
    st.pyplot(fig)
    # Release the figure; the script is re-run on every interaction.
    plt.close(fig)