import streamlit as st
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

st.title('Boosting in Regression')


@st.cache_data
def make_data(dataset_option):
    """Generate the synthetic regression dataset matching the selected option."""
    opt = dataset_option.split()[0]
    if opt == "100":
        X, y = make_regression(n_samples=100, n_features=10, n_informative=2, random_state=2)
    elif opt == "200":
        X, y = make_regression(n_samples=200, n_features=5, n_informative=2, random_state=4)
    elif opt == "150":
        X, y = make_regression(n_samples=150, n_features=7, n_informative=2, random_state=2)
    else:
        X, y = make_regression(random_state=10)
    return X, y


def estimator_model(estimator_type):
    """Return an unfitted base estimator for the selected model type."""
    if estimator_type == "Linear regressor":
        model = LinearRegression()
    elif estimator_type == "Ridge regressor":
        model = Ridge()
    elif estimator_type == "Lasso regressor":
        model = Lasso()
    elif estimator_type == "SVR":
        model = SVR()
    else:
        model = LinearRegression()
    return model


options = ['100 samples with 10 features and 1 target',
           '200 samples with 5 features and 1 target',
           '150 samples with 7 features and 1 target']
dataset_option = st.selectbox('Select dataset size:', options)
X, y = make_data(dataset_option)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=4)

# Preview the data: first feature against the target.
fig = plt.figure()
plt.xlabel("x")
plt.ylabel("y")
plt.title("Dataset")
plt.scatter(X[:, 0], y)
st.pyplot(fig)

options = ['Linear regressor', 'Ridge regressor', 'Lasso regressor', 'SVR']
model_type = st.selectbox('Select base estimator:', options)

options = ['boosting', 'bagging', 'gradient boosting']
ensemble_type = st.selectbox('Select the ensemble type:', options)

estimator_number = st.slider('n_estimators', 1, 20, 4)

# Refit the chosen ensemble for every size up to the slider value and record
# the mean squared error, so the loss curve shows how error changes as
# estimators are added.
fig = plt.figure()
if ensemble_type == "bagging":
    estimator_ = estimator_model(model_type)
    test_loss = []
    train_loss = []
    for i in range(1, estimator_number + 1):
        # estimator= requires scikit-learn >= 1.2 (it replaced base_estimator).
        model = BaggingRegressor(estimator=estimator_, n_estimators=i, random_state=45)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        test_loss.append(mean_squared_error(y_test, y_pred))
        y_pred = model.predict(X_train)
        train_loss.append(mean_squared_error(y_train, y_pred))
    plt.plot(range(1, estimator_number + 1), test_loss, label="test loss")
    plt.plot(range(1, estimator_number + 1), train_loss, label="train loss")
elif ensemble_type == "gradient boosting":
    # GradientBoostingRegressor always boosts its own regression trees, so the
    # selected base estimator is not used in this branch.
    test_loss = []
    for i in range(1, estimator_number + 1):
        model = GradientBoostingRegressor(n_estimators=i, learning_rate=0.1, random_state=45)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        test_loss.append(mean_squared_error(y_test, y_pred))
    plt.plot(range(1, estimator_number + 1), test_loss, label="test loss")
elif ensemble_type == "boosting":
    test_loss = []
    estimator_ = estimator_model(model_type)
    for i in range(1, estimator_number + 1):
        model = AdaBoostRegressor(estimator=estimator_, n_estimators=i, random_state=45)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
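        # Record the test MSE at this ensemble size; the curve plotted below
        # shows how AdaBoost's error evolves as weak learners are added.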
        test_loss.append(mean_squared_error(y_test, y_pred))
    plt.plot(range(1, estimator_number + 1), test_loss, label="test loss")
plt.legend()
plt.title("Loss vs n_estimators")
plt.xlabel("n_estimators")
plt.ylabel("MSE")
st.pyplot(fig)

if st.button('Magic'):
    # Compare the candidate base estimators with 5-fold cross-validation on the
    # training set, then refit the best one on the full training data.
    loss = []
    n_splits = 5
    opts = ['Linear regressor', 'Ridge regressor', 'Lasso regressor', 'SVR']
    for opt in opts:
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=32)
        cv_scores = []
        for train_index, val_index in kf.split(X_train):
            model = estimator_model(opt)
            X_train_cv, X_val_cv = X_train[train_index], X_train[val_index]
            y_train_cv, y_val_cv = y_train[train_index], y_train[val_index]
            model.fit(X_train_cv, y_train_cv)
            y_val_pred = model.predict(X_val_cv)
            cv_scores.append(mean_squared_error(y_val_cv, y_val_pred))
        loss.append(np.mean(cv_scores))

    best_name = opts[np.argmin(loss)]
    best_model = estimator_model(best_name)
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)

    fig = plt.figure()
    plt.title(f"Best model by cross-validation: {best_name}")
    plt.scatter(X_test[:, 0], y_pred, label="predicted")
    plt.scatter(X_test[:, 0], y_test, label="actual")
    plt.xlabel("x")
    plt.ylabel("y")
    plt.legend()
    st.pyplot(fig)
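
# ---------------------------------------------------------------------------
# Optional sketch (not wired into the controls above): GradientBoostingRegressor
# can report the test error after every boosting stage from a single fit via
# staged_predict, which avoids refitting the ensemble once per n_estimators
# value. The expander label and variable names below are illustrative only.
# ---------------------------------------------------------------------------
with st.expander("Sketch: staged loss curve from a single gradient-boosting fit"):
    staged_model = GradientBoostingRegressor(n_estimators=estimator_number,
                                             learning_rate=0.1, random_state=45)
    staged_model.fit(X_train, y_train)
    # staged_predict yields the ensemble's prediction after each boosting stage.
    staged_test_loss = [mean_squared_error(y_test, y_stage)
                        for y_stage in staged_model.staged_predict(X_test)]
    fig = plt.figure()
    plt.plot(range(1, estimator_number + 1), staged_test_loss, label="test loss")
    plt.xlabel("n_estimators")
    plt.ylabel("MSE")
    plt.title("Staged test loss (single fit)")
    plt.legend()
    st.pyplot(fig)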