|
import os |
|
from itertools import combinations |
|
|
|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
from flaml import AutoML |
|
from flaml.automl.data import get_output_from_log |
|
import pickle |
|
import matplotlib.pyplot as plt |
|
import plotly.express as px |
|
import base64 |
|
import time |
|
from sklearn.compose import ColumnTransformer |
|
from sklearn.pipeline import Pipeline |
|
from sklearn.impute import SimpleImputer |
|
from sklearn.preprocessing import StandardScaler, OneHotEncoder |
|
from sklearn.inspection import permutation_importance |
|
from sklearn.inspection import PartialDependenceDisplay |
|
import shap |
|
|
|
def autoML(csv, task, budget, label, metric_to_minimize_class, metric_to_minimize_reg):
    """Train a FLAML AutoML pipeline on a CSV file and render the results in Streamlit.

    The CSV is split at random (about 80 % / 20 %) into a training and a test
    part. A preprocessing + AutoML pipeline is fitted on the training part and
    four tabs are rendered: search history, best model, partial dependence
    plots and SHAP values (the latter computed on the test part).

    Args:
        csv: Path or file-like object accepted by ``pandas.read_csv``.
        task: Either ``'Classification'`` or ``'Regression'``.
        budget: Training time budget in seconds (converted with ``int``).
        label: Name of the target column in the CSV.
        metric_to_minimize_class: Metric name used when ``task`` is
            ``'Classification'``.
        metric_to_minimize_reg: Metric name used when ``task`` is
            ``'Regression'``.

    Raises:
        ValueError: If ``task`` is not one of the two supported values, or
            ``budget`` cannot be converted to ``int``.
    """
    # Validate the request first so that a bad task fails immediately instead
    # of after the CSV has been read and the progress bar drawn.
    metric, log, automl_settings = _automl_settings_for_task(
        task, budget, metric_to_minimize_class, metric_to_minimize_reg
    )

    progress_text = "Training in progress. Please wait."
    my_bar = st.progress(0, text=progress_text)
    time.sleep(0.5)

    df = pd.read_csv(csv)

    # Unseeded random row mask: roughly 80 % of rows go to training.
    msk = np.random.rand(len(df)) < 0.8
    df_train, df_test = df[msk], df[~msk]

    df_features = df_train[df_train.columns.difference([label])]
    y = df_train[label]

    my_bar.progress(50, text=progress_text)

    automl = AutoML(**automl_settings)
    pipeline, cat_cols = _build_automl_pipeline(df_features, automl)
    pipeline.fit(df_features, y)

    my_bar.progress(100, text=progress_text)
    time.sleep(0.5)
    my_bar.empty()

    tab1, tab2, tab3, tab4 = st.tabs(["AutoML", "Best Model", "Partial Dependence", "Shap Values"])

    with tab1:
        _show_search_history(automl, log, metric)

    with tab2:
        df_features_importance = _show_best_model(automl, pipeline, df_features, y, metric)

    with tab3:
        _show_partial_dependence(pipeline, df_features, y, cat_cols, df_features_importance)

    with tab4:
        _show_shap_values(pipeline, df_test, label)

    # NOTE(review): the original indentation was lost; this cleanup is assumed
    # to be the final step of the function rather than module-level code.
    if os.path.isfile('datasets/temp_file.csv'):
        os.remove('datasets/temp_file.csv')


def _automl_settings_for_task(task, budget, metric_to_minimize_class, metric_to_minimize_reg):
    """Return ``(metric, log_file_name, settings)`` for ``task`` or raise ``ValueError``."""
    per_task = {
        'Classification': ('classification', metric_to_minimize_class, 'classlog.log'),
        'Regression': ('regression', metric_to_minimize_reg, 'reglog.log'),
    }
    try:
        flaml_task, metric, log = per_task[task]
    except KeyError:
        raise ValueError(
            f"Unsupported task {task!r}; expected 'Classification' or 'Regression'."
        ) from None

    settings = {
        "time_budget": int(budget),
        "metric": metric,
        "task": flaml_task,
        "log_file_name": log,
        "early_stop": True,
        "eval_method": "holdout",
    }
    return metric, log, settings


def _build_automl_pipeline(df_features, automl):
    """Return ``(pipeline, cat_cols)``: preprocessing followed by ``automl``."""
    num_cols = df_features.select_dtypes(include=['float64', 'int64']).columns
    cat_cols = df_features.select_dtypes(include=['object']).columns

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_cols),
            ('cat', categorical_transformer, cat_cols),
        ])

    # The step keeps the name 'classifier' even for regression tasks.
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', automl)])
    return pipeline, cat_cols


def _highlight_last_row(column):
    """Return per-cell CSS for ``column`` that colours only its last cell yellow."""
    styles = [''] * len(column)
    if styles:
        styles[-1] = 'background-color: yellow'
    return styles


def _show_search_history(automl, log, metric):
    """Render the "AutoML" tab: loss history chart, trial table and estimator list."""
    # No time cut-off: the previous hard-coded 120 s dropped every trial logged
    # after two minutes whenever a larger budget was used.
    # NOTE(review): assumes the log file only holds trials of the current run.
    time_history, best_valid_loss_history, _, config_history, _ = get_output_from_log(
        filename=log, time_budget=float("inf")
    )

    learners = [entry.get('Current Learner') for entry in config_history]

    # NOTE(review): `1 - loss` is only the metric value if the loss is reported
    # as `1 - metric`; confirm for every metric the UI offers.
    df_res = pd.DataFrame({'time': time_history,
                           metric: 1 - np.array(best_valid_loss_history),
                           'model': learners,
                           })

    fig = px.line(df_res,
                  title='evolution of best models found by AutoML',
                  x='time',
                  y=metric,
                  hover_name='model',
                  line_shape='hv',
                  range_y=[0, 1])
    st.plotly_chart(fig, theme="streamlit")

    models = pd.DataFrame({'learner': learners})
    df_hp = pd.DataFrame([entry.get('Current Hyper-parameters') for entry in config_history])
    df_models = pd.concat((models, df_hp), axis=1)

    st.dataframe(df_models.style.apply(_highlight_last_row, axis=0))

    st.write('Estimator tested')
    st.table(automl.estimator_list)


def _show_best_model(automl, pipeline, df_features, y, metric):
    """Render the "Best Model" tab and return the permutation-importance table.

    The returned frame is sorted by ascending importance, so its last rows are
    the most important features.
    """
    st.header('Best Model')
    st.text(automl.model.estimator)

    col1, col2, col3 = st.columns((1, 1, 1))
    with col1:
        st.metric(label=metric, value=round(1 - automl.best_loss, 2))
    with col2:
        st.metric(label="Time to find", value=str(round(automl.time_to_find_best_model, 2))+' sec')
    with col3:
        st.metric(label="Time to train", value=str(round(automl.best_config_train_time, 2))+' sec')

    # Importance is measured on the training features the pipeline was fitted on.
    perm_importance = permutation_importance(
        pipeline, df_features, y, n_repeats=8
    )

    df_features_importance = pd.DataFrame({
        'features name': df_features.columns,
        'features importance': perm_importance["importances_mean"],
        'std error': perm_importance["importances_std"],
    }).sort_values('features importance', ascending=True)

    fig_features = px.bar(df_features_importance,
                          x='features importance',
                          y='features name',
                          error_x='std error',
                          height=50*len(df_features_importance))

    st.divider()
    st.plotly_chart(fig_features, theme="streamlit")

    # Offer the pickled AutoML object as an inline base64 download link.
    # NOTE(review): only `automl` is pickled, not the preprocessing pipeline.
    output_model = pickle.dumps(automl)
    b64 = base64.b64encode(output_model).decode()
    href = f'<a href="data:file/output_model;base64,{b64}" download="automl.pkl">Download Trained Model File (.pkl)</a>'
    st.markdown(href, unsafe_allow_html=True)

    return df_features_importance


def _show_partial_dependence(pipeline, df_features, y, cat_cols, df_features_importance):
    """Render 1D and 2D partial dependence plots for the top (up to three) features."""
    common_params = {
        "subsample": 50,
        "n_jobs": 2,
        "grid_resolution": 20,
        "random_state": 0,
    }

    most_important_features = list(df_features_importance.iloc[-3:]['features name'])

    # Loop-invariant, so computed once.
    # NOTE(review): passes the number of distinct labels as `target`; confirm
    # this is the intended class for multiclass problems.
    target = len(set(y))

    with st.container():
        st.subheader('1D Partial Dependance for the three most important features')

        # zip() stops at the shorter sequence, so datasets with fewer than
        # three features leave the spare columns empty instead of raising.
        for col, feature in zip(st.columns((1, 1, 1)), most_important_features):
            with col:
                fig, ax = plt.subplots(ncols=1, constrained_layout=True)
                display = PartialDependenceDisplay.from_estimator(
                    pipeline,
                    df_features,
                    features=[feature],
                    kind="average",
                    categorical_features=cat_cols,
                    target=target,
                    ax=ax,
                    **common_params,
                )
                st.pyplot(display.figure_)
                # Close the figure so it is not kept alive by pyplot.
                plt.close(fig)

    st.divider()

    with st.container():
        st.subheader('2D Partial Dependance for the three most important features')

        most_important_features_comb = list(combinations(most_important_features, 2))

        for col, pair in zip(st.columns((1, 1, 1)), most_important_features_comb):
            with col:
                fig, ax = plt.subplots(ncols=1, constrained_layout=True)

                with st.spinner(f'Compute partial dependeces with {pair[0]} and {pair[1]}'):
                    display = PartialDependenceDisplay.from_estimator(
                        pipeline,
                        df_features,
                        features=[pair],
                        kind="average",
                        target=target,
                        ax=ax,
                        **common_params,
                    )

                st.pyplot(display.figure_)
                plt.close(fig)


def _render_shap_plot(plot_fn, *args, **kwargs):
    """Draw one SHAP plot on a fresh Matplotlib figure and render it explicitly."""
    # Start from an empty figure so SHAP does not draw on a previous plot.
    plt.figure()
    plot_fn(*args, show=False, **kwargs)
    fig = plt.gcf()
    st.pyplot(fig)
    plt.close(fig)


def _show_shap_values(pipeline, df_test, label):
    """Render the "Shap Values" tab using the held-out rows of the CSV."""
    df_features_test = df_test[df_test.columns.difference([label])]

    with st.spinner('Compute Shap Values'):
        explainer = shap.Explainer(pipeline.predict, df_features_test)
        shap_values = explainer(df_features_test)

    # Figures are passed to st.pyplot explicitly, so the global-figure
    # deprecation switch is not needed.
    _render_shap_plot(shap.plots.beeswarm, shap_values)
    _render_shap_plot(shap.summary_plot, shap_values, plot_type='violin')