|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
from flaml import AutoML |
|
from flaml.automl.data import get_output_from_log |
|
import pickle |
|
import plotly.express as px |
|
import base64 |
|
import time |
|
|
|
from utils import csv_to_featuers_list, pre_process_df, pre_process_features |
|
|
|
def _resolve_task(task, metric_to_minimize_class, metric_to_minimize_reg):
    """Return ``(flaml_task, log_file, metric)`` for the UI task label.

    Raises:
        ValueError: if ``task`` is neither 'Classification' nor 'Regression'.
    """
    if task == 'Classification':
        return 'classification', 'classlog.log', metric_to_minimize_class
    if task == 'Regression':
        return 'regression', 'reglog.log', metric_to_minimize_reg
    raise ValueError(
        f"Unsupported task {task!r}; expected 'Classification' or 'Regression'."
    )


def _standardize(features):
    """Z-score every column of ``features`` (zero mean, unit std)."""
    # A constant column has std == 0; dividing by it would turn the whole
    # column into NaN, so divide by 1 instead (the column becomes all zeros).
    spread = features.std().replace(0, 1)
    return (features - features.mean()) / spread


def _feature_importance_frame(estimator, training_columns):
    """Build the feature-importance table for ``estimator``.

    Returns ``None`` when the estimator exposes no ``feature_importances_``.
    """
    importances = getattr(estimator, 'feature_importances_', None)
    if importances is None:
        return None

    # ``feature_name_`` is LightGBM-specific; scikit-learn style estimators
    # expose ``feature_names_in_`` instead. Try both before falling back.
    names = None
    for attribute in ('feature_name_', 'feature_names_in_'):
        candidate = getattr(estimator, attribute, None)
        if candidate is not None and len(candidate) == len(importances):
            names = list(candidate)
            break

    if names is None:
        # NOTE(review): assumes the estimator saw the columns in training
        # order -- confirm if FLAML reorders features during preprocessing.
        columns = list(training_columns)
        if len(columns) == len(importances):
            names = columns
        else:
            names = [f'feature_{position}' for position in range(len(importances))]

    return pd.DataFrame({'features name': names, 'features importance': importances})


def autoML(csv, task, budget, label, metric_to_minimize_class, metric_to_minimize_reg):
    """Train a FLAML AutoML model on a CSV file and render the results.

    The CSV is loaded, passed through ``pre_process_df``, and every column
    except ``label`` is standardized before fitting. Results are shown in two
    Streamlit tabs (search history and best model), followed by a download
    link for the pickled ``AutoML`` object.

    Args:
        csv: Path or file-like object accepted by ``pandas.read_csv``.
        task: Either 'Classification' or 'Regression'.
        budget: Training time budget in seconds (converted with ``int``).
        label: Name of the target column.
        metric_to_minimize_class: Metric passed to FLAML for classification.
        metric_to_minimize_reg: Metric passed to FLAML for regression.

    Raises:
        ValueError: if ``task`` is not one of the two supported labels.
    """
    # Validate the task up front so a bad value fails before any work is done.
    flaml_task, log_file, metric = _resolve_task(
        task, metric_to_minimize_class, metric_to_minimize_reg
    )
    time_budget = int(budget)

    progress_text = "Training in progress. Please wait."
    my_bar = st.progress(0, text=progress_text)
    time.sleep(0.5)

    df = pre_process_df(pd.read_csv(csv))
    df_features = _standardize(df[df.columns.difference([label])])
    y = df[label]

    my_bar.progress(50, text=progress_text)

    automl = AutoML()
    automl.fit(
        df_features,
        y,
        time_budget=time_budget,
        metric=metric,
        task=flaml_task,
        log_file_name=log_file,
        early_stop=True,
        eval_method="holdout",
    )

    my_bar.progress(100, text=progress_text)
    time.sleep(0.5)
    my_bar.empty()

    tab1, tab2 = st.tabs(["AutoML", "Best Model"])

    with tab1:
        # Read the log over a window derived from the actual budget instead of
        # a fixed 120 s, which truncated the history of longer runs. Trials
        # that start before the budget expires may finish after it, so read
        # past the nominal budget; 120 keeps the previous minimum window.
        log_window = max(120, 2 * time_budget)
        time_history, best_valid_loss_history, _, config_history, _ = get_output_from_log(
            filename=log_file, time_budget=log_window
        )

        learners = [record.get('Current Learner') for record in config_history]
        hyper_params = [record.get('Current Hyper-parameters') for record in config_history]

        # NOTE(review): the score is displayed as ``1 - loss``; confirm this
        # conversion is appropriate for every metric offered in the UI
        # (it may not be for error metrics such as rmse or log_loss).
        df_res = pd.DataFrame({
            'time': time_history,
            metric: 1 - np.array(best_valid_loss_history),
            'model': learners,
        })

        fig = px.line(
            df_res,
            title='evolution of best models found by AutoML',
            x='time',
            y=metric,
            hover_name='model',
            line_shape='hv',
            range_y=[0, 1],
        )
        st.plotly_chart(fig, theme="streamlit")

        df_models = pd.concat(
            (pd.DataFrame({'learner': learners}), pd.DataFrame(hyper_params)),
            axis=1,
        )

        def highlight_last_row(column):
            """Style the last cell of ``column`` with a yellow background."""
            last = len(column) - 1
            return [
                'background-color: yellow' if position == last else ''
                for position, _ in enumerate(column)
            ]

        st.dataframe(df_models.style.apply(highlight_last_row, axis=0))

        st.write('Estimator tested')
        st.table(automl.estimator_list)

    with tab2:
        st.header('Best Model')

        best_model = automl.model
        if best_model is None:
            # Nothing to report if the search ended without a trained model.
            st.warning('No model could be trained within the time budget.')
        else:
            st.text(best_model.estimator)

            col1, col2, col3 = st.columns((1, 1, 1))
            with col1:
                # Label the card with the metric actually optimized rather
                # than a fixed "r2_score", matching the chart's y axis.
                st.metric(label=str(metric), value=round(1 - automl.best_loss, 2))
            with col2:
                st.metric(label="Time to find", value=str(round(automl.time_to_find_best_model, 2))+' sec')
            with col3:
                st.metric(label="Time to train", value=str(round(automl.best_config_train_time, 2))+' sec')

            df_features_importance = _feature_importance_frame(
                best_model.estimator, df_features.columns
            )

            st.divider()
            if df_features_importance is None:
                st.info('The best model does not expose feature importances.')
            else:
                fig_features = px.bar(
                    df_features_importance, x='features importance', y='features name'
                )
                st.plotly_chart(fig_features, theme="streamlit")

    def download_model(model):
        """Render a link that downloads ``model`` as a pickled ``automl.pkl``."""
        output_model = pickle.dumps(model)
        b64 = base64.b64encode(output_model).decode()
        href = f'<a href="data:file/output_model;base64,{b64}" download="automl.pkl">Download Trained Model File (.pkl)</a>'
        st.markdown(href, unsafe_allow_html=True)

    # NOTE(review): rendered below the tabs; move into ``with tab2:`` if the
    # link should only appear on the "Best Model" tab.
    download_model(automl)