Upload autoML.py
Browse files
autoML.py
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
from flaml import AutoML
|
5 |
+
from flaml.automl.data import get_output_from_log
|
6 |
+
import pickle
|
7 |
+
import plotly.express as px
|
8 |
+
import base64
|
9 |
+
import time
|
10 |
+
|
11 |
+
from utils import csv_to_featuers_list, pre_process_df, pre_process_features
|
12 |
+
|
13 |
+
def autoML(csv, task, budget, label, metric_to_minimize_class, metric_to_minimize_reg):
    """Run a FLAML AutoML search on a CSV file and render the results in Streamlit.

    The CSV is loaded, passed through ``pre_process_df``, every column except
    ``label`` is standardized, and ``AutoML.fit`` is run within ``budget``
    seconds. Two tabs are then drawn: the search history and the best model
    (metrics, feature importances and a download link for the pickled model).

    Args:
        csv: Path or file-like object accepted by ``pandas.read_csv``.
        task: Either ``'Classification'`` or ``'Regression'``.
        budget: Search time budget in seconds; converted with ``int``.
        label: Name of the target column.
        metric_to_minimize_class: FLAML metric used when ``task`` is
            ``'Classification'``.
        metric_to_minimize_reg: FLAML metric used when ``task`` is
            ``'Regression'``.

    Raises:
        ValueError: If ``task`` is not one of the two supported values.

    Note:
        The model is trained on standardized features, but only the AutoML
        object is pickled for download; the scaling statistics are not saved.
    """
    # Resolve the task first so an unsupported value fails before any UI or
    # training work starts (previously it surfaced later as an unbound local).
    automl_settings, log_file, metric = _task_settings(
        task, budget, metric_to_minimize_class, metric_to_minimize_reg
    )

    progress_text = "Training in progress. Please wait."
    my_bar = st.progress(0, text=progress_text)
    time.sleep(0.5)

    df = pd.read_csv(csv)
    # 'Unnamed: 0' only exists when the CSV was saved with its index column;
    # errors='ignore' keeps files without it from raising KeyError.
    df = df.drop(columns='Unnamed: 0', errors='ignore')
    df = pre_process_df(df)
    df_features = _standardize(df[df.columns.difference([label])])
    y = df[label]

    my_bar.progress(50, text=progress_text)

    automl = AutoML()
    automl.fit(df_features, y, **automl_settings)

    my_bar.progress(100, text=progress_text)
    time.sleep(0.5)
    my_bar.empty()

    tab1, tab2 = st.tabs(["AutoML", "Best Model"])

    with tab1:
        _render_search_history(
            automl, log_file, metric, automl_settings["time_budget"]
        )

    with tab2:
        _render_best_model(automl, metric, list(df_features.columns))
        _render_model_download_link(automl)


def _task_settings(task, budget, metric_to_minimize_class, metric_to_minimize_reg):
    """Return ``(fit_settings, log_file_name, metric)`` for the selected task."""
    if task == 'Classification':
        flaml_task = 'classification'
        metric = metric_to_minimize_class
        log_file = 'classlog.log'
    elif task == 'Regression':
        flaml_task = 'regression'
        metric = metric_to_minimize_reg
        log_file = 'reglog.log'
    else:
        raise ValueError(
            f"Unsupported task {task!r}; expected 'Classification' or 'Regression'."
        )

    settings = {
        "time_budget": int(budget),
        "metric": metric,
        "task": flaml_task,
        "log_file_name": log_file,
        "early_stop": True,
        "eval_method": "holdout",
    }
    return settings, log_file, metric


def _standardize(features):
    """Scale each column of ``features`` to zero mean and unit standard deviation."""
    # A constant column has std == 0; dividing by it would turn the whole
    # column into NaN, so such columns are left centered at 0 instead.
    std = features.std().replace(0, 1)
    return (features - features.mean()) / std


def _highlight_last_row(s):
    """Return per-cell CSS for a column, colouring only its last cell yellow."""
    styles = [''] * len(s)
    if styles:
        styles[-1] = 'background-color: yellow'
    return styles


def _render_search_history(automl, log_file, metric, time_budget):
    """Draw the best-loss-over-time chart and the table of tried configurations."""
    # Read the log with the same budget used for training; a fixed cut-off
    # would silently drop every record logged after it on longer runs.
    time_history, best_valid_loss_history, _, config_history, _ = get_output_from_log(
        filename=log_file, time_budget=time_budget
    )

    learners = [config.get('Current Learner') for config in config_history]
    hyper_params = [
        config.get('Current Hyper-parameters') for config in config_history
    ]

    # NOTE(review): ``1 - loss`` equals the score only for metrics that FLAML
    # minimizes as ``1 - score`` (e.g. accuracy, r2); confirm this holds for
    # every metric the caller can pass (it would not for rmse/mae/log_loss).
    df_res = pd.DataFrame({
        'time': time_history,
        metric: 1 - np.array(best_valid_loss_history),
        'model': learners,
    })

    fig = px.line(
        df_res,
        title='evolution of best models found by AutoML',
        x='time',
        y=metric,
        hover_name='model',
        line_shape='hv',
        range_y=[0, 1],
    )
    st.plotly_chart(fig, theme="streamlit")

    df_models = pd.concat(
        (pd.DataFrame({'learner': learners}), pd.DataFrame(hyper_params)),
        axis=1,
    )
    # The last logged row is highlighted in yellow.
    st.dataframe(df_models.style.apply(_highlight_last_row, axis=0))

    st.write('Estimator tested')
    st.table(automl.estimator_list)


def _render_best_model(automl, metric, feature_names):
    """Draw the best estimator, its headline metrics and its feature importances."""
    st.header('Best Model')

    estimator = automl.model.estimator
    st.text(estimator)

    col1, col2, col3 = st.columns((1, 1, 1))

    with col1:
        # Label the tile with the metric that was actually optimized instead
        # of a fixed "r2_score", which was wrong for classification runs.
        st.metric(label=str(metric), value=round(1 - automl.best_loss, 2))
    with col2:
        st.metric(
            label="Time to find",
            value=str(round(automl.time_to_find_best_model, 2)) + ' sec',
        )
    with col3:
        st.metric(
            label="Time to train",
            value=str(round(automl.best_config_train_time, 2)) + ' sec',
        )

    st.divider()

    # Not every learner exposes importances, and ``feature_name_`` is specific
    # to LightGBM; fall back gracefully instead of raising AttributeError.
    importances = getattr(estimator, 'feature_importances_', None)
    if importances is None:
        st.info('Feature importances are not available for this estimator.')
        return

    names = getattr(estimator, 'feature_name_', None)
    if names is None:
        names = getattr(estimator, 'feature_names_in_', None)
    if names is None:
        names = feature_names

    df_features_importance = pd.DataFrame({
        'features name': list(names),
        'features importance': importances,
    })
    fig_features = px.bar(
        df_features_importance, x='features importance', y='features name'
    )
    st.plotly_chart(fig_features, theme="streamlit")


def _render_model_download_link(model):
    """Render an HTML link that downloads ``model`` as a base64-embedded pickle."""
    b64 = base64.b64encode(pickle.dumps(model)).decode()
    href = f'<a href="data:file/output_model;base64,{b64}" download="automl.pkl">Download Trained Model File (.pkl)</a>'
    st.markdown(href, unsafe_allow_html=True)
|