add 1D & 2D partial dependence plots for regression tasks
Browse files- autoML.py +73 -4
- requirements.txt +1 -0
autoML.py
CHANGED
@@ -1,10 +1,13 @@
|
|
1 |
import os
|
|
|
|
|
2 |
import streamlit as st
|
3 |
import pandas as pd
|
4 |
import numpy as np
|
5 |
from flaml import AutoML
|
6 |
from flaml.automl.data import get_output_from_log
|
7 |
import pickle
|
|
|
8 |
import plotly.express as px
|
9 |
import base64
|
10 |
import time
|
@@ -13,6 +16,7 @@ from sklearn.pipeline import Pipeline
|
|
13 |
from sklearn.impute import SimpleImputer
|
14 |
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
15 |
from sklearn.inspection import permutation_importance
|
|
|
16 |
|
17 |
def autoML(csv, task, budget, label, metric_to_minimize_class, metric_to_minimize_reg):
|
18 |
|
@@ -21,7 +25,6 @@ def autoML(csv, task, budget, label, metric_to_minimize_class, metric_to_minimiz
|
|
21 |
time.sleep(0.5)
|
22 |
|
23 |
df = pd.read_csv(csv)
|
24 |
-
print(df)
|
25 |
df_features = df[df.columns.difference([label])]
|
26 |
y = df[label]
|
27 |
|
@@ -82,7 +85,7 @@ def autoML(csv, task, budget, label, metric_to_minimize_class, metric_to_minimiz
|
|
82 |
time.sleep(0.5)
|
83 |
my_bar.empty()
|
84 |
|
85 |
-
tab1, tab2 = st.tabs(["AutoML", "Best Model"])
|
86 |
|
87 |
with tab1:
|
88 |
|
@@ -144,8 +147,8 @@ def autoML(csv, task, budget, label, metric_to_minimize_class, metric_to_minimiz
|
|
144 |
|
145 |
df_features_importance = pd.DataFrame({'features name': df_features.columns,
|
146 |
'features importance': perm_importance["importances_mean"],
|
147 |
-
'std error': perm_importance["importances_std"]})
|
148 |
-
|
149 |
fig_features = px.bar(df_features_importance,
|
150 |
x='features importance',
|
151 |
y='features name',
|
@@ -163,5 +166,71 @@ def autoML(csv, task, budget, label, metric_to_minimize_class, metric_to_minimiz
|
|
163 |
|
164 |
download_model(automl)
|
165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
if os.path.isfile('datasets/temp_file.csv'):
|
167 |
os.remove('datasets/temp_file.csv')
|
|
|
1 |
import os
|
2 |
+
from itertools import combinations
|
3 |
+
|
4 |
import streamlit as st
|
5 |
import pandas as pd
|
6 |
import numpy as np
|
7 |
from flaml import AutoML
|
8 |
from flaml.automl.data import get_output_from_log
|
9 |
import pickle
|
10 |
+
import matplotlib.pyplot as plt
|
11 |
import plotly.express as px
|
12 |
import base64
|
13 |
import time
|
|
|
16 |
from sklearn.impute import SimpleImputer
|
17 |
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
18 |
from sklearn.inspection import permutation_importance
|
19 |
+
from sklearn.inspection import PartialDependenceDisplay
|
20 |
|
21 |
def autoML(csv, task, budget, label, metric_to_minimize_class, metric_to_minimize_reg):
|
22 |
|
|
|
25 |
time.sleep(0.5)
|
26 |
|
27 |
df = pd.read_csv(csv)
|
|
|
28 |
df_features = df[df.columns.difference([label])]
|
29 |
y = df[label]
|
30 |
|
|
|
85 |
time.sleep(0.5)
|
86 |
my_bar.empty()
|
87 |
|
88 |
+
tab1, tab2, tab3 = st.tabs(["AutoML", "Best Model", "Partial Dependence"])
|
89 |
|
90 |
with tab1:
|
91 |
|
|
|
147 |
|
148 |
df_features_importance = pd.DataFrame({'features name': df_features.columns,
|
149 |
'features importance': perm_importance["importances_mean"],
|
150 |
+
'std error': perm_importance["importances_std"]}).sort_values('features importance', ascending=True)
|
151 |
+
|
152 |
fig_features = px.bar(df_features_importance,
|
153 |
x='features importance',
|
154 |
y='features name',
|
|
|
166 |
|
167 |
download_model(automl)
|
168 |
|
169 |
+
with tab3:
|
170 |
+
with st.container():
|
171 |
+
st.subheader('1D Partial Dependence for the three most important features')
|
172 |
+
|
173 |
+
l_col_1D = list(st.columns((1,1,1)))
|
174 |
+
|
175 |
+
common_params = {
|
176 |
+
"subsample": 25,
|
177 |
+
"n_jobs": 2,
|
178 |
+
"grid_resolution": 20,
|
179 |
+
"random_state": 0
|
180 |
+
}
|
181 |
+
|
182 |
+
most_important_features = list(df_features_importance.iloc[-3:]['features name'])
|
183 |
+
|
184 |
+
for i, col in enumerate(l_col_1D):
|
185 |
+
with col:
|
186 |
+
features_info = {
|
187 |
+
"features": [most_important_features[i]],
|
188 |
+
"kind": "average",
|
189 |
+
"categorical_features": cat_cols
|
190 |
+
}
|
191 |
+
|
192 |
+
_, ax = plt.subplots(ncols=1, constrained_layout=True)
|
193 |
+
display = PartialDependenceDisplay.from_estimator(
|
194 |
+
pipeline,
|
195 |
+
df_features,
|
196 |
+
**features_info,
|
197 |
+
ax=ax,
|
198 |
+
**common_params,
|
199 |
+
)
|
200 |
+
|
201 |
+
st.pyplot(display.figure_)
|
202 |
+
|
203 |
+
|
204 |
+
st.divider()
|
205 |
+
|
206 |
+
with st.container():
|
207 |
+
st.subheader('2D Partial Dependence for the three most important features')
|
208 |
+
|
209 |
+
l_col_2D = list(st.columns((1,1,1)))
|
210 |
+
|
211 |
+
most_important_features_comb = list(combinations(most_important_features, 2))
|
212 |
+
|
213 |
+
for i, col in enumerate(l_col_2D):
|
214 |
+
with col:
|
215 |
+
features_info = {
|
216 |
+
"features": [most_important_features_comb[i]],
|
217 |
+
"kind": "average"
|
218 |
+
}
|
219 |
+
|
220 |
+
_, ax = plt.subplots(ncols=1, constrained_layout=True)
|
221 |
+
|
222 |
+
with st.spinner(f'Computing partial dependencies with {most_important_features_comb[i][0]} and {most_important_features_comb[i][1]}'):
|
223 |
+
|
224 |
+
display = PartialDependenceDisplay.from_estimator(
|
225 |
+
pipeline,
|
226 |
+
df_features,
|
227 |
+
**features_info,
|
228 |
+
ax=ax,
|
229 |
+
**common_params,
|
230 |
+
)
|
231 |
+
|
232 |
+
st.pyplot(display.figure_)
|
233 |
+
|
234 |
+
|
235 |
if os.path.isfile('datasets/temp_file.csv'):
|
236 |
os.remove('datasets/temp_file.csv')
|
requirements.txt
CHANGED
@@ -4,3 +4,4 @@ numpy
|
|
4 |
scikit-learn
|
5 |
flaml[automl]
|
6 |
plotly
|
|
|
|
4 |
scikit-learn
|
5 |
flaml[automl]
|
6 |
plotly
|
7 |
+
matplotlib
|