thov committed
Commit 8496c78
Parent: 8bca716

add 1D & 2D partial dependence plots for regression tasks

Files changed (2)
  1. autoML.py +73 -4
  2. requirements.txt +1 -0
autoML.py CHANGED
@@ -1,10 +1,13 @@
 import os
+from itertools import combinations
+
 import streamlit as st
 import pandas as pd
 import numpy as np
 from flaml import AutoML
 from flaml.automl.data import get_output_from_log
 import pickle
+import matplotlib.pyplot as plt
 import plotly.express as px
 import base64
 import time
@@ -13,6 +16,7 @@ from sklearn.pipeline import Pipeline
 from sklearn.impute import SimpleImputer
 from sklearn.preprocessing import StandardScaler, OneHotEncoder
 from sklearn.inspection import permutation_importance
+from sklearn.inspection import PartialDependenceDisplay
 
 def autoML(csv, task, budget, label, metric_to_minimize_class, metric_to_minimize_reg):
 
@@ -21,7 +25,6 @@ def autoML(csv, task, budget, label, metric_to_minimize_class, metric_to_minimize_reg):
     time.sleep(0.5)
 
     df = pd.read_csv(csv)
-    print(df)
     df_features = df[df.columns.difference([label])]
     y = df[label]
 
@@ -82,7 +85,7 @@ def autoML(csv, task, budget, label, metric_to_minimize_class, metric_to_minimize_reg):
     time.sleep(0.5)
     my_bar.empty()
 
-    tab1, tab2 = st.tabs(["AutoML", "Best Model"])
+    tab1, tab2, tab3 = st.tabs(["AutoML", "Best Model", "Partial Dependence"])
 
     with tab1:
 
@@ -144,8 +147,8 @@ def autoML(csv, task, budget, label, metric_to_minimize_class, metric_to_minimize_reg):
 
         df_features_importance = pd.DataFrame({'features name': df_features.columns,
                                                'features importance': perm_importance["importances_mean"],
-                                               'std error': perm_importance["importances_std"]})
-
+                                               'std error': perm_importance["importances_std"]}).sort_values('features importance', ascending=True)
+
         fig_features = px.bar(df_features_importance,
                               x='features importance',
                               y='features name',
@@ -163,5 +166,71 @@ def autoML(csv, task, budget, label, metric_to_minimize_class, metric_to_minimize_reg):
 
         download_model(automl)
 
+    with tab3:
+        with st.container():
+            st.subheader('1D Partial Dependence for the three most important features')
+
+            l_col_1D = list(st.columns((1, 1, 1)))
+
+            common_params = {
+                "subsample": 25,
+                "n_jobs": 2,
+                "grid_resolution": 20,
+                "random_state": 0
+            }
+
+            most_important_features = list(df_features_importance.iloc[-3:]['features name'])
+
+            for i, col in enumerate(l_col_1D):
+                with col:
+                    features_info = {
+                        "features": [most_important_features[i]],
+                        "kind": "average",
+                        "categorical_features": cat_cols
+                    }
+
+                    _, ax = plt.subplots(ncols=1, constrained_layout=True)
+                    display = PartialDependenceDisplay.from_estimator(
+                        pipeline,
+                        df_features,
+                        **features_info,
+                        ax=ax,
+                        **common_params,
+                    )
+
+                    st.pyplot(display.figure_)
+
+
+        st.divider()
+
+        with st.container():
+            st.subheader('2D Partial Dependence for the three most important features')
+
+            l_col_2D = list(st.columns((1, 1, 1)))
+
+            most_important_features_comb = list(combinations(most_important_features, 2))
+
+            for i, col in enumerate(l_col_2D):
+                with col:
+                    features_info = {
+                        "features": [most_important_features_comb[i]],
+                        "kind": "average"
+                    }
+
+                    _, ax = plt.subplots(ncols=1, constrained_layout=True)
+
+                    with st.spinner(f'Computing partial dependence for {most_important_features_comb[i][0]} and {most_important_features_comb[i][1]}'):
+
+                        display = PartialDependenceDisplay.from_estimator(
+                            pipeline,
+                            df_features,
+                            **features_info,
+                            ax=ax,
+                            **common_params,
+                        )
+
+                    st.pyplot(display.figure_)
+
+
     if os.path.isfile('datasets/temp_file.csv'):
         os.remove('datasets/temp_file.csv')
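
For reference, the scikit-learn pattern the new "Partial Dependence" tab relies on can be exercised outside Streamlit. The sketch below is illustrative only: a synthetic dataset and a RandomForestRegressor stand in for the app's uploaded CSV and FLAML pipeline, and the "top three" features are hard-coded instead of being ranked by permutation importance. Two-way plots only support kind="average" (ICE curves are one-dimensional).

from itertools import combinations

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay

# Toy regression problem standing in for the uploaded CSV.
X, y = make_regression(n_samples=200, n_features=4, noise=0.1, random_state=0)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(4)])
model = RandomForestRegressor(random_state=0).fit(X, y)

# Same knobs as the common_params dict in the diff above.
common_params = {"subsample": 25, "n_jobs": 2, "grid_resolution": 20, "random_state": 0}

# The app derives these from permutation importance; hard-coded here.
top_features = ["f0", "f1", "f2"]

# 1D partial dependence: one plot per feature.
for feature in top_features:
    _, ax = plt.subplots(constrained_layout=True)
    PartialDependenceDisplay.from_estimator(
        model, X, features=[feature], kind="average", ax=ax, **common_params
    )

# 2D partial dependence: one plot per pair of features.
for pair in combinations(top_features, 2):
    _, ax = plt.subplots(constrained_layout=True)
    PartialDependenceDisplay.from_estimator(
        model, X, features=[pair], kind="average", ax=ax, **common_params
    )

plt.show()

In the app itself, pipeline appears to wrap the imputation and encoding steps, so df_features can be passed in unencoded; the synthetic frame here is purely numeric.
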
requirements.txt CHANGED
@@ -4,3 +4,4 @@ numpy
 scikit-learn
 flaml[automl]
 plotly
+matplotlib