Théo Villette committed on
Commit
e390497
1 Parent(s): 82c6295
Files changed (2) hide show
  1. app.py +0 -2
  2. autoML.py +41 -12
app.py CHANGED
@@ -1,6 +1,4 @@
1
  import streamlit as st
2
- from flaml.automl.data import get_output_from_log
3
- import plotly.express as px
4
 
5
  from utils import csv_to_featuers_list
6
  from autoML import autoML
 
1
  import streamlit as st
 
 
2
 
3
  from utils import csv_to_featuers_list
4
  from autoML import autoML
autoML.py CHANGED
@@ -7,8 +7,11 @@ import pickle
7
  import plotly.express as px
8
  import base64
9
  import time
10
-
11
- from utils import csv_to_featuers_list, pre_process_df, pre_process_features
 
 
 
12
 
13
  def autoML(csv, task, budget, label, metric_to_minimize_class, metric_to_minimize_reg):
14
 
@@ -17,9 +20,7 @@ def autoML(csv, task, budget, label, metric_to_minimize_class, metric_to_minimiz
17
  time.sleep(0.5)
18
 
19
  df = pd.read_csv(csv)
20
- df = pre_process_df(df)
21
  df_features = df[df.columns.difference([label])]
22
- df_features=(df_features-df_features.mean())/df_features.std()
23
  y = df[label]
24
 
25
  my_bar.progress(50, text=progress_text)
@@ -49,8 +50,31 @@ def autoML(csv, task, budget, label, metric_to_minimize_class, metric_to_minimiz
49
  "eval_method": "holdout"
50
  }
51
 
52
- automl = AutoML()
53
- automl.fit(df_features, y, **automl_settings)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
  my_bar.progress(100, text=progress_text)
56
  time.sleep(0.5)
@@ -112,13 +136,18 @@ def autoML(csv, task, budget, label, metric_to_minimize_class, metric_to_minimiz
112
  with col3:
113
  st.metric(label="Time to train", value=str(round(automl.best_config_train_time, 2))+' sec')
114
 
115
- if automl.best_estimator == 'lgbm':
116
- df_features_importance = pd.DataFrame({'features name': automl.model.estimator.feature_name_, 'features importance': automl.model.estimator.feature_importances_})
117
- fig_features = px.bar(df_features_importance, x='features importance', y='features name')
118
-
119
- st.divider()
120
- st.plotly_chart(fig_features, theme="streamlit")
121
 
 
 
 
 
 
 
 
 
122
 
123
  def download_model(model):
124
  output_model = pickle.dumps(model)
 
7
  import plotly.express as px
8
  import base64
9
  import time
10
+ from sklearn.compose import ColumnTransformer
11
+ from sklearn.pipeline import Pipeline
12
+ from sklearn.impute import SimpleImputer
13
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
14
+ from sklearn.inspection import permutation_importance
15
 
16
  def autoML(csv, task, budget, label, metric_to_minimize_class, metric_to_minimize_reg):
17
 
 
20
  time.sleep(0.5)
21
 
22
  df = pd.read_csv(csv)
 
23
  df_features = df[df.columns.difference([label])]
 
24
  y = df[label]
25
 
26
  my_bar.progress(50, text=progress_text)
 
50
  "eval_method": "holdout"
51
  }
52
 
53
+ num_cols = df_features.select_dtypes(include=['float64', 'int64']).columns
54
+ cat_cols = df_features.select_dtypes(include=['object']).columns
55
+
56
+ numeric_transformer = Pipeline(steps=[
57
+ ('imputer', SimpleImputer(strategy='mean')),
58
+ ('scaler', StandardScaler())
59
+ ])
60
+
61
+ categorical_transformer = Pipeline(steps=[
62
+ ('imputer', SimpleImputer(strategy='most_frequent')),
63
+ ('onehot', OneHotEncoder(handle_unknown='ignore'))
64
+ ])
65
+
66
+ preprocessor = ColumnTransformer(
67
+ transformers=[
68
+ ('num', numeric_transformer, num_cols),
69
+ ('cat', categorical_transformer, cat_cols)
70
+ ])
71
+
72
+ automl = AutoML(**automl_settings)
73
+
74
+ pipeline = Pipeline(steps=[('preprocessor', preprocessor),
75
+ ('classifier', automl)])
76
+
77
+ pipeline.fit(df_features, y)
78
 
79
  my_bar.progress(100, text=progress_text)
80
  time.sleep(0.5)
 
136
  with col3:
137
  st.metric(label="Time to train", value=str(round(automl.best_config_train_time, 2))+' sec')
138
 
139
+ perm_importance = permutation_importance(
140
+ pipeline, df_features, y, n_repeats=8
141
+ )
 
 
 
142
 
143
+ df_features_importance = pd.DataFrame({'features name': df_features.columns,
144
+ 'features importance': perm_importance["importances_mean"],
145
+ 'std error': perm_importance["importances_std"]})
146
+
147
+ fig_features = px.bar(df_features_importance, x='features importance', y='features name', error_x='std error')
148
+
149
+ st.divider()
150
+ st.plotly_chart(fig_features, theme="streamlit")
151
 
152
  def download_model(model):
153
  output_model = pickle.dumps(model)