singhk28
committed on
Commit
•
23cce76
1
Parent(s):
3293c6c
Feature update: introduce mode_type and opt_type
Browse files
app.py
CHANGED
@@ -8,15 +8,23 @@ from sklearn.metrics import mean_absolute_error, max_error, r2_score, mean_squar
|
|
8 |
import matplotlib.pyplot as plt
|
9 |
import streamlit.components.v1 as components
|
10 |
import mpld3
|
|
|
11 |
# ---------------------------------------------------------------------------------------------------------------------- #
|
12 |
# Settings:
|
13 |
st.set_option('deprecation.showPyplotGlobalUse', False)
|
14 |
|
15 |
# ---------------------------------------------------------------------------------------------------------------------- #
|
16 |
# Collecting User Input
|
17 |
-
## Preamble
|
|
|
18 |
st.markdown(f'<h1 style="color:#0096FF;font-size:54px;">{"No Code ML"}</h1>', unsafe_allow_html=True)
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
st.markdown(f"**To use this tool**, fill out all the requested fields from top to bottom.")
|
21 |
st.markdown(f"**Note:** If an error is obtained refresh the page and start over.")
|
22 |
## Column Name
|
@@ -25,21 +33,32 @@ target_col = st.text_input("Enter the exact name of the column with your target
|
|
25 |
## Model Type: Regression or Classifier
|
26 |
st.markdown(f'<h3 style="color:#000000;font-size:20px;">{"2) Select type of model you would like to build"}</h3>', unsafe_allow_html=True)
|
27 |
mod_type = st.selectbox("What type of model would you like to train? Pick regression model for continous values and classifier for categorical values.", ('regression', 'classifier'))
|
28 |
-
##
|
29 |
-
st.markdown(f'<h3 style="color:#000000;font-size:20px;">{"3)
|
30 |
-
|
31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
else:
|
33 |
-
|
34 |
-
|
35 |
-
st.
|
36 |
-
uploaded_file = st.file_uploader("Upload a CSV file", type="csv")
|
37 |
|
38 |
# ---------------------------------------------------------------------------------------------------------------------- #
|
39 |
if uploaded_file:
|
40 |
# Read CSV File and Provide Preview of Data and Statistical Summary:
|
41 |
data = pd.read_csv(uploaded_file)
|
42 |
-
|
|
|
43 |
if target_col not in list(data.columns):
|
44 |
st.error("ERROR: Provided name of the target column is not in the CSV file. Please make sure you provide the exact match (case sensitive).Please provide the correct label and try again.")
|
45 |
exit()
|
@@ -50,40 +69,54 @@ if uploaded_file:
|
|
50 |
st.write(data.describe())
|
51 |
|
52 |
# Prepare Train/Test Split:
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
-
# ---------------------------------------------------------------------------------------------------------------------- #
|
59 |
# Figure out Column Data Types
|
60 |
object_columns = data.select_dtypes(include="object").columns.tolist()
|
|
|
|
|
61 |
# Build Regression Model
|
62 |
if mod_type == "regression":
|
63 |
# Setup Regressor Problem
|
64 |
if object_columns:
|
65 |
-
if
|
66 |
-
s = reg.setup(train_data, target = target_col, log_experiment=True, normalize=True, categorical_features=object_columns, fold=
|
67 |
else:
|
68 |
-
s = reg.setup(data, target = target_col, log_experiment=True, normalize=True, categorical_features=object_columns,
|
69 |
else:
|
70 |
-
if
|
71 |
-
s = reg.setup(train_data, target = target_col, log_experiment=True, normalize=True, silent= True, experiment_name = 'No_code_ML')
|
72 |
else:
|
73 |
s = reg.setup(data, target = target_col, log_experiment=True, normalize=True, silent= True, experiment_name = 'No_code_ML')
|
74 |
|
75 |
# Find the best algorithm to build Model:
|
76 |
st.subheader("Algorithm Selection")
|
|
|
77 |
with st.spinner(text="Finding the best algorithm for your dataset..."):
|
78 |
best_mod = reg.compare_models()
|
79 |
regression_results = reg.pull()
|
80 |
best_mod_name = regression_results.Model[0]
|
81 |
st.write(regression_results)
|
|
|
|
|
82 |
|
83 |
# Tune the hyperparameters for the best algorithm:
|
84 |
st.subheader("Tuning the Model")
|
|
|
85 |
with st.spinner(text="Tuning the algorithm..."):
|
86 |
-
tuned_mod = reg.tune_model(best_mod, optimize = 'RMSE', n_iter=
|
|
|
|
|
87 |
|
88 |
# Finalize the model (Train on the entire train dataset):
|
89 |
with st.spinner("Finalizing the model..."):
|
@@ -125,37 +158,86 @@ if uploaded_file:
|
|
125 |
|
126 |
# ---------------------------------------------------------------------------------------------------------------------- #
|
127 |
# Use Trained Model to Explore Parameter Space
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
|
160 |
# ---------------------------------------------------------------------------------------------------------------------- #
|
161 |
# Build Classifier Model
|
|
|
8 |
import matplotlib.pyplot as plt
|
9 |
import streamlit.components.v1 as components
|
10 |
import mpld3
|
11 |
+
import time
|
12 |
# ---------------------------------------------------------------------------------------------------------------------- #
|
13 |
# Settings:
|
14 |
st.set_option('deprecation.showPyplotGlobalUse', False)
|
15 |
|
16 |
# ---------------------------------------------------------------------------------------------------------------------- #
|
17 |
# Collecting User Input
|
18 |
+
## Preamble & Formatting
|
19 |
+
|
20 |
st.markdown(f'<h1 style="color:#0096FF;font-size:54px;">{"No Code ML"}</h1>', unsafe_allow_html=True)
|
21 |
+
col1, mid, col2 = st.columns([10,1,20])
|
22 |
+
with col1:
|
23 |
+
st.image('https://images.pexels.com/photos/2599244/pexels-photo-2599244.jpeg?auto=compress&cs=tinysrgb&w=1260&h=750&dpr=1')
|
24 |
+
with col2:
|
25 |
+
st.markdown("""This tool prepares a machine learning model, using your tabular data, from scratch. The model is then used to make predictions for various combinations of the provided data to try to obtain a combination that achieves the desired target value (if possible). **Please direct any bug reports or inquiries to the <a href="http://cleanenergy.utoronto.ca/">clean energy lab at UofT</a>**""", unsafe_allow_html=True)
|
26 |
+
st.markdown("""---""")
|
27 |
+
|
28 |
st.markdown(f"**To use this tool**, fill out all the requested fields from top to bottom.")
|
29 |
st.markdown(f"**Note:** If an error is obtained refresh the page and start over.")
|
30 |
## Column Name
|
|
|
33 |
## Model Type: Regression or Classifier
|
34 |
st.markdown(f'<h3 style="color:#000000;font-size:20px;">{"2) Select type of model you would like to build"}</h3>', unsafe_allow_html=True)
|
35 |
mod_type = st.selectbox("What type of model would you like to train? Pick regression model for continous values and classifier for categorical values.", ('regression', 'classifier'))
|
36 |
+
## Mode of Use
|
37 |
+
st.markdown(f'<h3 style="color:#000000;font-size:20px;">{"3) Select mode of use"}</h3>', unsafe_allow_html=True)
|
38 |
+
mode_type = st.selectbox("What would you like to use the tool for?", ('Benchmarking (finding the best algorithm for your problem)', 'Parameter Search (find combination of parameters to get a desired value)'))
|
39 |
+
if mode_type == 'Parameter Search (find combination of parameters to get a desired value)':
|
40 |
+
## Desired Target Value
|
41 |
+
st.markdown(f'<h3 style="color:#000000;font-size:20px;">{"4) Type of parameter search"}</h3>', unsafe_allow_html=True)
|
42 |
+
opt_type = st.selectbox("What do you want to do with the output?", ('Maximize it', 'Minimize it', 'Obtain a desired value'))
|
43 |
+
if mod_type == 'regression':
|
44 |
+
if opt_type == 'Move towards a desired value':
|
45 |
+
desired_value = float(st.number_input("Enter the desired value for the target variable."))
|
46 |
+
else:
|
47 |
+
desired_value = st.text_input("Enter the desired target parameter value. This field is case sensitive. (i.e., capital letters must match.)", key="DV for Classifier")
|
48 |
+
## Ask for Dataset
|
49 |
+
st.markdown(f'<h3 style="color:#000000;font-size:20px;">{"5) Upload CSV file "}</h3>', unsafe_allow_html=True)
|
50 |
+
uploaded_file = st.file_uploader("Upload a CSV file", type="csv")
|
51 |
else:
|
52 |
+
## Ask for Dataset
|
53 |
+
st.markdown(f'<h3 style="color:#000000;font-size:20px;">{"4) Upload CSV file "}</h3>', unsafe_allow_html=True)
|
54 |
+
uploaded_file = st.file_uploader("Upload a CSV file", type="csv")
|
|
|
55 |
|
56 |
# ---------------------------------------------------------------------------------------------------------------------- #
|
57 |
if uploaded_file:
|
58 |
# Read CSV File and Provide Preview of Data and Statistical Summary:
|
59 |
data = pd.read_csv(uploaded_file)
|
60 |
+
data_size = len(data)
|
61 |
+
|
62 |
if target_col not in list(data.columns):
|
63 |
st.error("ERROR: Provided name of the target column is not in the CSV file. Please make sure you provide the exact match (case sensitive).Please provide the correct label and try again.")
|
64 |
exit()
|
|
|
69 |
st.write(data.describe())
|
70 |
|
71 |
# Prepare Train/Test Split:
|
72 |
+
fraction_check = 20_000/data_size # Cap Training Dataset to 20k to allow tuning to occur in a timely manner
|
73 |
+
if fraction_check < 0.8:
|
74 |
+
train_frac = fraction_check
|
75 |
+
train_data = data.sample(frac=train_frac, random_state=0)
|
76 |
+
test_data = data.drop(train_data.index)
|
77 |
+
if len(test_data) > 5_000:
|
78 |
+
test_data = test_data[0:5000]
|
79 |
+
else:
|
80 |
+
train_frac = 0.8
|
81 |
+
train_data = data.sample(frac=train_frac, random_state=0)
|
82 |
+
test_data = data.drop(train_data.index)
|
83 |
|
|
|
84 |
# Figure out Column Data Types
|
85 |
object_columns = data.select_dtypes(include="object").columns.tolist()
|
86 |
+
|
87 |
+
# ---------------------------------------------------------------------------------------------------------------------- #
|
88 |
# Build Regression Model
|
89 |
if mod_type == "regression":
|
90 |
# Setup Regressor Problem
|
91 |
if object_columns:
|
92 |
+
if data_size > 20:
|
93 |
+
s = reg.setup(train_data, target = target_col, log_experiment=True, normalize=True, categorical_features=object_columns, fold=5, silent= True, experiment_name = 'No_code_ML')
|
94 |
else:
|
95 |
+
s = reg.setup(data, target = target_col, log_experiment=True, normalize=True, categorical_features=object_columns, silent= True, experiment_name = 'No_code_ML')
|
96 |
else:
|
97 |
+
if data_size > 20:
|
98 |
+
s = reg.setup(train_data, target = target_col, log_experiment=True, normalize=True, silent= True, fold=5, experiment_name = 'No_code_ML')
|
99 |
else:
|
100 |
s = reg.setup(data, target = target_col, log_experiment=True, normalize=True, silent= True, experiment_name = 'No_code_ML')
|
101 |
|
102 |
# Find the best algorithm to build Model:
|
103 |
st.subheader("Algorithm Selection")
|
104 |
+
start_algo = time.time()
|
105 |
with st.spinner(text="Finding the best algorithm for your dataset..."):
|
106 |
best_mod = reg.compare_models()
|
107 |
regression_results = reg.pull()
|
108 |
best_mod_name = regression_results.Model[0]
|
109 |
st.write(regression_results)
|
110 |
+
end_algo = time.time()
|
111 |
+
st.write('Time taken to select algorithm:', end_algo - start_algo, 'seconds')
|
112 |
|
113 |
# Tune the hyperparameters for the best algorithm:
|
114 |
st.subheader("Tuning the Model")
|
115 |
+
start_tune = time.time()
|
116 |
with st.spinner(text="Tuning the algorithm..."):
|
117 |
+
tuned_mod = reg.tune_model(best_mod, optimize = 'RMSE', n_iter=5)
|
118 |
+
end_tune = time.time()
|
119 |
+
st.write('Time taken to select hyperparameters:', end_tune - start_tune, 'seconds')
|
120 |
|
121 |
# Finalize the model (Train on the entire train dataset):
|
122 |
with st.spinner("Finalizing the model..."):
|
|
|
158 |
|
159 |
# ---------------------------------------------------------------------------------------------------------------------- #
|
160 |
# Use Trained Model to Explore Parameter Space
|
161 |
+
if mode_type == 'Parameter Search (find combination of parameters to get a desired value)':
|
162 |
+
if object_columns:
|
163 |
+
st.write("Optimization with string data types not currently supported.")
|
164 |
+
else:
|
165 |
+
with st.spinner("Generating parameter combinations for search"):
|
166 |
+
# Creating Variables for Data Generation Used in the Optimization Segment
|
167 |
+
list_of_cols = list(data.columns[0:-1])
|
168 |
+
|
169 |
+
# Find min and max value for the input features in the training dataset
|
170 |
+
max_list = [data[i].max() for i in list_of_cols]
|
171 |
+
min_list = [data[i].min() for i in list_of_cols]
|
172 |
+
|
173 |
+
# Generate DF from New Parameters
|
174 |
+
generated_data = np.array([np.random.randint(low=min_list[i], high=max_list[i], size=50_000) for i in range(0,len(max_list))]).T
|
175 |
+
generated_data_df = pd.DataFrame(generated_data, columns = list_of_cols)
|
176 |
+
|
177 |
+
# Make Predictions with Trained Model
|
178 |
+
generated_predictions = reg.predict_model(final_mod, data = generated_data_df)
|
179 |
+
|
180 |
+
if opt_type == 'Obtain a desired value':
|
181 |
+
st.subheader("Using the trained model to obtain the desired target value:")
|
182 |
+
|
183 |
+
# Filter results to keep the rows closest to the desired value
|
184 |
+
## Determine +/- window for search
|
185 |
+
data_spread = data[target_col].std()/3
|
186 |
+
dv_min = desired_value - data_spread
|
187 |
+
dv_max = desired_value + data_spread
|
188 |
+
|
189 |
+
## Apply +/- window to build the lower- and upper-bound masks used to filter the DF (generated_predictions)
|
190 |
+
lower_bound = generated_predictions["Label"] >=dv_min
|
191 |
+
upper_bound = generated_predictions["Label"] <=dv_max
|
192 |
+
|
193 |
+
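## Filter DF using upper and lower bounds, then sort by absolute distance to the desired value provided by the user. NOTE(review): the distance column is added to generated_predictions after the filtered frame is created, so the filtered frame may not contain it when sorted - compute the distance before filtering.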
|
194 |
+
proposed_values_to_try = generated_predictions[lower_bound & upper_bound]
|
195 |
+
generated_predictions['distance_to_desired_value'] = np.abs(generated_predictions['Label'] - desired_value)
|
196 |
+
proposed_values_to_try.sort_values('distance_to_desired_value', inplace=True)
|
197 |
+
proposed_values_to_try.reset_index(drop=True, inplace=True)
|
198 |
+
|
199 |
+
## Display top 10 rows
|
200 |
+
final_proposed_parameters = proposed_values_to_try[0:10]
|
201 |
+
|
202 |
+
if len(final_proposed_parameters) == 0:
|
203 |
+
st.write("No parameters could be found for the desired value based on current model. Try collecting additional data or provide a different target value.")
|
204 |
+
else:
|
205 |
+
st.write(final_proposed_parameters)
|
206 |
+
st.download_button(label="Download the Proposed Parameters to Try", data = final_proposed_parameters.to_csv(index=False), file_name='Final_proposed_parameters.csv')
|
207 |
+
|
208 |
+
if opt_type == 'Maximize it':
|
209 |
+
st.subheader("Using the trained model to maximize target value:")
|
210 |
+
generated_preds = generated_predictions.copy()
|
211 |
+
|
212 |
+
# Sort results in descending order based on predicted values
|
213 |
+
generated_preds.sort_values('Label', ascending=False, inplace=True)
|
214 |
+
generated_preds.reset_index(drop=True, inplace=True)
|
215 |
+
|
216 |
+
## Display top 10 rows
|
217 |
+
final_proposed_parameters = generated_preds[0:10]
|
218 |
+
|
219 |
+
if len(final_proposed_parameters) == 0:
|
220 |
+
st.write("No parameters could be found for the desired value based on current model. Try collecting additional data or provide a different target value.")
|
221 |
+
else:
|
222 |
+
st.write(final_proposed_parameters)
|
223 |
+
st.download_button(label="Download the Proposed Parameters to Try", data = final_proposed_parameters.to_csv(index=False), file_name='Final_proposed_parameters.csv')
|
224 |
+
|
225 |
+
if opt_type == 'Minimize it':
|
226 |
+
st.subheader("Using the trained model to minimize target value:")
|
227 |
+
generated_preds = generated_predictions.copy()
|
228 |
+
|
229 |
+
# Sort results in ascending order based on predicted values
|
230 |
+
generated_preds.sort_values('Label', inplace=True)
|
231 |
+
generated_preds.reset_index(drop=True, inplace=True)
|
232 |
+
|
233 |
+
## Display top 10 rows
|
234 |
+
final_proposed_parameters = generated_preds[0:10]
|
235 |
+
|
236 |
+
if len(final_proposed_parameters) == 0:
|
237 |
+
st.write("No parameters could be found for the desired value based on current model. Try collecting additional data or provide a different target value.")
|
238 |
+
else:
|
239 |
+
st.write(final_proposed_parameters)
|
240 |
+
st.download_button(label="Download the Proposed Parameters to Try", data = final_proposed_parameters.to_csv(index=False), file_name='Final_proposed_parameters.csv')
|
241 |
|
242 |
# ---------------------------------------------------------------------------------------------------------------------- #
|
243 |
# Build Classifier Model
|