|
|
|
|
|
|
|
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mpld3
import streamlit as st
import streamlit.components.v1 as components
from pycaret import classification as clf
from pycaret import regression as reg
from sklearn.metrics import (
    mean_absolute_error,
    max_error,
    r2_score,
    mean_squared_error,
    confusion_matrix,
    ConfusionMatrixDisplay,
    accuracy_score,
)
|
|
|
|
|
# Allow st.pyplot() to be called without an explicit figure; the PyCaret plot helpers below rely on this.
st.set_option('deprecation.showPyplotGlobalUse', False)
|
|
|
|
|
|
|
|
|
|
|
st.markdown('<h1 style="color:#0096FF;font-size:54px;">No Code ML</h1>', unsafe_allow_html=True)

col1, mid, col2 = st.columns([10, 1, 20])

with col1:
    st.image('https://images.pexels.com/photos/2599244/pexels-photo-2599244.jpeg?auto=compress&cs=tinysrgb&w=1260&h=750&dpr=1')

with col2:
    st.markdown("""This tool prepares a machine learning model from your tabular data. It can be used in two ways:""", unsafe_allow_html=True)
    st.markdown("""1) Benchmark different algorithms on your dataset to find the best one, then tune that model to determine the best hyperparameters.""", unsafe_allow_html=True)
    st.markdown("""2) For experimental science, use the best model to predict outcomes for many combinations of the provided inputs and search for a combination that achieves a desired target value (if one exists).""", unsafe_allow_html=True)
    st.markdown("""**The tool is currently under active development. Please direct any bug reports or inquiries to the <a href="http://cleanenergy.utoronto.ca/">clean energy lab at UofT.</a>**""", unsafe_allow_html=True)

st.markdown("""---""")
|
|
|
st.markdown("**To use this tool**, fill out all the requested fields from top to bottom.")
st.markdown("**Note:** If an error occurs, refresh the page and start over.")
|
|
|
|
|
st.markdown('<h3 style="color:#000000;font-size:20px;">1) Provide the name of the column you want the model to predict</h3>', unsafe_allow_html=True)
target_col = st.text_input("Enter the exact name of the column containing your target variable. This field is case sensitive (capitalization must match).")
|
|
|
|
|
st.markdown('<h3 style="color:#000000;font-size:20px;">2) Select the type of model you would like to build</h3>', unsafe_allow_html=True)
mod_type = st.selectbox("What type of model would you like to train? Pick a regression model for continuous values or a classifier for categorical values.", ('regression', 'classifier'))
|
|
|
|
|
st.markdown('<h3 style="color:#000000;font-size:20px;">3) Select mode of use</h3>', unsafe_allow_html=True)
mode_type = st.selectbox("What would you like to use the tool for?", ('Benchmarking (finding the best algorithm for your problem)', 'Parameter Search (find combination of parameters to get a desired value)'))

if mode_type == 'Parameter Search (find combination of parameters to get a desired value)':

    if mod_type == 'classifier':
        st.write('Parameter search is not currently supported with classifier type models.')
        st.write('Please refresh the page and try again with a supported task.')
        st.stop()

    if mod_type == 'regression':
        st.markdown('<h3 style="color:#000000;font-size:20px;">4) Type of parameter search</h3>', unsafe_allow_html=True)
        opt_type = st.selectbox("What do you want to do with the output?", ('Maximize it', 'Minimize it', 'Obtain a desired value'))
        if opt_type == 'Obtain a desired value':
            desired_value = float(st.number_input("Enter the desired value for the target variable."))

        st.markdown('<h3 style="color:#000000;font-size:20px;">5) Upload CSV file</h3>', unsafe_allow_html=True)
        uploaded_file = st.file_uploader("Upload a CSV file", type="csv")

else:

    st.markdown('<h3 style="color:#000000;font-size:20px;">4) Upload CSV file</h3>', unsafe_allow_html=True)
    uploaded_file = st.file_uploader("Upload a CSV file", type="csv")
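# Everything below runs only once a CSV file has been uploaded.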
|
|
|
|
|
if uploaded_file:

    data = pd.read_csv(uploaded_file)
    data_size = len(data)

    if target_col not in list(data.columns):
        st.error("ERROR: The provided target column name is not in the CSV file. Please make sure it matches exactly (case sensitive) and try again.")
        st.stop()

    st.subheader("Data preview:")
    st.write(data.head())
    st.subheader("Statistical Summary of the Provided Data:")
    st.write(data.describe())
|
|
|
|
|
    # Hold out part of the data for testing. Training is capped at roughly 10,000 rows; otherwise an 80/20 split is used.
    fraction_check = 10_000 / data_size
    if fraction_check < 0.8:
        train_frac = fraction_check
        train_data = data.sample(frac=train_frac, random_state=0)
        test_data = data.drop(train_data.index)
        if len(test_data) > 5_000:
            test_data = test_data[0:5000]
    else:
        train_frac = 0.8
        train_data = data.sample(frac=train_frac, random_state=0)
        test_data = data.drop(train_data.index)
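    # test_data is never seen during training and is used only for the hold-out evaluation further down.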
|
|
|
|
|
    object_columns = data.select_dtypes(include="object").columns.tolist()

    tree_mods_list = ['Extra Trees Regressor', 'Extra Trees Classifier', 'Random Forest Regressor', 'Random Forest Classifier', 'Decision Tree Regressor', 'Decision Tree Classifier', 'CatBoost Regressor', 'Light Gradient Boosting Machine']
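    # tree_mods_list gates the SHAP / feature-importance plots below, which are only generated for tree-based models.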
|
|
|
|
|
|
|
    if mod_type == "regression":

        # PyCaret setup: normalizes the features and treats string/object columns as categorical features when present.
        if object_columns:
            if data_size > 20:
                s = reg.setup(train_data, target=target_col, normalize=True, categorical_features=object_columns, fold=5, silent=True)
            else:
                s = reg.setup(data, target=target_col, normalize=True, categorical_features=object_columns, silent=True)
        else:
            if data_size > 20:
                s = reg.setup(train_data, target=target_col, normalize=True, silent=True, fold=5)
            else:
                s = reg.setup(data, target=target_col, normalize=True, silent=True)
|
|
|
|
|
        st.subheader("Algorithm Selection")
        start_algo = time.time()
        with st.spinner(text="Finding the best algorithm for your dataset..."):
            best_mod = reg.compare_models()
            regression_results = reg.pull()
            best_mod_name = regression_results.Model[0]
            st.write(regression_results)
        end_algo = time.time()
        st.write('Time taken to select algorithm:', end_algo - start_algo, 'seconds')
|
|
|
|
|
        st.subheader("Tuning the Model")
        start_tune = time.time()
        with st.spinner(text="Tuning the algorithm..."):
            tuned_mod = reg.tune_model(best_mod, optimize='RMSE', n_iter=5)
        end_tune = time.time()
        st.write('Time taken to select hyperparameters:', end_tune - start_tune, 'seconds')
|
|
|
|
|
        with st.spinner("Finalizing the model..."):
            final_mod = reg.finalize_model(tuned_mod)

        st.success('Model successfully trained! Here are your results:')
        st.write('Best algorithm: ', best_mod_name)
        st.write('Best hyperparameters: ', final_mod.get_params())
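        # finalize_model() retrains the tuned model on the full dataset held by the experiment; get_params() reports its hyperparameters.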
|
|
|
|
|
        if best_mod_name in tree_mods_list:
            st.subheader("SHAP Analysis Summary Plot")
            st.pyplot(reg.interpret_model(final_mod))
|
|
|
        if len(data) > 20:

            st.subheader("Evaluating model on the test/hold out data:")
            predictions = reg.predict_model(final_mod, data=test_data)
            st.success('Here are your results:')
            st.write(predictions)
            st.caption('"Label" is the value predicted by the model.')

            # Error metrics on the hold-out set, comparing the true target column against the predicted "Label" column.
            MAE_val = mean_absolute_error(predictions[target_col], predictions['Label'])
            RMSE_err = mean_squared_error(predictions[target_col], predictions['Label'], squared=False)
            Max_err = max_error(predictions[target_col], predictions['Label'])
            r2_val = r2_score(predictions[target_col], predictions['Label'])
            err_dict = {'Mean Absolute Error': MAE_val, 'Root Mean Squared Error': RMSE_err, 'Maximum Error': Max_err, 'R2 Score': r2_val}
            df_err = pd.DataFrame(err_dict, index=[0])
            st.write(df_err)

            # Interactive parity plot (true vs. predicted values) rendered with mpld3.
            fig = plt.figure(figsize=(8, 8))
            plt.grid(b=None)
            plt.scatter(x=predictions[target_col], y=predictions['Label'])
            plt.xlabel("True Value", fontsize=18)
            plt.ylabel("Predicted Value", fontsize=18)
            fig_html = mpld3.fig_to_html(fig)
            components.html(fig_html, height=1000)
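        # Parameter search mode: use the finalized regression model to propose input combinations.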
|
|
|
|
|
|
|
        if mode_type == 'Parameter Search (find combination of parameters to get a desired value)':
            if object_columns:
                st.write("Optimization with string data types is not currently supported.")
            else:
|
                with st.spinner("Generating parameter combinations for search"):

                    # NOTE: assumes the target variable is the last column of the CSV; every other column is treated as an input.
                    list_of_cols = list(data.columns[0:-1])

                    max_list = [data[i].max() for i in list_of_cols]
                    min_list = [data[i].min() for i in list_of_cols]

                    # Draw 50,000 random integer combinations within the observed range of each input column.
                    generated_data = np.array([np.random.randint(low=min_list[i], high=max_list[i], size=50_000) for i in range(0, len(max_list))]).T
                    generated_data_df = pd.DataFrame(generated_data, columns=list_of_cols)

                    generated_predictions = reg.predict_model(final_mod, data=generated_data_df)
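                    # generated_predictions now carries a "Label" column with the model's prediction for each sampled combination.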
|
|
|
                if opt_type == 'Obtain a desired value':
                    st.subheader("Using the trained model to obtain the desired target value:")

                    # Accept predictions within one third of the target column's standard deviation of the desired value.
                    data_spread = data[target_col].std() / 3
                    dv_min = desired_value - data_spread
                    dv_max = desired_value + data_spread

                    lower_bound = generated_predictions["Label"] >= dv_min
                    upper_bound = generated_predictions["Label"] <= dv_max

                    proposed_values_to_try = generated_predictions[lower_bound & upper_bound].copy()
                    proposed_values_to_try['distance_to_desired_value'] = np.abs(proposed_values_to_try['Label'] - desired_value)
                    proposed_values_to_try.sort_values('distance_to_desired_value', inplace=True)
                    proposed_values_to_try.reset_index(drop=True, inplace=True)

                    # Keep the ten candidates whose predictions are closest to the desired value.
                    final_proposed_parameters = proposed_values_to_try[0:10]
|
|
|
                if opt_type == 'Maximize it':
                    st.subheader("Using the trained model to maximize target value:")
                    generated_preds = generated_predictions.copy()

                    generated_preds.sort_values('Label', ascending=False, inplace=True)
                    generated_preds.reset_index(drop=True, inplace=True)

                    final_proposed_parameters = generated_preds[0:10]
|
|
|
                if opt_type == 'Minimize it':
                    st.subheader("Using the trained model to minimize target value:")
                    generated_preds = generated_predictions.copy()

                    generated_preds.sort_values('Label', inplace=True)
                    generated_preds.reset_index(drop=True, inplace=True)

                    final_proposed_parameters = generated_preds[0:10]
|
|
|
                if len(final_proposed_parameters) == 0:
                    st.write("No parameters could be found for the desired value based on the current model. Try collecting additional data or providing a different target value.")
                else:
                    st.write(final_proposed_parameters)
                    st.download_button(label="Download the Proposed Parameters to Try", data=final_proposed_parameters.to_csv(index=False), file_name='Final_proposed_parameters.csv')
|
|
|
|
|
|
|
|
|
    if mod_type == "classifier":

        if data_size > 20:
            s = clf.setup(train_data, target=target_col, normalize=True, silent=True, fold=5)
        else:
            s = clf.setup(data, target=target_col, normalize=True, silent=True)
|
|
|
|
|
        st.subheader("Algorithm Selection")
        start_algo = time.time()
        with st.spinner(text="Finding the best algorithm for your dataset..."):
            best_mod = clf.compare_models()
            classifier_results = clf.pull()
            best_mod_name = classifier_results.Model[0]
            st.write(classifier_results)
        end_algo = time.time()
        st.write('Time taken to select algorithm:', end_algo - start_algo, 'seconds')
|
|
|
|
|
        st.subheader("Tuning the Model")
        start_tune = time.time()
        with st.spinner(text="Tuning the algorithm..."):
            tuned_mod = clf.tune_model(best_mod, optimize='AUC', n_iter=5)
        end_tune = time.time()
        st.write('Time taken to select hyperparameters:', end_tune - start_tune, 'seconds')
|
|
|
|
|
        with st.spinner("Finalizing the model..."):
            final_mod = clf.finalize_model(tuned_mod)

        st.success('Model successfully trained! Here are your results:')
        st.write('Best algorithm: ', best_mod_name)
        st.write('Best hyperparameters: ', final_mod.get_params())
|
|
|
|
|
        if best_mod_name in tree_mods_list:
            st.subheader("Feature Importance Plot")
            st.pyplot(clf.plot_model(final_mod, plot='feature'))
|
|
|
        if len(data) > 20:

            st.subheader("Evaluating model on the test/hold out data:")
            predictions = clf.predict_model(final_mod, data=test_data)
            st.success('Here are your results:')
            st.write(predictions)
            st.caption('"Label" is the value predicted by the model.')
            st.write('---')

            mod_accuracy = accuracy_score(predictions[target_col], predictions['Label'])
            st.write('**Model accuracy on test set:**', f'{mod_accuracy:.2f}')

            st.subheader("Confusion Matrix for test set:")
            # Use a fixed, sorted label order so the displayed labels match the confusion-matrix rows and columns.
            cm_labels = np.unique(np.concatenate([predictions[target_col], predictions['Label']]))
            cm = confusion_matrix(predictions[target_col], predictions['Label'], labels=cm_labels)
            disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cm_labels)
            disp.plot()
            plt.grid(b=None)
            st.pyplot()
|
|
|
|
|
st.markdown("![visitor badge](https://visitor-badge.glitch.me/badge?page_id=singhk28_nocodeml)") |