|
|
|
import pandas as pd |
|
import numpy as np |
|
import streamlit as st |
|
from pycaret import regression as reg |
|
from pycaret import classification as clf |
|
from sklearn.metrics import mean_absolute_error, max_error, r2_score, mean_squared_error |
|
import matplotlib.pyplot as plt |
|
import streamlit.components.v1 as components |
|
import mpld3 |
|
|
|
|
|
|
|
# Helper for the numbered step titles: each one is a black, 20px <h3>
# rendered through st.markdown with raw HTML enabled.
def _section_heading(text):
    st.markdown(
        f'<h3 style="color:#000000;font-size:20px;">{text}</h3>',
        unsafe_allow_html=True,
    )


# Page title, styled inline (blue, 54px).
st.markdown(
    '<h1 style="color:#0096FF;font-size:54px;">No Code ML</h1>',
    unsafe_allow_html=True,
)

# Introductory copy, one markdown paragraph per entry.
for _intro_paragraph in (
    "This tool prepares a machine learning model, using your tabular data, from scratch. The model is then used to make predictions for various combinations of the provided data to try to obtain a combination that achieves the desired target value (if possible).",
    "**To use this tool**, fill out all the requested fields from top to bottom.",
    "**Note:** If an error is obtained refresh the page and start over.",
):
    st.markdown(_intro_paragraph)

# Step 1: name of the column the model should predict.
_section_heading("1) Provide name of the column you want to predict with model.")
target_col = st.text_input(
    "Enter the exact name of the column with your target variable. This field is case sensitive. (i.e., capital letters must match.)"
)

# Step 2: regression vs. classification.
_section_heading("2) Select type of model you would like to build")
mod_type = st.selectbox(
    "What type of model would you like to train? Pick regression model for continous values and classifier for categorical values.",
    ('regression', 'classifier'),
)

# Step 3: the target value to aim for. Numeric input for regression,
# free text for a classifier label.
_section_heading("3) What is the desired value?")
if mod_type != 'regression':
    desired_value = st.text_input(
        "Enter the desired target parameter value. This field is case sensitive. (i.e., capital letters must match.)",
        key="DV for Classifier",
    )
else:
    desired_value = float(
        st.number_input("Enter the desired value for the target variable.")
    )

# Step 4: the CSV file holding the tabular data.
_section_heading("4) Upload CSV file ")
uploaded_file = st.file_uploader("Upload a CSV file", type="csv")
|
|
|
|
|
if uploaded_file:
    # Everything below runs only once a CSV has been uploaded: load it,
    # validate the target column, train/tune a regression model, evaluate it
    # on a hold-out split, and search for feature combinations whose predicted
    # target is close to the desired value.
    data = pd.read_csv(uploaded_file)

    if target_col not in data.columns:
        st.error("ERROR: Provided name of the target column is not in the CSV file. Please make sure you provide the exact match (case sensitive).Please provide the correct label and try again.")
        # st.stop() halts this script run; the builtin exit() is meant for
        # the interactive interpreter, not for ending a Streamlit run.
        st.stop()

    st.subheader("Data preview:")
    st.write(data.head())
    st.subheader("Statistical Summary of the Provided Data:")
    st.write(data.describe())

    # 80/20 split with a fixed seed so reruns produce the same partition.
    train_frac = 0.8
    test_frac = 1 - train_frac
    train_data = data.sample(frac=train_frac, random_state=0)
    test_data = data.drop(train_data.index)

    # Columns read as strings; passed to pycaret as categorical features.
    object_columns = data.select_dtypes(include="object").columns.tolist()

    if mod_type == "regression":
        # Datasets with more than 20 rows keep the hold-out split above;
        # smaller ones are trained on every row and skip the evaluation step.
        has_holdout = len(data) > 20
        setup_data = train_data if has_holdout else data

        setup_kwargs = dict(
            target=target_col,
            log_experiment=True,
            normalize=True,
            silent=True,
            experiment_name='No_code_ML',
        )
        if object_columns:
            # As in the original branches, fold=20 is only requested when
            # categorical features are present.
            setup_kwargs.update(categorical_features=object_columns, fold=20)
        s = reg.setup(setup_data, **setup_kwargs)

        st.subheader("Algorithm Selection")
        with st.spinner(text="Finding the best algorithm for your dataset..."):
            best_mod = reg.compare_models()
            regression_results = reg.pull()
            # Positional access: the first row of the comparison table.
            best_mod_name = regression_results.Model.iloc[0]
            st.write(regression_results)

        st.subheader("Tuning the Model")
        with st.spinner(text="Tuning the algorithm..."):
            tuned_mod = reg.tune_model(best_mod, optimize='RMSE', n_iter=25)

        with st.spinner("Finalizing the model..."):
            final_mod = reg.finalize_model(tuned_mod)

        st.success('Model successfully trained! Here are your results:')
        st.write('Best algorithm: ', best_mod_name)
        st.write('Best hyperparameters: ', final_mod.get_params())

        st.subheader("SHAP Analysis Summary Plot")
        st.pyplot(reg.interpret_model(final_mod))

        if has_holdout:
            st.subheader("Evaluating model on the test/hold out data:")
            predictions = reg.predict_model(final_mod, data=test_data)
            st.success('Here are your results:')
            st.write(predictions)
            st.caption('"Label" is the value predicted by the model.')

            # Error metrics of the predicted "Label" against the true target.
            y_true = predictions[target_col]
            y_pred = predictions['Label']
            MAE_val = mean_absolute_error(y_true, y_pred)
            # sqrt(MSE) instead of squared=False, which newer scikit-learn
            # releases no longer accept.
            RMSE_err = np.sqrt(mean_squared_error(y_true, y_pred))
            Max_err = max_error(y_true, y_pred)
            r2_val = r2_score(y_true, y_pred)
            err_dict = {
                'Mean Absolute Error': MAE_val,
                'Root Mean Squared Error': RMSE_err,
                'Maximum Error': Max_err,
                # Previously computed but never displayed.
                'R2 Score': r2_val,
            }
            df_err = pd.DataFrame(err_dict, index=[0])
            st.write(df_err)

            # True vs. predicted scatter, embedded as interactive HTML.
            fig = plt.figure(figsize=(8, 8))
            # Argument-free call keeps the original toggle behaviour; the
            # `b=` keyword was renamed to `visible` in newer Matplotlib.
            plt.grid()
            plt.scatter(x=y_true, y=y_pred)
            plt.xlabel("True Value", fontsize=18)
            plt.ylabel("Predicted Value", fontsize=18)
            fig_html = mpld3.fig_to_html(fig)
            components.html(fig_html, height=1000)

        # NOTE(review): the source's indentation was lost; this section is
        # placed at the regression-branch level because it only needs
        # final_mod, not the hold-out predictions. Confirm against the
        # original layout.
        st.subheader("Using the Trained Model to Optimize Target Variable:")
        if object_columns:
            st.write("Optimization with string data types not currently supported.")
        else:
            with st.spinner("Generating Parameter Combinations for Desired Value of the Target Variable"):
                # Every column except the target is a feature; the target is
                # not assumed to be the last column.
                list_of_cols = [col for col in data.columns if col != target_col]

                # Accept predictions within a fifth of the target's standard
                # deviation on either side of the desired value.
                data_spread = data[target_col].std() / 5
                max_list = [data[col].max() for col in list_of_cols]
                min_list = [data[col].min() for col in list_of_cols]
                dv_min = desired_value - data_spread
                dv_max = desired_value + data_spread

                # Draw random candidates inside each feature's observed range.
                n_candidates = 10000
                generated_columns = {}
                for col, low, high in zip(list_of_cols, min_list, max_list):
                    if pd.api.types.is_integer_dtype(data[col]):
                        # randint's upper bound is exclusive; +1 makes the
                        # observed maximum reachable and keeps constant
                        # columns (low == high) from raising.
                        generated_columns[col] = np.random.randint(
                            low, high + 1, size=n_candidates
                        )
                    elif pd.api.types.is_float_dtype(data[col]):
                        # Continuous features are sampled as floats rather
                        # than being truncated to integers.
                        generated_columns[col] = np.random.uniform(
                            low, high, size=n_candidates
                        )
                    else:
                        # Other dtypes (e.g. bool): resample observed values.
                        generated_columns[col] = (
                            data[col]
                            .dropna()
                            .sample(n=n_candidates, replace=True)
                            .to_numpy()
                        )
                generated_data_df = pd.DataFrame(generated_columns, columns=list_of_cols)

                generated_predictions = reg.predict_model(final_mod, data=generated_data_df)
                # Distance is measured on the generated candidates' own
                # predictions, not on the hold-out predictions.
                generated_predictions['distance_to_dv'] = np.abs(
                    generated_predictions['Label'] - desired_value
                )
                in_range = (
                    (generated_predictions["Label"] >= dv_min)
                    & (generated_predictions["Label"] <= dv_max)
                )
                # Chained, non-mutating sort/reset avoids modifying a slice
                # of generated_predictions in place.
                proposed_values_to_try = (
                    generated_predictions[in_range]
                    .sort_values('distance_to_dv')
                    .reset_index(drop=True)
                )
                # Keep the ten candidates closest to the desired value.
                final_proposed_parameters = proposed_values_to_try[0:10]

            if len(final_proposed_parameters) == 0:
                st.write("No parameters could be found for the desired value based on current model. Try collecting additional data or provide a different target value.")
            else:
                st.write(final_proposed_parameters)
                st.download_button(
                    label="Download the Proposed Parameters to Try",
                    data=final_proposed_parameters.to_csv(index=False),
                    file_name='Final_proposed_parameters.csv',
                )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|