# nocodeml / streamlit_app.py
# singhk28
# first commit
# 81adc8e
# raw
# history blame
# No virus
# 10.4 kB
# Module Imports
import pandas as pd
import numpy as np
import streamlit as st
from pycaret import regression as reg
from pycaret import classification as clf
from sklearn.metrics import mean_absolute_error, max_error, r2_score, mean_squared_error
import matplotlib.pyplot as plt
import streamlit.components.v1 as components
import mpld3
# ---------------------------------------------------------------------------------------------------------------------- #
# Collecting User Input
# The widgets below run top to bottom on every Streamlit rerun and leave four
# module-level values for the rest of the script: target_col, mod_type,
# desired_value and uploaded_file.


def _section_heading(text):
    """Render *text* as a black, 20px <h3> step heading."""
    st.markdown(
        f'<h3 style="color:#000000;font-size:20px;">{text}</h3>',
        unsafe_allow_html=True,
    )


## Preamble
st.markdown(
    '<h1 style="color:#0096FF;font-size:54px;">No Code ML</h1>',
    unsafe_allow_html=True,
)
st.markdown(
    "This tool prepares a machine learning model, using your tabular data, "
    "from scratch. The model is then used to make predictions for various "
    "combinations of the provided data to try to obtain a combination that "
    "achieves the desired target value (if possible)."
)
st.markdown("**To use this tool**, fill out all the requested fields from top to bottom.")
st.markdown("**Note:** If an error is obtained refresh the page and start over.")

## Column Name
_section_heading("1) Provide name of the column you want to predict with model.")
target_col = st.text_input(
    "Enter the exact name of the column with your target variable. "
    "This field is case sensitive. (i.e., capital letters must match.)"
)

## Model Type: Regression or Classifier
_section_heading("2) Select type of model you would like to build")
mod_type = st.selectbox(
    "What type of model would you like to train? "
    "Pick regression model for continous values and classifier for categorical values.",
    ('regression', 'classifier'),
)

## Desired Target Value
_section_heading("3) What is the desired value?")
if mod_type != 'regression':
    # Classifier targets are labels, so they are collected as free text.
    desired_value = st.text_input(
        "Enter the desired target parameter value. "
        "This field is case sensitive. (i.e., capital letters must match.)",
        key="DV for Classifier",
    )
else:
    # Regression targets are numeric.
    desired_value = float(
        st.number_input("Enter the desired value for the target variable.")
    )

## Ask for Dataset
_section_heading("4) Upload CSV file ")
uploaded_file = st.file_uploader("Upload a CSV file", type="csv")
# ---------------------------------------------------------------------------------------------------------------------- #
# Main pipeline: runs once a CSV has been uploaded. It previews the data,
# trains/tunes a PyCaret regression model on it, evaluates the model on a
# hold-out split (when the dataset has more than 20 rows), and finally samples
# random feature combinations to find ones whose predicted target is close to
# `desired_value`.
if uploaded_file:
    # Read CSV File and Provide Preview of Data and Statistical Summary:
    data = pd.read_csv(uploaded_file)
    if target_col not in data.columns:
        st.error("ERROR: Provided name of the target column is not in the CSV file. Please make sure you provide the exact match (case sensitive).Please provide the correct label and try again.")
        # st.stop() is Streamlit's supported way to end the current script run;
        # the builtin exit() is meant for the interactive interpreter.
        st.stop()
    st.subheader("Data preview:")
    st.write(data.head())
    st.subheader("Statistical Summary of the Provided Data:")
    st.write(data.describe())

    # Prepare Train/Test Split:
    train_frac = 0.8
    train_data = data.sample(frac=train_frac, random_state=0)
    test_data = data.drop(train_data.index)
    # Datasets of 20 rows or fewer are used in full for training and get no
    # hold-out evaluation.
    has_holdout = len(data) > 20
    # ------------------------------------------------------------------------------------------------------------------ #
    # Figure out Column Data Types
    object_columns = data.select_dtypes(include="object").columns.tolist()

    # Build Regression Model
    if mod_type == "regression":
        # Setup Regressor Problem
        setup_kwargs = {
            "target": target_col,
            "log_experiment": True,
            "normalize": True,
            "silent": True,
            "experiment_name": 'No_code_ML',
        }
        if object_columns:
            # NOTE(review): fold=20 was only ever passed when string columns
            # are present; kept as-is — confirm whether that is intentional.
            setup_kwargs["categorical_features"] = object_columns
            setup_kwargs["fold"] = 20
        reg.setup(train_data if has_holdout else data, **setup_kwargs)

        # Find the best algorithm to build Model:
        st.subheader("Algorithm Selection")
        with st.spinner(text="Finding the best algorithm for your dataset..."):
            best_mod = reg.compare_models()
            regression_results = reg.pull()
        # First row of the pulled results grid; .iloc makes the positional
        # lookup explicit instead of relying on label/position fallback.
        best_mod_name = regression_results.Model.iloc[0]
        st.write(regression_results)

        # Tune the hyperparameters for the best algorithm:
        st.subheader("Tuning the Model")
        with st.spinner(text="Tuning the algorithm..."):
            tuned_mod = reg.tune_model(best_mod, optimize='RMSE', n_iter=25)

        # Finalize the model (Train on the entire train dataset):
        with st.spinner("Finalizing the model..."):
            final_mod = reg.finalize_model(tuned_mod)
        st.success('Model successfully trained! Here are your results:')
        st.write('Best algorithm: ', best_mod_name)
        st.write('Best hyperparameters: ', final_mod.get_params())

        # Print a SHAP Analysis Summary Plot:
        # NOTE(review): interpret_model may not support every estimator that
        # compare_models can select — confirm and guard if needed.
        st.subheader("SHAP Analysis Summary Plot")
        st.pyplot(reg.interpret_model(final_mod))

        if has_holdout:
            # Predict on the test set if it was created:
            st.subheader("Evaluating model on the test/hold out data:")
            predictions = reg.predict_model(final_mod, data=test_data)
            st.success('Here are your results:')
            st.write(predictions)
            st.caption('"Label" is the value predicted by the model.')

            # Accuracy of predictions:
            y_true = predictions[target_col]
            y_pred = predictions['Label']
            err_dict = {
                'Mean Absolute Error': mean_absolute_error(y_true, y_pred),
                # sqrt(MSE) instead of squared=False, which newer
                # scikit-learn releases no longer accept.
                'Root Mean Squared Error': np.sqrt(mean_squared_error(y_true, y_pred)),
                'Maximum Error': max_error(y_true, y_pred),
                'R2 Score': r2_score(y_true, y_pred),
            }
            st.write(pd.DataFrame(err_dict, index=[0]))

            # Create a true vs. predicted plot
            fig = plt.figure(figsize=(8, 8))
            # Same toggle call as the former grid(b=None), without the `b`
            # keyword that newer Matplotlib renamed to `visible`.
            plt.grid()
            plt.scatter(x=y_true, y=y_pred)
            plt.xlabel("True Value", fontsize=18)
            plt.ylabel("Predicted Value", fontsize=18)
            components.html(mpld3.fig_to_html(fig), height=1000)

        # -------------------------------------------------------------------------------------------------------------- #
        # Use Trained Model to Explore Parameter Space
        st.subheader("Using the Trained Model to Optimize Target Variable:")
        if object_columns:
            st.write("Optimization with string data types not currently supported.")
        else:
            with st.spinner("Generating Parameter Combinations for Desired Value of the Target Variable"):
                # Every column except the target is a feature, wherever the
                # target sits in the CSV.
                feature_cols = [col for col in data.columns if col != target_col]

                # Accept predictions within +/- one fifth of the target's
                # standard deviation around the desired value.
                data_spread = data[target_col].std() / 5
                dv_min = desired_value - data_spread
                dv_max = desired_value + data_spread

                # Sample each feature independently within its observed range.
                n_samples = 10000
                generated = {}
                for col in feature_cols:
                    low, high = data[col].min(), data[col].max()
                    if pd.api.types.is_bool_dtype(data[col]):
                        generated[col] = np.random.randint(int(low), int(high) + 1, size=n_samples).astype(bool)
                    elif pd.api.types.is_integer_dtype(data[col]):
                        # high + 1 makes the observed maximum reachable and
                        # keeps constant columns (low == high) from raising.
                        generated[col] = np.random.randint(low, high + 1, size=n_samples)
                    else:
                        generated[col] = np.random.uniform(low, high, size=n_samples)
                generated_data_df = pd.DataFrame(generated, columns=feature_cols)

                # Make Predictions with Trained Model & keep the Top 10
                # Results Based on Distance from Desired Value
                generated_predictions = reg.predict_model(final_mod, data=generated_data_df)
                generated_predictions['distance_to_dv'] = (
                    generated_predictions['Label'] - desired_value
                ).abs()
                within_range = generated_predictions['Label'].between(dv_min, dv_max)
                final_proposed_parameters = (
                    generated_predictions[within_range]
                    .sort_values('distance_to_dv')
                    .reset_index(drop=True)
                    .head(10)
                )

            if final_proposed_parameters.empty:
                st.write("No parameters could be found for the desired value based on current model. Try collecting additional data or provide a different target value.")
            else:
                st.write(final_proposed_parameters)
                st.download_button(
                    label="Download the Proposed Parameters to Try",
                    data=final_proposed_parameters.to_csv(index=False),
                    file_name='Final_proposed_parameters.csv',
                )
    else:
        # Only the regression workflow is implemented; say so instead of
        # silently doing nothing after the data preview.
        st.warning("Classifier models are not currently supported.")
# ---------------------------------------------------------------------------------------------------------------------- #
# Build Classifier Model
# if mod_type == "classifier":
# # Setup Classifier Problem
# s = clf.setup(train_data, target = target_col, log_experiment=True, normalize=True, silent= True, experiment_name = 'QD_ML')
# # Compare Model Performance:
# st.subheader("Algorithm Selection")
# with st.spinner(text="Finding the best algorithm for your model..."):
# best_mod = clf.compare_models()
# regression_results = clf.pull()
# st.balloons()
# st.success('Model successfully trained! Here are your results:')
# st.write(regression_results)
# # Print a SHAP Analysis Summary Plot:
# st.subheader("SHAP Analysis Summary Plot")
# st.pyplot(clf.interpret_model(best_mod))