Spaces:

singhk28
/

nocodeml

Sleeping

App Files Files Community

singhk28 committed on Mar 1, 2023

Commit

d6586a1

•

1 Parent(s): 6c16a82

Add Classifier. Improve error catching for SHAP analysis / feature importance.

Browse files

Files changed (1) hide show

app.py +91 -16

app.py CHANGED Viewed

@@ -4,7 +4,8 @@ import numpy as np
 import streamlit as st
 from pycaret import regression as reg
 from pycaret import classification as clf
-from sklearn.metrics import mean_absolute_error, max_error, r2_score, mean_squared_error
 import matplotlib.pyplot as plt
 import streamlit.components.v1 as components
 import mpld3
@@ -22,32 +23,44 @@ col1, mid, col2 = st.columns([10,1,20])
 with col1:
     st.image('https://images.pexels.com/photos/2599244/pexels-photo-2599244.jpeg?auto=compress&cs=tinysrgb&w=1260&h=750&dpr=1')
 with col2:
-    st.markdown("""This tool prepares a machine learning model, using your tabular data, from scratch. The tool can be used to quickly determine the performance of various algorithms on your dataset and/or be used to make predictions for various combinations of the provided data to try to obtain a combination that achieves the desired target value (if possible). The tool is currently under active development. **Please direct any bug reports or inquiries to the <a href="http://cleanenergy.utoronto.ca/">clean energy lab at UofT</a>**""", unsafe_allow_html=True)
 st.markdown("""---""")
 st.markdown(f"**To use this tool**, fill out all the requested fields from top to bottom.")
 st.markdown(f"**Note:** If an error is obtained refresh the page and start over.")
 ## Column Name
 st.markdown(f'<h3 style="color:#000000;font-size:20px;">{"1) Provide name of the column you want to predict with model."}</h3>', unsafe_allow_html=True)
 target_col = st.text_input("Enter the exact name of the column with your target variable. This field is case sensitive. (i.e., capital letters must match.)")
-## Model Type: Regression or Classifier
 st.markdown(f'<h3 style="color:#000000;font-size:20px;">{"2) Select type of model you would like to build"}</h3>', unsafe_allow_html=True)
-mod_type = st.selectbox("What type of model would you like to train? Pick regression model for continous values and classifier for categorical values.", ('regression', 'classifier'))
 ## Mode of Use
 st.markdown(f'<h3 style="color:#000000;font-size:20px;">{"3) Select mode of use"}</h3>', unsafe_allow_html=True)
 mode_type = st.selectbox("What would you like to use the tool for?", ('Benchmarking (finding the best algorithm for your problem)', 'Parameter Search (find combination of parameters to get a desired value)'))
 if mode_type == 'Parameter Search (find combination of parameters to get a desired value)':
-    ## Desired Target Value
-    st.markdown(f'<h3 style="color:#000000;font-size:20px;">{"4) Type of parameter search"}</h3>', unsafe_allow_html=True)
-    opt_type = st.selectbox("What do you want to do with the output?", ('Maximize it', 'Minimize it', 'Obtain a desired value'))
     if mod_type == 'regression':
-        if opt_type == 'Obtain a desired value':
-            desired_value = float(st.number_input("Enter the desired value for the target variable."))
     else:
         desired_value = st.text_input("Enter the desired target parameter value. This field is case sensitive. (i.e., capital letters must match.)", key="DV for Classifier")
     ## Ask for Dataset
     st.markdown(f'<h3 style="color:#000000;font-size:20px;">{"5) Upload CSV file "}</h3>', unsafe_allow_html=True)
     uploaded_file = st.file_uploader("Upload a CSV file", type="csv")
 else:
     ## Ask for Dataset
     st.markdown(f'<h3 style="color:#000000;font-size:20px;">{"4) Upload CSV file "}</h3>', unsafe_allow_html=True)
@@ -84,20 +97,23 @@ if uploaded_file:
     # Figure out Column Data Types
     object_columns = data.select_dtypes(include="object").columns.tolist()
 # ----------------------------------------------------------------------------------------------------------------------  #
     # Build Regression Model
     if mod_type == "regression":
         # Setup Regressor Problem
         if object_columns:
             if data_size > 20:
-                s = reg.setup(train_data, target = target_col, log_experiment=True, normalize=True, categorical_features=object_columns, fold=5, silent= True, experiment_name = 'No_code_ML')
             else:
-                s = reg.setup(data, target = target_col, log_experiment=True, normalize=True, categorical_features=object_columns, silent= True, experiment_name = 'No_code_ML')
         else:
             if data_size > 20:
-                s = reg.setup(train_data, target = target_col, log_experiment=True, normalize=True, silent= True, fold=5, experiment_name = 'No_code_ML')
             else:
-                s = reg.setup(data, target = target_col, log_experiment=True, normalize=True, silent= True, experiment_name = 'No_code_ML')
         # Find the best algorithm to build Model:
         st.subheader("Algorithm Selection")
@@ -127,8 +143,9 @@ if uploaded_file:
         st.write('Best hyperparameters: ', final_mod.get_params())
         # Print a SHAP Analysis Summary Plot:
-        st.subheader("SHAP Analysis Summary Plot")
-        st.pyplot(reg.interpret_model(final_mod))
         if len(data) > 20:
             # Predict on the test set if it was created:
@@ -231,6 +248,64 @@ if uploaded_file:
 # ----------------------------------------------------------------------------------------------------------------------  #
     # Build Classifier Model
     if mod_type == "classifier":
-        st.write('Classifier is not currently implemented.')
 st.markdown("![visitor badge](https://visitor-badge.glitch.me/badge?page_id=singhk28_nocodeml)")

 import streamlit as st
 from pycaret import regression as reg
 from pycaret import classification as clf
+from sklearn.metrics import mean_absolute_error, max_error, r2_score, mean_squared_error, confusion_matrix, ConfusionMatrixDisplay
+from sklearn.metrics import accuracy_score, auc, recall_score, precision_score, f1_score, cohen_kappa_score
 import matplotlib.pyplot as plt
 import streamlit.components.v1 as components
 import mpld3
 with col1:
     st.image('https://images.pexels.com/photos/2599244/pexels-photo-2599244.jpeg?auto=compress&cs=tinysrgb&w=1260&h=750&dpr=1')
 with col2:
+    st.markdown("""This tool prepares a machine learning model using your tabular data. The tool can be used in 2 ways:""", unsafe_allow_html=True)
+    st.markdown("""1) Benchmark different algorithms for your dataset to find the best algorithm and then tune that model to determine best hyperparameters.""", unsafe_allow_html=True)
+st.markdown("""2) In the case of experimental science, the best obtained model can be used to make predictions for various combinations of the provided data to try to obtain a combination that achieves a desired target value (if possible).""", unsafe_allow_html=True)
+st.markdown("""**The tool is currently under active development. Please direct any bug reports or inquiries to the <a href="http://cleanenergy.utoronto.ca/">clean energy lab at UofT.</a>**""", unsafe_allow_html=True)
 st.markdown("""---""")
 st.markdown(f"**To use this tool**, fill out all the requested fields from top to bottom.")
 st.markdown(f"**Note:** If an error is obtained refresh the page and start over.")
 ## Column Name
 st.markdown(f'<h3 style="color:#000000;font-size:20px;">{"1) Provide name of the column you want to predict with model."}</h3>', unsafe_allow_html=True)
 target_col = st.text_input("Enter the exact name of the column with your target variable. This field is case sensitive. (i.e., capital letters must match.)")
+## Task Type: Regression or Classification
 st.markdown(f'<h3 style="color:#000000;font-size:20px;">{"2) Select type of model you would like to build"}</h3>', unsafe_allow_html=True)
+mod_type = st.selectbox("What type of model would you like to train? Pick regression model for continous values or classifier for categorical values.", ('regression', 'classifier'))
 ## Mode of Use
 st.markdown(f'<h3 style="color:#000000;font-size:20px;">{"3) Select mode of use"}</h3>', unsafe_allow_html=True)
 mode_type = st.selectbox("What would you like to use the tool for?", ('Benchmarking (finding the best algorithm for your problem)', 'Parameter Search (find combination of parameters to get a desired value)'))
 if mode_type == 'Parameter Search (find combination of parameters to get a desired value)':
+     ## Desired Target Value
+    if mod_type == 'classifier':
+        st.write('Parameter search not currently supported with classifier type models.')
+        st.write('Please refresh page and try again with the supported tasks.')
+        exit()
     if mod_type == 'regression':
+        st.markdown(f'<h3 style="color:#000000;font-size:20px;">{"4) Type of parameter search"}</h3>', unsafe_allow_html=True)
+        opt_type = st.selectbox("What do you want to do with the output?", ('Maximize it', 'Minimize it', 'Obtain a desired value'))
+    if opt_type == 'Obtain a desired value':
+        desired_value = float(st.number_input("Enter the desired value for the target variable."))
     else:
         desired_value = st.text_input("Enter the desired target parameter value. This field is case sensitive. (i.e., capital letters must match.)", key="DV for Classifier")
     ## Ask for Dataset
     st.markdown(f'<h3 style="color:#000000;font-size:20px;">{"5) Upload CSV file "}</h3>', unsafe_allow_html=True)
     uploaded_file = st.file_uploader("Upload a CSV file", type="csv")
 else:
     ## Ask for Dataset
     st.markdown(f'<h3 style="color:#000000;font-size:20px;">{"4) Upload CSV file "}</h3>', unsafe_allow_html=True)
     # Figure out Column Data Types
     object_columns = data.select_dtypes(include="object").columns.tolist()
+    # Create a list of Tree Models:
+    tree_mods_list = ['Extra Trees Regressor', 'Extra Trees Classifier', 'Random Forest Regressor', 'Random Forest Classifier', 'Decision Tree Regressor', 'Decision Tree Classifier', 'CatBoost Regressor', 'Light Gradient Boosting Machine']
 # ----------------------------------------------------------------------------------------------------------------------  #
     # Build Regression Model
     if mod_type == "regression":
         # Setup Regressor Problem
         if object_columns:
             if data_size > 20:
+                s = reg.setup(train_data, target = target_col, normalize=True, categorical_features=object_columns, fold=5, silent= True)
             else:
+                s = reg.setup(data, target = target_col, normalize=True, categorical_features=object_columns, silent= True)
         else:
             if data_size > 20:
+                s = reg.setup(train_data, target = target_col, normalize=True, silent= True, fold=5)
             else:
+                s = reg.setup(data, target = target_col, normalize=True, silent= True)
         # Find the best algorithm to build Model:
         st.subheader("Algorithm Selection")
         st.write('Best hyperparameters: ', final_mod.get_params())
         # Print a SHAP Analysis Summary Plot:
+        if best_mod_name in tree_mods_list:
+            st.subheader("SHAP Analysis Summary Plot")
+            st.pyplot(reg.interpret_model(final_mod))
         if len(data) > 20:
             # Predict on the test set if it was created:
 # ----------------------------------------------------------------------------------------------------------------------  #
     # Build Classifier Model
     if mod_type == "classifier":
+        # Setup Classifier Problem
+        if data_size > 20:
+            s = clf.setup(train_data, target = target_col, normalize=True, silent= True, fold=5)
+        else:
+            s = clf.setup(data, target = target_col, normalize=True, silent= True)
+        # Find the best algorithm to build Model:
+        st.subheader("Algorithm Selection")
+        start_algo = time.time()
+        with st.spinner(text="Finding the best algorithm for your dataset..."):
+            best_mod = clf.compare_models()
+            classifier_results = clf.pull()
+            best_mod_name = classifier_results.Model[0]
+            st.write(classifier_results)
+        end_algo = time.time()
+        st.write('Time taken to select algorithm:', end_algo - start_algo, 'seconds')
+        # Tune the hyperparameters for the best algorithm:
+        st.subheader("Tuning the Model")
+        start_tune = time.time()
+        with st.spinner(text="Tuning the algorithm..."):
+            tuned_mod = clf.tune_model(best_mod, optimize = 'AUC', n_iter=5)
+        end_tune = time.time()
+        st.write('Time taken to select hyperparameters:', end_tune - start_tune, 'seconds')
+        # Finalize the model (Train on the entire train dataset):
+        with st.spinner("Finalizing the model..."):
+            final_mod = clf.finalize_model(tuned_mod)
+        st.success('Model successfully trained! Here are your results:')
+        st.write('Best algorithm: ', best_mod_name)
+        st.write('Best hyperparameters: ', final_mod.get_params())
+        # Print a Feature Importance Plot:
+        if best_mod_name in tree_mods_list:
+            st.subheader("Feature Importance Plot")
+            st.pyplot(clf.plot_model(final_mod, plot='feature'))
+        if len(data) > 20:
+            # Predict on the test set if it was created:
+            st.subheader("Evaluating model on the test/hold out data:")
+            predictions = clf.predict_model(final_mod, data=test_data)
+            st.success('Here are your results:')
+            st.write(predictions)
+            st.caption('"Label" is the value predicted by the model.')
+            st.write('---')
+            # Provide Accuracy:
+            mod_accuracy = accuracy_score(predictions[target_col], predictions['Label'])
+            st.write('**Model accuracy on test set :**', f'{(mod_accuracy):.2f}')
+            # Create a confusion matrix:
+            st.subheader("Confusion Matrix for test set:")
+            cm = confusion_matrix(predictions[target_col], predictions['Label'])
+            disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=predictions[target_col].unique())
+            disp.plot()
+            plt.grid(b=None)
+            st.pyplot()
+# Visitor Badge
 st.markdown("![visitor badge](https://visitor-badge.glitch.me/badge?page_id=singhk28_nocodeml)")