singhk28
committed on
Commit
·
9d38374
1
Parent(s):
81adc8e
change app name
Browse files- streamlit_app.py +0 -174
streamlit_app.py
DELETED
@@ -1,174 +0,0 @@
|
|
1 |
-
# Module Imports
|
2 |
-
import pandas as pd
|
3 |
-
import numpy as np
|
4 |
-
import streamlit as st
|
5 |
-
from pycaret import regression as reg
|
6 |
-
from pycaret import classification as clf
|
7 |
-
from sklearn.metrics import mean_absolute_error, max_error, r2_score, mean_squared_error
|
8 |
-
import matplotlib.pyplot as plt
|
9 |
-
import streamlit.components.v1 as components
|
10 |
-
import mpld3
|
11 |
-
# ---------------------------------------------------------------------------------------------------------------------- #
|
12 |
-
# Collecting User Input
|
13 |
-
## Preamble
|
14 |
-
st.markdown(f'<h1 style="color:#0096FF;font-size:54px;">{"No Code ML"}</h1>', unsafe_allow_html=True)
|
15 |
-
st.markdown(f"This tool prepares a machine learning model, using your tabular data, from scratch. The model is then used to make predictions for various combinations of the provided data to try to obtain a combination that achieves the desired target value (if possible).")
|
16 |
-
st.markdown(f"**To use this tool**, fill out all the requested fields from top to bottom.")
|
17 |
-
st.markdown(f"**Note:** If an error is obtained refresh the page and start over.")
|
18 |
-
## Column Name
|
19 |
-
st.markdown(f'<h3 style="color:#000000;font-size:20px;">{"1) Provide name of the column you want to predict with model."}</h3>', unsafe_allow_html=True)
|
20 |
-
target_col = st.text_input("Enter the exact name of the column with your target variable. This field is case sensitive. (i.e., capital letters must match.)")
|
21 |
-
## Model Type: Regression or Classifier
|
22 |
-
st.markdown(f'<h3 style="color:#000000;font-size:20px;">{"2) Select type of model you would like to build"}</h3>', unsafe_allow_html=True)
|
23 |
-
mod_type = st.selectbox("What type of model would you like to train? Pick regression model for continous values and classifier for categorical values.", ('regression', 'classifier'))
|
24 |
-
## Desired Target Value
|
25 |
-
st.markdown(f'<h3 style="color:#000000;font-size:20px;">{"3) What is the desired value?"}</h3>', unsafe_allow_html=True)
|
26 |
-
if mod_type == 'regression':
|
27 |
-
desired_value = float(st.number_input("Enter the desired value for the target variable."))
|
28 |
-
else:
|
29 |
-
desired_value = st.text_input("Enter the desired target parameter value. This field is case sensitive. (i.e., capital letters must match.)", key="DV for Classifier")
|
30 |
-
## Ask for Dataset
|
31 |
-
st.markdown(f'<h3 style="color:#000000;font-size:20px;">{"4) Upload CSV file "}</h3>', unsafe_allow_html=True)
|
32 |
-
uploaded_file = st.file_uploader("Upload a CSV file", type="csv")
|
33 |
-
|
34 |
-
# ---------------------------------------------------------------------------------------------------------------------- #
|
35 |
-
if uploaded_file:
|
36 |
-
# Read CSV File and Provide Preview of Data and Statistical Summary:
|
37 |
-
data = pd.read_csv(uploaded_file)
|
38 |
-
|
39 |
-
if target_col not in list(data.columns):
|
40 |
-
st.error("ERROR: Provided name of the target column is not in the CSV file. Please make sure you provide the exact match (case sensitive).Please provide the correct label and try again.")
|
41 |
-
# Halt this script run through Streamlit; the exit() builtin is intended for the interactive shell, not for programs.
st.stop()
|
42 |
-
|
43 |
-
st.subheader("Data preview:")
|
44 |
-
st.write(data.head())
|
45 |
-
st.subheader("Statistical Summary of the Provided Data:")
|
46 |
-
st.write(data.describe())
|
47 |
-
|
48 |
-
# Prepare Train/Test Split:
|
49 |
-
train_frac = 0.8
|
50 |
-
test_frac = 1 - train_frac
|
51 |
-
train_data = data.sample(frac=train_frac, random_state=0)
|
52 |
-
test_data = data.drop(train_data.index)
|
53 |
-
|
54 |
-
# ---------------------------------------------------------------------------------------------------------------------- #
|
55 |
-
# Figure out Column Data Types
|
56 |
-
object_columns = data.select_dtypes(include="object").columns.tolist()
|
57 |
-
# Build Regression Model
|
58 |
-
if mod_type == "regression":
|
59 |
-
# Setup Regressor Problem
|
60 |
-
if object_columns:
|
61 |
-
if len(data) > 20:
|
62 |
-
s = reg.setup(train_data, target = target_col, log_experiment=True, normalize=True, categorical_features=object_columns, fold=20, silent= True, experiment_name = 'No_code_ML')
|
63 |
-
else:
|
64 |
-
s = reg.setup(data, target = target_col, log_experiment=True, normalize=True, categorical_features=object_columns, fold=20, silent= True, experiment_name = 'No_code_ML')
|
65 |
-
else:
|
66 |
-
if len(data) > 20:
|
67 |
-
s = reg.setup(train_data, target = target_col, log_experiment=True, normalize=True, silent= True, experiment_name = 'No_code_ML')
|
68 |
-
else:
|
69 |
-
s = reg.setup(data, target = target_col, log_experiment=True, normalize=True, silent= True, experiment_name = 'No_code_ML')
|
70 |
-
|
71 |
-
# Find the best algorithm to build Model:
|
72 |
-
st.subheader("Algorithm Selection")
|
73 |
-
with st.spinner(text="Finding the best algorithm for your dataset..."):
|
74 |
-
best_mod = reg.compare_models()
|
75 |
-
regression_results = reg.pull()
|
76 |
-
# Take the first row by position so the lookup does not depend on the index labels of the pulled results table.
best_mod_name = regression_results.Model.iloc[0]
|
77 |
-
st.write(regression_results)
|
78 |
-
|
79 |
-
# Tune the hyperparameters for the best algorithm:
|
80 |
-
st.subheader("Tuning the Model")
|
81 |
-
with st.spinner(text="Tuning the algorithm..."):
|
82 |
-
tuned_mod = reg.tune_model(best_mod, optimize = 'RMSE', n_iter=25)
|
83 |
-
|
84 |
-
# Finalize the model (Train on the entire train dataset):
|
85 |
-
with st.spinner("Finalizing the model..."):
|
86 |
-
final_mod = reg.finalize_model(tuned_mod)
|
87 |
-
|
88 |
-
st.success('Model successfully trained! Here are your results:')
|
89 |
-
st.write('Best algorithm: ', best_mod_name)
|
90 |
-
st.write('Best hyperparameters: ', final_mod.get_params())
|
91 |
-
|
92 |
-
# Print a SHAP Analysis Summary Plot:
|
93 |
-
st.subheader("SHAP Analysis Summary Plot")
|
94 |
-
st.pyplot(reg.interpret_model(final_mod))
|
95 |
-
|
96 |
-
if len(data) > 20:
|
97 |
-
# Predict on the test set if it was created:
|
98 |
-
st.subheader("Evaluating model on the test/hold out data:")
|
99 |
-
predictions = reg.predict_model(final_mod, data=test_data)
|
100 |
-
st.success('Here are your results:')
|
101 |
-
st.write(predictions)
|
102 |
-
st.caption('"Label" is the value predicted by the model.')
|
103 |
-
|
104 |
-
# Accuracy of predictions:
|
105 |
-
MAE_val = mean_absolute_error(predictions[target_col], predictions['Label'])
|
106 |
-
RMSE_err = mean_squared_error(predictions[target_col], predictions['Label'], squared=False)
|
107 |
-
Max_err = max_error(predictions[target_col], predictions['Label'])
|
108 |
-
r2_val = r2_score(predictions[target_col], predictions['Label'])
|
109 |
-
# Report R2 alongside the error metrics; r2_val is computed just above but was previously never shown.
err_dict = {'Mean Absolute Error': MAE_val, 'Root Mean Squared Error': RMSE_err, 'Maximum Error': Max_err, 'R2 Score': r2_val}
|
110 |
-
df_err = pd.DataFrame(err_dict, index=[0])
|
111 |
-
st.write(df_err)
|
112 |
-
|
113 |
-
# Create an true vs. predicted plot
|
114 |
-
fig = plt.figure(figsize=(8,8))
|
115 |
-
plt.grid(b=None)
|
116 |
-
plt.scatter(x=predictions[target_col], y=predictions['Label'])
|
117 |
-
plt.xlabel("True Value", fontsize=18)
|
118 |
-
plt.ylabel("Predicted Value", fontsize=18)
|
119 |
-
fig_html = mpld3.fig_to_html(fig)
|
120 |
-
components.html(fig_html, height=1000)
|
121 |
-
|
122 |
-
# ---------------------------------------------------------------------------------------------------------------------- #
|
123 |
-
# Use Trained Model to Explore Parameter Space
|
124 |
-
st.subheader("Using the Trained Model to Optimize Target Variable:")
|
125 |
-
if object_columns:
|
126 |
-
st.write("Optimization with string data types not currently supported.")
|
127 |
-
else:
|
128 |
-
with st.spinner("Generating Parameter Combinations for Desired Value of the Target Variable"):
|
129 |
-
# Creating Variables for Data Generation Used in the Optimization Segment
|
130 |
-
# Feature columns are every column except the user-named target; do not assume the target is the last column of the CSV.
list_of_cols = [col for col in data.columns if col != target_col]
|
131 |
-
# Figuring out Data Distribution of Original Data & Set Upper and Lower Bounds for New Parameters
|
132 |
-
data_spread = data[target_col].std()/5
|
133 |
-
max_list = [data[i].max() for i in list_of_cols]
|
134 |
-
min_list = [data[i].min() for i in list_of_cols]
|
135 |
-
dv_min = desired_value - data_spread
|
136 |
-
dv_max = desired_value + data_spread
|
137 |
-
|
138 |
-
# Generate DF from New Parameters
|
139 |
-
# Draw 10000 random integer candidates per feature column between that column's min and max, then transpose to one row per candidate.
# NOTE(review): np.random.randint samples from [low, high), so the column maximum is never generated and a constant column (min == max) raises ValueError; only integer values are produced even for float columns — confirm this is intended.
generated_data = np.array([np.random.randint(low=min_list[i], high=max_list[i], size=10000) for i in range(0,len(max_list))]).T
|
140 |
-
generated_data_df = pd.DataFrame(generated_data)
|
141 |
-
generated_data_df.columns = list_of_cols
|
142 |
-
|
143 |
-
# Make Predictions with Trained Model & Display Top 10 Results Based on Distance from Desired Value
|
144 |
-
generated_predictions = reg.predict_model(final_mod, data = generated_data_df)
|
145 |
-
# Measure distance on the generated candidates' own predicted labels. The test-set `predictions` frame has a different index (values would misalign) and may not exist for small datasets.
generated_predictions['distance_to_dv'] = np.abs(generated_predictions['Label'] - desired_value)
|
146 |
-
proposed_values_to_try = generated_predictions[(generated_predictions["Label"] >=dv_min) & (generated_predictions["Label"] <=dv_max)]
|
147 |
-
proposed_values_to_try.sort_values('distance_to_dv', inplace=True)
|
148 |
-
proposed_values_to_try.reset_index(drop=True, inplace=True)
|
149 |
-
final_proposed_parameters = proposed_values_to_try[0:10]
|
150 |
-
if len(final_proposed_parameters) == 0:
|
151 |
-
st.write("No parameters could be found for the desired value based on current model. Try collecting additional data or provide a different target value.")
|
152 |
-
else:
|
153 |
-
st.write(final_proposed_parameters)
|
154 |
-
st.download_button(label="Download the Proposed Parameters to Try", data = final_proposed_parameters.to_csv(index=False), file_name='Final_proposed_parameters.csv')
|
155 |
-
|
156 |
-
# ---------------------------------------------------------------------------------------------------------------------- #
|
157 |
-
# Build Classifier Model
|
158 |
-
# if mod_type == "classifier":
|
159 |
-
# # Setup Classifier Problem
|
160 |
-
# s = clf.setup(train_data, target = target_col, log_experiment=True, normalize=True, silent= True, experiment_name = 'QD_ML')
|
161 |
-
|
162 |
-
# # Compare Model Performance:
|
163 |
-
# st.subheader("Algorithm Selection")
|
164 |
-
# with st.spinner(text="Finding the best algorithm for your model..."):
|
165 |
-
# best_mod = clf.compare_models()
|
166 |
-
# regression_results = clf.pull()
|
167 |
-
|
168 |
-
# st.balloons()
|
169 |
-
# st.success('Model successfully trained! Here are your results:')
|
170 |
-
# st.write(regression_results)
|
171 |
-
|
172 |
-
# # Print a SHAP Analysis Summary Plot:
|
173 |
-
# st.subheader("SHAP Analysis Summary Plot")
|
174 |
-
# st.pyplot(clf.interpret_model(best_mod))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|