singhk28 committed on
Commit
81adc8e
1 Parent(s): 8f7f947

first commit

Browse files
Files changed (2) hide show
  1. requirements.txt +4 -0
  2. streamlit_app.py +174 -0
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ matplotlib==3.5.3
2
+ mpld3==0.5.9
3
+ pycaret==2.3.10
4
+ streamlit==1.16.0
streamlit_app.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""No Code ML: a Streamlit app that trains a PyCaret model on an uploaded CSV.

Flow of the script (Streamlit re-runs it top to bottom on every interaction):
  1. Collect the target column name, model type, desired target value and CSV.
  2. Preview the data and split off a hold-out set.
  3. For regression: set up PyCaret, compare algorithms, tune and finalize the
     best one, then evaluate it on the hold-out set (when one was used).
  4. Randomly sample feature combinations within the observed ranges and list
     those whose predicted target is closest to the desired value.
"""
# Module Imports
import pandas as pd
import numpy as np
import streamlit as st
from pycaret import regression as reg
from pycaret import classification as clf
from sklearn.metrics import mean_absolute_error, max_error, r2_score, mean_squared_error
import matplotlib.pyplot as plt
import streamlit.components.v1 as components
import mpld3

# ---------------------------------------------------------------------------------------------------------------------- #
# Constants and Helpers
TRAIN_FRAC = 0.8              # Fraction of rows sampled into the training split.
MIN_ROWS_FOR_HOLDOUT = 20     # Datasets with this many rows or fewer are trained on in full.
N_GENERATED_SAMPLES = 10000   # Number of random feature combinations to score.
N_PROPOSALS = 10              # Number of closest combinations shown to the user.


def _section_header(text):
    """Render `text` as one of the numbered step headings of the input form."""
    st.markdown(f'<h3 style="color:#000000;font-size:20px;">{text}</h3>', unsafe_allow_html=True)


def _sample_feature(column, size):
    """Return `size` random values drawn between the min and max of `column`.

    Integer columns yield integers; every other numeric column yields floats,
    so fractional ranges are not truncated to whole numbers.
    """
    low, high = column.min(), column.max()
    if pd.api.types.is_integer_dtype(column):
        # randint's upper bound is exclusive: +1 keeps the observed maximum
        # reachable and keeps the call valid for constant columns (low == high).
        return np.random.randint(low=low, high=high + 1, size=size)
    return np.random.uniform(low=low, high=high, size=size)


# ---------------------------------------------------------------------------------------------------------------------- #
# Collecting User Input
## Preamble
st.markdown('<h1 style="color:#0096FF;font-size:54px;">No Code ML</h1>', unsafe_allow_html=True)
st.markdown("This tool prepares a machine learning model, using your tabular data, from scratch. The model is then used to make predictions for various combinations of the provided data to try to obtain a combination that achieves the desired target value (if possible).")
st.markdown("**To use this tool**, fill out all the requested fields from top to bottom.")
st.markdown("**Note:** If an error is obtained refresh the page and start over.")
## Column Name
_section_header("1) Provide name of the column you want to predict with model.")
target_col = st.text_input("Enter the exact name of the column with your target variable. This field is case sensitive. (i.e., capital letters must match.)")
## Model Type: Regression or Classifier
_section_header("2) Select type of model you would like to build")
mod_type = st.selectbox("What type of model would you like to train? Pick regression model for continous values and classifier for categorical values.", ('regression', 'classifier'))
## Desired Target Value
_section_header("3) What is the desired value?")
if mod_type == 'regression':
    desired_value = float(st.number_input("Enter the desired value for the target variable."))
else:
    desired_value = st.text_input("Enter the desired target parameter value. This field is case sensitive. (i.e., capital letters must match.)", key="DV for Classifier")
## Ask for Dataset
_section_header("4) Upload CSV file ")
uploaded_file = st.file_uploader("Upload a CSV file", type="csv")

# ---------------------------------------------------------------------------------------------------------------------- #
if uploaded_file:
    # Read CSV File and Provide Preview of Data and Statistical Summary:
    data = pd.read_csv(uploaded_file)

    if target_col not in data.columns:
        st.error("ERROR: Provided name of the target column is not in the CSV file. Please make sure you provide the exact match (case sensitive).Please provide the correct label and try again.")
        # st.stop() ends this script run cleanly; exit() would raise SystemExit
        # inside the Streamlit script runner instead.
        st.stop()

    st.subheader("Data preview:")
    st.write(data.head())
    st.subheader("Statistical Summary of the Provided Data:")
    st.write(data.describe())

    # Prepare Train/Test Split:
    train_data = data.sample(frac=TRAIN_FRAC, random_state=0)
    test_data = data.drop(train_data.index)
    # Small datasets are trained on in full, so there is no hold-out to evaluate.
    use_holdout = len(data) > MIN_ROWS_FOR_HOLDOUT

    # ---------------------------------------------------------------------------------------------------------------------- #
    # Figure out Column Data Types
    object_columns = data.select_dtypes(include="object").columns.tolist()
    # Build Regression Model
    if mod_type == "regression":
        # Setup Regressor Problem
        setup_kwargs = dict(
            target=target_col,
            log_experiment=True,
            normalize=True,
            silent=True,
            experiment_name='No_code_ML',
        )
        if object_columns:
            # String columns are declared categorical; this path also uses 20 CV folds.
            setup_kwargs.update(categorical_features=object_columns, fold=20)
        reg.setup(train_data if use_holdout else data, **setup_kwargs)

        # Find the best algorithm to build Model:
        st.subheader("Algorithm Selection")
        with st.spinner(text="Finding the best algorithm for your dataset..."):
            best_mod = reg.compare_models()
            regression_results = reg.pull()
        # The results grid is not indexed 0..n, so take the first row by position.
        best_mod_name = regression_results["Model"].iloc[0]
        st.write(regression_results)

        # Tune the hyperparameters for the best algorithm:
        st.subheader("Tuning the Model")
        with st.spinner(text="Tuning the algorithm..."):
            tuned_mod = reg.tune_model(best_mod, optimize='RMSE', n_iter=25)

        # Finalize the model (Train on the entire train dataset):
        with st.spinner("Finalizing the model..."):
            final_mod = reg.finalize_model(tuned_mod)

        st.success('Model successfully trained! Here are your results:')
        st.write('Best algorithm: ', best_mod_name)
        st.write('Best hyperparameters: ', final_mod.get_params())

        # Print a SHAP Analysis Summary Plot:
        # NOTE(review): interpret_model presumably only supports some model
        # families and may raise for others -- confirm against PyCaret docs.
        st.subheader("SHAP Analysis Summary Plot")
        st.pyplot(reg.interpret_model(final_mod))

        if use_holdout:
            # Predict on the test set if it was created:
            st.subheader("Evaluating model on the test/hold out data:")
            predictions = reg.predict_model(final_mod, data=test_data)
            st.success('Here are your results:')
            st.write(predictions)
            st.caption('"Label" is the value predicted by the model.')

            # Accuracy of predictions:
            y_true = predictions[target_col]
            y_pred = predictions['Label']
            err_dict = {
                'Mean Absolute Error': mean_absolute_error(y_true, y_pred),
                'Root Mean Squared Error': mean_squared_error(y_true, y_pred, squared=False),
                'Maximum Error': max_error(y_true, y_pred),
                'R2 Score': r2_score(y_true, y_pred),
            }
            st.write(pd.DataFrame(err_dict, index=[0]))

            # Create a true vs. predicted plot
            fig = plt.figure(figsize=(8, 8))
            # Passed positionally: the keyword was renamed from `b` to `visible`.
            plt.grid(None)
            plt.scatter(x=y_true, y=y_pred)
            plt.xlabel("True Value", fontsize=18)
            plt.ylabel("Predicted Value", fontsize=18)
            fig_html = mpld3.fig_to_html(fig)
            components.html(fig_html, height=1000)

        # ---------------------------------------------------------------------------------------------------------------------- #
        # Use Trained Model to Explore Parameter Space
        st.subheader("Using the Trained Model to Optimize Target Variable:")
        if object_columns:
            st.write("Optimization with string data types not currently supported.")
        else:
            with st.spinner("Generating Parameter Combinations for Desired Value of the Target Variable"):
                # Every column except the target is a feature, wherever the
                # target sits in the CSV.
                feature_cols = [col for col in data.columns if col != target_col]
                # Accept predictions within a fifth of the target's standard
                # deviation on either side of the desired value.
                data_spread = data[target_col].std() / 5
                dv_min = desired_value - data_spread
                dv_max = desired_value + data_spread

                # Generate DF from New Parameters
                generated_data_df = pd.DataFrame(
                    {col: _sample_feature(data[col], N_GENERATED_SAMPLES) for col in feature_cols}
                )

                # Make Predictions with Trained Model & Display Top 10 Results Based on Distance from Desired Value
                generated_predictions = reg.predict_model(final_mod, data=generated_data_df)
                # Distance must come from the generated rows' own predictions,
                # not from the hold-out predictions.
                generated_predictions['distance_to_dv'] = np.abs(generated_predictions['Label'] - desired_value)
                within_band = generated_predictions['Label'].between(dv_min, dv_max)
                final_proposed_parameters = (
                    generated_predictions[within_band]
                    .sort_values('distance_to_dv')
                    .reset_index(drop=True)
                    .head(N_PROPOSALS)
                )
            if final_proposed_parameters.empty:
                st.write("No parameters could be found for the desired value based on current model. Try collecting additional data or provide a different target value.")
            else:
                st.write(final_proposed_parameters)
                st.download_button(label="Download the Proposed Parameters to Try", data=final_proposed_parameters.to_csv(index=False), file_name='Final_proposed_parameters.csv')

    # ---------------------------------------------------------------------------------------------------------------------- #
    # Build Classifier Model
    else:
        # TODO: classifier training is not implemented yet; tell the user
        # instead of silently rendering nothing. Draft implementation kept below.
        st.warning("Classifier models are not currently supported.")
        # # Setup Classifier Problem
        # s = clf.setup(train_data, target = target_col, log_experiment=True, normalize=True, silent= True, experiment_name = 'QD_ML')

        # # Compare Model Performance:
        # st.subheader("Algorithm Selection")
        # with st.spinner(text="Finding the best algorithm for your model..."):
        #     best_mod = clf.compare_models()
        # regression_results = clf.pull()

        # st.balloons()
        # st.success('Model successfully trained! Here are your results:')
        # st.write(regression_results)

        # # Print a SHAP Analysis Summary Plot:
        # st.subheader("SHAP Analysis Summary Plot")
        # st.pyplot(clf.interpret_model(best_mod))