JohnAlexander23 committed on
Commit 51f6345
1 Parent(s): 4cc60b0

Upload 4 files

.devcontainer/devcontainer.json ADDED
@@ -0,0 +1,33 @@
+ {
+     "name": "Python 3",
+     // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
+     "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye",
+     "customizations": {
+         "codespaces": {
+             "openFiles": [
+                 "README.md",
+                 "streamlit_app.py"
+             ]
+         },
+         "vscode": {
+             "settings": {},
+             "extensions": [
+                 "ms-python.python",
+                 "ms-python.vscode-pylance"
+             ]
+         }
+     },
+     "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y <packages.txt; [ -f requirements.txt ] && pip3 install --user -r requirements.txt; pip3 install --user streamlit; echo '✅ Packages installed and Requirements met'",
+     "postAttachCommand": {
+         "server": "streamlit run streamlit_app.py --server.enableCORS false --server.enableXsrfProtection false"
+     },
+     "portsAttributes": {
+         "8501": {
+             "label": "Application",
+             "onAutoForward": "openPreview"
+         }
+     },
+     "forwardPorts": [
+         8501
+     ]
+ }
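The container pulls a Python 3.11 image, `updateContentCommand` installs any `packages.txt`/`requirements.txt` dependencies, and `postAttachCommand` starts Streamlit, which `portsAttributes` auto-previews on port 8501. A minimal smoke-check sketch, not part of the commit, assuming Streamlit's `/_stcore/health` endpoint (present in recent releases):

```python
# Hypothetical check: after attaching, confirm the Streamlit server launched
# by postAttachCommand is answering on the forwarded port 8501.
import urllib.request

try:
    with urllib.request.urlopen('http://localhost:8501/_stcore/health', timeout=5) as resp:
        print('health:', resp.status, resp.read().decode())  # expect: health: 200 ok
except OSError as exc:
    print('Streamlit not reachable yet:', exc)
```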
.streamlit/config.toml ADDED
@@ -0,0 +1,6 @@
+ [theme]
+ primaryColor = "#F63366"
+ backgroundColor = "#FFFFFF"
+ secondaryBackgroundColor = "#F0F2F6"
+ textColor = "#262730"
+ font = "sans serif"
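These values override Streamlit's default theme. A small sketch, for verification only, that reads the resolved options back inside a running app via `st.get_option`:

```python
# Confirm the theme from .streamlit/config.toml is picked up;
# st.get_option reads Streamlit's resolved configuration.
import streamlit as st

for key in ('primaryColor', 'backgroundColor', 'secondaryBackgroundColor', 'textColor', 'font'):
    st.write(key, '=', st.get_option(f'theme.{key}'))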
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ streamlit==1.29.0
+ pandas>=1.3.0
+ scikit-learn
+ altair>=4.0
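Only `streamlit` is pinned exactly; the other three float, so the installed versions are worth checking. An illustrative sanity check:

```python
# Print the installed versions of the four dependencies so a mismatch
# with requirements.txt is easy to spot.
from importlib.metadata import version

for pkg in ('streamlit', 'pandas', 'scikit-learn', 'altair'):
    print(pkg, version(pkg))
```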
streamlit_app.py ADDED
@@ -0,0 +1,250 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ from sklearn.model_selection import train_test_split
+ from sklearn.ensemble import RandomForestRegressor
+ from sklearn.metrics import mean_squared_error, r2_score
+ import altair as alt
+ import time
+ import zipfile
+ 
+ # Page title
+ st.set_page_config(page_title='ML Model Building', page_icon='🤖', layout='wide')  # Set layout to wide for better use of space
+ st.title('🤖 ML Model Building')
+ 
+ 
+ with st.expander('About this app'):
+     st.markdown('**What can this app do?**')
+     st.info('This app allows users to build a machine learning (ML) model in an end-to-end workflow, covering data upload, data pre-processing, ML model building and post-model analysis.')
+ 
+     st.markdown('**How to use the app?**')
+     st.warning('To engage with the app, go to the sidebar and 1. Select a data set and 2. Adjust the model parameters using the slider widgets. This initiates the ML model building process, displays the model results, and lets you download the generated data.')
+ 
+     st.markdown('**Under the hood**')
+     st.markdown('Data sets:')
+     st.code('''- Drug solubility data set
+ ''', language='markdown')
+ 
+     st.markdown('Libraries used:')
+     st.code('''- Pandas for data wrangling
+ - Scikit-learn for building a machine learning model
+ - Altair for chart creation
+ - Streamlit for user interface
+ ''', language='markdown')
+ 
+ 
+ # Sidebar for accepting input parameters
+ with st.sidebar:
+     # Load data
+     st.header('1. Input data')
+ 
+     st.markdown('**1.1. Use custom data**')
+     uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
+     if uploaded_file is not None:
+         df = pd.read_csv(uploaded_file, index_col=False)
+ 
+     # Download example data
+     @st.cache_data
+     def convert_df(input_df):
+         return input_df.to_csv(index=False).encode('utf-8')
+     example_csv = pd.read_csv('https://raw.githubusercontent.com/dataprofessor/data/master/delaney_solubility_with_descriptors.csv')
+     csv = convert_df(example_csv)
+     st.download_button(
+         label="Download example CSV",
+         data=csv,
+         file_name='delaney_solubility_with_descriptors.csv',
+         mime='text/csv',
+     )
+ 
+     # Select example data
+     st.markdown('**1.2. Use example data**')
+     example_data = st.toggle('Load example data')
+     if example_data:
+         df = pd.read_csv('https://raw.githubusercontent.com/dataprofessor/data/master/delaney_solubility_with_descriptors.csv')
+ 
+     st.header('2. Set Parameters')
+     parameter_split_size = st.slider('Data split ratio (% for Training Set)', 10, 90, 80, 5)
+ 
+     st.subheader('2.1. Learning Parameters')
+     with st.expander('See parameters'):
+         # Minimum is 100 (not 0): RandomForestRegressor requires n_estimators >= 1
+         parameter_n_estimators = st.slider('Number of estimators (n_estimators)', 100, 1000, 100, 100)
+         parameter_max_features = st.select_slider('Max features (max_features)', options=['all', 'sqrt', 'log2'])
+         parameter_min_samples_split = st.slider('Minimum number of samples required to split an internal node (min_samples_split)', 2, 10, 2, 1)
+         parameter_min_samples_leaf = st.slider('Minimum number of samples required to be at a leaf node (min_samples_leaf)', 1, 10, 2, 1)
+ 
+     st.subheader('2.2. General Parameters')
+     with st.expander('See parameters', expanded=False):
+         parameter_random_state = st.slider('Seed number (random_state)', 0, 1000, 42, 1)
+         parameter_criterion = st.select_slider('Performance measure (criterion)', options=['squared_error', 'absolute_error', 'friedman_mse'])
+         parameter_bootstrap = st.select_slider('Bootstrap samples when building trees (bootstrap)', options=[True, False])
+         parameter_oob_score = st.select_slider('Whether to use out-of-bag samples to estimate the R^2 on unseen data (oob_score)', options=[False, True])
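+         # Note: scikit-learn raises a ValueError if oob_score=True is combined
+         # with bootstrap=False; out-of-bag estimates require bootstrapped trees.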
+ 
+     sleep_time = st.slider('Sleep time', 0, 3, 0)
+ 
+ # Initiate the model building process
+ if uploaded_file or example_data:
+     with st.status("Running ...", expanded=True) as status:
+ 
+         st.write("Loading data ...")
+         time.sleep(sleep_time)
+ 
+         st.write("Preparing data ...")
+         time.sleep(sleep_time)
+         X = df.iloc[:,:-1]
+         y = df.iloc[:,-1]
+ 
+         st.write("Splitting data ...")
+         time.sleep(sleep_time)
+         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(100-parameter_split_size)/100, random_state=parameter_random_state)
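+         # The slider holds the *training* share as a percentage, so an 80%
+         # training split yields test_size = (100 - 80) / 100 = 0.2.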
+ 
+         st.write("Model training ...")
+         time.sleep(sleep_time)
+ 
+         if parameter_max_features == 'all':
+             parameter_max_features = None  # scikit-learn uses None to mean "use all features"
+             parameter_max_features_metric = X.shape[1]
+         else:
+             # Also record the display value for 'sqrt'/'log2', since it is
+             # shown in the Model parameters metric further below
+             parameter_max_features_metric = parameter_max_features
+ 
+         rf = RandomForestRegressor(
+             n_estimators=parameter_n_estimators,
+             max_features=parameter_max_features,
+             min_samples_split=parameter_min_samples_split,
+             min_samples_leaf=parameter_min_samples_leaf,
+             random_state=parameter_random_state,
+             criterion=parameter_criterion,
+             bootstrap=parameter_bootstrap,
+             oob_score=parameter_oob_score)
+         rf.fit(X_train, y_train)
+ 
+         st.write("Applying model to make predictions ...")
+         time.sleep(sleep_time)
+         y_train_pred = rf.predict(X_train)
+         y_test_pred = rf.predict(X_test)
+ 
+         st.write("Evaluating performance metrics ...")
+         time.sleep(sleep_time)
+         train_mse = mean_squared_error(y_train, y_train_pred)
+         train_r2 = r2_score(y_train, y_train_pred)
+         test_mse = mean_squared_error(y_test, y_test_pred)
+         test_r2 = r2_score(y_test, y_test_pred)
+ 
+         st.write("Displaying performance metrics ...")
+         time.sleep(sleep_time)
+         # e.g. 'squared_error' -> 'Squared Error'; this string only labels the
+         # result columns, the reported values are always MSE and R^2
+         parameter_criterion_string = ' '.join([x.capitalize() for x in parameter_criterion.split('_')])
+         #if 'Mse' in parameter_criterion_string:
+         #    parameter_criterion_string = parameter_criterion_string.replace('Mse', 'MSE')
+         rf_results = pd.DataFrame(['Random forest', train_mse, train_r2, test_mse, test_r2]).transpose()
+         rf_results.columns = ['Method', f'Training {parameter_criterion_string}', 'Training R2', f'Test {parameter_criterion_string}', 'Test R2']
+         # Convert the metric columns (all but 'Method') to numeric dtype
+         for col in rf_results.columns[1:]:
+             rf_results[col] = pd.to_numeric(rf_results[col])
+         # Round to 3 digits
+         rf_results = rf_results.round(3)
+ 
+         status.update(label="Status", state="complete", expanded=False)
+ 
+     # Display data info
+     st.header('Input data', divider='rainbow')
+     col = st.columns(4)
+     col[0].metric(label="No. of samples", value=X.shape[0], delta="")
+     col[1].metric(label="No. of X variables", value=X.shape[1], delta="")
+     col[2].metric(label="No. of Training samples", value=X_train.shape[0], delta="")
+     col[3].metric(label="No. of Test samples", value=X_test.shape[0], delta="")
+ 
+     with st.expander('Initial dataset', expanded=True):
+         st.dataframe(df, height=210, use_container_width=True)
+     with st.expander('Train split', expanded=False):
+         train_col = st.columns((3,1))
+         with train_col[0]:
+             st.markdown('**X**')
+             st.dataframe(X_train, height=210, hide_index=True, use_container_width=True)
+         with train_col[1]:
+             st.markdown('**y**')
+             st.dataframe(y_train, height=210, hide_index=True, use_container_width=True)
+     with st.expander('Test split', expanded=False):
+         test_col = st.columns((3,1))
+         with test_col[0]:
+             st.markdown('**X**')
+             st.dataframe(X_test, height=210, hide_index=True, use_container_width=True)
+         with test_col[1]:
+             st.markdown('**y**')
+             st.dataframe(y_test, height=210, hide_index=True, use_container_width=True)
+ 
+     # Zip dataset files
+     df.to_csv('dataset.csv', index=False)
+     X_train.to_csv('X_train.csv', index=False)
+     y_train.to_csv('y_train.csv', index=False)
+     X_test.to_csv('X_test.csv', index=False)
+     y_test.to_csv('y_test.csv', index=False)
+ 
+     list_files = ['dataset.csv', 'X_train.csv', 'y_train.csv', 'X_test.csv', 'y_test.csv']
+     with zipfile.ZipFile('dataset.zip', 'w') as zipF:
+         for file in list_files:
+             zipF.write(file, compress_type=zipfile.ZIP_DEFLATED)
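+     # The CSVs and dataset.zip are written to the app's working directory;
+     # ZIP_DEFLATED compresses each file as it is added to the archive.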
+ 
+     with open('dataset.zip', 'rb') as datazip:
+         btn = st.download_button(
+             label='Download ZIP',
+             data=datazip,
+             file_name="dataset.zip",
+             mime="application/octet-stream"
+         )
+ 
+     # Display model parameters
+     st.header('Model parameters', divider='rainbow')
+     parameters_col = st.columns(3)
+     parameters_col[0].metric(label="Data split ratio (% for Training Set)", value=parameter_split_size, delta="")
+     parameters_col[1].metric(label="Number of estimators (n_estimators)", value=parameter_n_estimators, delta="")
+     parameters_col[2].metric(label="Max features (max_features)", value=parameter_max_features_metric, delta="")
+ 
+     # Display feature importance plot
+     importances = rf.feature_importances_
+     feature_names = list(X.columns)
+     forest_importances = pd.Series(importances, index=feature_names)
+     df_importance = forest_importances.reset_index().rename(columns={'index': 'feature', 0: 'value'})
+ 
+     bars = alt.Chart(df_importance).mark_bar(size=40).encode(
+         x='value:Q',
+         y=alt.Y('feature:N', sort='-x')
+     ).properties(height=250)
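+     # sort='-x' orders the bars by descending importance, putting the most
+     # influential feature at the top of the chart.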
+ 
+     performance_col = st.columns((2, 0.2, 3))
+     with performance_col[0]:
+         st.header('Model performance', divider='rainbow')
+         st.dataframe(rf_results.T.reset_index().rename(columns={'index': 'Parameter', 0: 'Value'}))
+     with performance_col[2]:
+         st.header('Feature importance', divider='rainbow')
+         st.altair_chart(bars, theme='streamlit', use_container_width=True)
+ 
+     # Prediction results
+     st.header('Prediction results', divider='rainbow')
+     s_y_train = pd.Series(y_train, name='actual').reset_index(drop=True)
+     s_y_train_pred = pd.Series(y_train_pred, name='predicted').reset_index(drop=True)
+     df_train = pd.DataFrame(data=[s_y_train, s_y_train_pred], index=None).T
+     df_train['class'] = 'train'
+ 
+     s_y_test = pd.Series(y_test, name='actual').reset_index(drop=True)
+     s_y_test_pred = pd.Series(y_test_pred, name='predicted').reset_index(drop=True)
+     df_test = pd.DataFrame(data=[s_y_test, s_y_test_pred], index=None).T
+     df_test['class'] = 'test'
+ 
+     df_prediction = pd.concat([df_train, df_test], axis=0)
+ 
+     prediction_col = st.columns((2, 0.2, 3))
+ 
+     # Display dataframe
+     with prediction_col[0]:
+         st.dataframe(df_prediction, height=320, use_container_width=True)
+ 
+     # Display scatter plot of actual vs predicted values
+     with prediction_col[2]:
+         scatter = alt.Chart(df_prediction).mark_circle(size=60).encode(
+             x='actual',
+             y='predicted',
+             color='class'
+         )
+         st.altair_chart(scatter, theme='streamlit', use_container_width=True)
+ 
+ 
+ # Ask for CSV upload if none is detected
+ else:
+     st.warning('👈 Upload a CSV file or click *"Load example data"* to get started!')
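For reference, the same pipeline can be exercised without the UI. A minimal headless sketch, assuming network access to the example CSV and mirroring the app's default parameters:

```python
# Load the example data, split 80/20, fit the random forest, report test R^2.
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

df = pd.read_csv('https://raw.githubusercontent.com/dataprofessor/data/master/delaney_solubility_with_descriptors.csv')
X, y = df.iloc[:, :-1], df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
print('Test R2:', round(r2_score(y_test, rf.predict(X_test)), 3))
```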