Fanwang Meng committed on
Commit
2497be4
1 Parent(s): 2caab15

Add the initial draft

Browse files
Files changed (9) hide show
  1. .gitignore +111 -0
  2. .streamlit/config.toml +332 -0
  3. README.md +5 -4
  4. app.py +361 -0
  5. packages.txt +1 -0
  6. requirements.txt +17 -0
  7. sample_input.sdf +387 -0
  8. sample_input_smiles.csv +6 -0
  9. utils.py +155 -0
.gitignore ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Prerequisites
2
+ *.d
3
+
4
+ # Compiled object files
5
+ *.slo
6
+ *.lo
7
+ *.o
8
+ *.obj
9
+
10
+ # Precompiled headers
11
+ *.gch
12
+ *.pch
13
+
14
+ # Compiled dynamic libraries
15
+ *.so
16
+ *.so.[0-9]*
17
+ *.dylib
18
+ *.dll
19
+
20
+ # Fortran module files
21
+ *.mod
22
+ *.smod
23
+
24
+ # Compiled static libraries
25
+ *.lai
26
+ *.la
27
+ *.a
28
+ *.lib
29
+
30
+ # Executables
31
+ *.exe
32
+ *.out
33
+ *.app
34
+
35
+ # Byte-compiled / optimized / DLL files
36
+ __pycache__/
37
+ *.py[cod]
38
+ *$py.class
39
+
40
+ # Distribution / packaging
41
+ .Python
42
+ build/
43
+ develop-eggs/
44
+ dist/
45
+ downloads/
46
+ eggs/
47
+ .eggs/
48
+ lib/
49
+ lib64/
50
+ parts/
51
+ sdist/
52
+ var/
53
+ wheels/
54
+ .installed.cfg
55
+ MANIFEST
56
+ *.egg-info/
57
+ *.egg
58
+ *.manifest
59
+ *.spec
60
+ pip-log.txt
61
+ pip-delete-this-directory.txt
62
+ # Unit test / coverage reports
63
+ htmlcov/
64
+ .tox/
65
+ .coverage
66
+ .coverage.*
67
+ .cache
68
+ nosetests.xml
69
+ coverage.xml
70
+ *,cover
71
+ .pytest_cache/
72
+
73
+ # Documentation
74
+ doc/html/
75
+ doc/latex/
76
+ doc/man/
77
+ doc/xml/
78
+ doc/_build/
79
+ doc/source
80
+ doc/modules
81
+
82
+ # Environments
83
+ .env
84
+ .venv
85
+ env/
86
+ venv/
87
+ ENV/
88
+
89
+ # Editor junk
90
+ tags
91
+ [._]*.s[a-v][a-z]
92
+ [._]*.sw[a-p]
93
+ [._]s[a-v][a-z]
94
+ [._]sw[a-p]
95
+ *~
96
+ \#*\#
97
+ .\#*
98
+ .ropeproject
99
+ .idea/
100
+ .spyderproject
101
+ .spyproject
102
+ .vscode/
103
+ # Mac .DS_Store
104
+ .DS_Store
105
+
106
+ # jupyter notebook checkpoints
107
+ .ipynb_checkpoints
108
+
109
+ # version file generated by rob
110
+ B3clf/_version.py
111
+
.streamlit/config.toml ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ [global]
3
+
4
+ # By default, Streamlit checks if the Python watchdog module is available
5
+ # and, if not, prints a warning asking for you to install it. The watchdog
6
+ # module is not required, but highly recommended. It improves Streamlit's
7
+ # ability to detect changes to files in your filesystem.
8
+
9
+ # If you'd like to turn off this warning, set this to True.
10
+
11
+ # Default: false
12
+ # disableWatchdogWarning = false
13
+
14
+ # By default, Streamlit displays a warning when a user sets both a widget
15
+ # default value in the function defining the widget and a widget value via
16
+ # the widget's key in `st.session_state`.
17
+
18
+ # If you'd like to turn off this warning, set this to True.
19
+
20
+ # Default: false
21
+ # disableWidgetStateDuplicationWarning = false
22
+
23
+ # If True, will show a warning when you run a Streamlit-enabled script
24
+ # via "python my_script.py".
25
+
26
+ # Default: true
27
+ # showWarningOnDirectExecution = true
28
+
29
+ # DataFrame serialization.
30
+
31
+ # Acceptable values:
32
+ # - 'legacy': Serialize DataFrames using Streamlit's custom format. Slow
33
+ # but battle-tested.
34
+ # - 'arrow': Serialize DataFrames using Apache Arrow. Much faster and versatile.
35
+
36
+ # Default: "arrow"
37
+ dataFrameSerialization = "arrow"
38
+
39
+
40
+ [logger]
41
+
42
+ # Level of logging: 'error', 'warning', 'info', or 'debug'.
43
+
44
+ # Default: 'info'
45
+ # level = "info"
46
+
47
+ # String format for logging messages. If logger.datetimeFormat is set,
48
+ # logger messages will default to `%(asctime)s.%(msecs)03d %(message)s`. See
49
+ # [Python's documentation](https://docs.python.org/2.6/library/logging.html#formatter-objects)
50
+ # for available attributes.
51
+
52
+ # Default: "%(asctime)s %(message)s"
53
+ # messageFormat = "%(asctime)s %(message)s"
54
+
55
+
56
+ [client]
57
+
58
+ # Whether to enable st.cache. This does not affect st.cache_data or
59
+ # st.cache_resource.
60
+
61
+ # Default: true
62
+ caching = true
63
+
64
+ # If false, makes your Streamlit script not draw to a
65
+ # Streamlit app.
66
+
67
+ # Default: true
68
+ # displayEnabled = true
69
+
70
+ # Controls whether uncaught app exceptions and deprecation warnings
71
+ # are displayed in the browser. By default, this is set to True and
72
+ # Streamlit displays app exceptions and associated tracebacks, and
73
+ # deprecation warnings, in the browser.
74
+
75
+ # If set to False, deprecation warnings and full exception messages
76
+ # will print to the console only. Exceptions will still display in the
77
+ # browser with a generic error message. For now, the exception type and
78
+ # traceback show in the browser also, but they will be removed in the
79
+ # future.
80
+
81
+ # Default: true
82
+ # showErrorDetails = true
83
+
84
+ # Change the visibility of items in the toolbar, options menu,
85
+ # and settings dialog (top right of the app).
86
+
87
+ # Allowed values:
88
+ # * "auto" : Show the developer options if the app is accessed through
89
+ # localhost or through Streamlit Community Cloud as a developer.
90
+ # Hide them otherwise.
91
+ # * "developer" : Show the developer options.
92
+ # * "viewer" : Hide the developer options.
93
+ # * "minimal" : Show only options set externally (e.g. through
94
+ # Streamlit Community Cloud) or through st.set_page_config.
95
+ # If there are no options left, hide the menu.
96
+
97
+ # Default: "auto"
98
+ # toolbarMode = "auto"
99
+
100
+
101
+ [runner]
102
+
103
+ # Allows you to type a variable or string by itself in a single line of
104
+ # Python code to write it to the app.
105
+
106
+ # Default: true
107
+ # magicEnabled = true
108
+
109
+ # Install a Python tracer to allow you to stop or pause your script at
110
+ # any point and introspect it. As a side-effect, this slows down your
111
+ # script's execution.
112
+
113
+ # Default: false
114
+ # installTracer = false
115
+
116
+ # Sets the MPLBACKEND environment variable to Agg inside Streamlit to
117
+ # prevent Python crashing.
118
+
119
+ # Default: true
120
+ # fixMatplotlib = true
121
+
122
+ # Run the Python Garbage Collector after each script execution. This
123
+ # can help avoid excess memory use in Streamlit apps, but could
124
+ # introduce delay in rerunning the app script for high-memory-use
125
+ # applications.
126
+
127
+ # Default: true
128
+ # postScriptGC = true
129
+
130
+ # Handle script rerun requests immediately, rather than waiting for script
131
+ # execution to reach a yield point. This makes Streamlit much more
132
+ # responsive to user interaction, but it can lead to race conditions in
133
+ # apps that mutate session_state data outside of explicit session_state
134
+ # assignment statements.
135
+
136
+ # Default: true
137
+ # fastReruns = true
138
+
139
+ # Raise an exception after adding unserializable data to Session State.
140
+ # Some execution environments may require serializing all data in Session
141
+ # State, so it may be useful to detect incompatibility during development,
142
+ # or when the execution environment will stop supporting it in the future.
143
+
144
+ # Default: false
145
+ # enforceSerializableSessionState = false
146
+
147
+
148
+ [server]
149
+
150
+ # List of folders that should not be watched for changes. This
151
+ # impacts both "Run on Save" and @st.cache.
152
+
153
+ # Relative paths will be taken as relative to the current working directory.
154
+
155
+ # Example: ['/home/user1/env', 'relative/path/to/folder']
156
+
157
+ # Default: []
158
+ # folderWatchBlacklist = []
159
+
160
+ # Change the type of file watcher used by Streamlit, or turn it off
161
+ # completely.
162
+
163
+ # Allowed values:
164
+ # * "auto" : Streamlit will attempt to use the watchdog module, and
165
+ # falls back to polling if watchdog is not available.
166
+ # * "watchdog" : Force Streamlit to use the watchdog module.
167
+ # * "poll" : Force Streamlit to always use polling.
168
+ # * "none" : Streamlit will not watch files.
169
+
170
+ # Default: "auto"
171
+ # fileWatcherType = "auto"
172
+
173
+ # Symmetric key used to produce signed cookies. If deploying on multiple replicas, this should
174
+ # be set to the same value across all replicas to ensure they all share the same secret.
175
+
176
+ # Default: randomly generated secret key.
177
+ # cookieSecret = "59320264f737a53fb01de73458c8849b0b623a7ba8174de8612fd569c2c25035"
178
+
179
+ # If false, will attempt to open a browser window on start.
180
+
181
+ # Default: false unless (1) we are on a Linux box where DISPLAY is unset, or
182
+ # (2) we are running in the Streamlit Atom plugin.
183
+ # headless = false
184
+
185
+ # Automatically rerun script when the file is modified on disk.
186
+
187
+ # Default: false
188
+ # runOnSave = false
189
+
190
+ # The address where the server will listen for client and browser
191
+ # connections. Use this if you want to bind the server to a specific address.
192
+ # If set, the server will only be accessible from this address, and not from
193
+ # any aliases (like localhost).
194
+
195
+ # Default: (unset)
196
+ # address =
197
+
198
+ # The port where the server will listen for browser connections.
199
+
200
+ # Default: 8501
201
+ # port = 8501
202
+
203
+ # The base path for the URL where Streamlit should be served from.
204
+
205
+ # Default: ""
206
+ # baseUrlPath = ""
207
+
208
+ # Enables support for Cross-Origin Resource Sharing (CORS) protection, for added security.
209
+
210
+ # Due to conflicts between CORS and XSRF, if `server.enableXsrfProtection` is on and
211
+ # `server.enableCORS` is off at the same time, we will prioritize `server.enableXsrfProtection`.
212
+
213
+ # Default: true
214
+ # enableCORS = true
215
+
216
+ # Enables support for Cross-Site Request Forgery (XSRF) protection, for added security.
217
+
218
+ # Due to conflicts between CORS and XSRF, if `server.enableXsrfProtection` is on and
219
+ # `server.enableCORS` is off at the same time, we will prioritize `server.enableXsrfProtection`.
220
+
221
+ # Default: true
222
+ # enableXsrfProtection = true
223
+
224
+ # Max size, in megabytes, for files uploaded with the file_uploader.
225
+
226
+ # Default: 200
227
+ maxUploadSize = 2
228
+
229
+ # Max size, in megabytes, of messages that can be sent via the WebSocket connection.
230
+
231
+ # Default: 200
232
+ # maxMessageSize = 200
233
+
234
+ # Enables support for websocket compression.
235
+
236
+ # Default: false
237
+ # enableWebsocketCompression = false
238
+
239
+ # Enable serving files from a `static` directory in the running app's directory.
240
+
241
+ # Default: false
242
+ # enableStaticServing = false
243
+
244
+ # Server certificate file for connecting via HTTPS.
245
+ # Must be set at the same time as "server.sslKeyFile".
246
+
247
+ # ['DO NOT USE THIS OPTION IN A PRODUCTION ENVIRONMENT. It has not gone through security audits or performance tests. For the production environment, we recommend performing SSL termination by the load balancer or the reverse proxy.']
248
+ # sslCertFile =
249
+
250
+ # Cryptographic key file for connecting via HTTPS.
251
+ # Must be set at the same time as "server.sslCertFile".
252
+
253
+ # ['DO NOT USE THIS OPTION IN A PRODUCTION ENVIRONMENT. It has not gone through security audits or performance tests. For the production environment, we recommend performing SSL termination by the load balancer or the reverse proxy.']
254
+ # sslKeyFile =
255
+
256
+
257
+ [browser]
258
+
259
+ # Internet address where users should point their browsers in order to
260
+ # connect to the app. Can be IP address or DNS name and path.
261
+
262
+ # This is used to:
263
+ # - Set the correct URL for CORS and XSRF protection purposes.
264
+ # - Show the URL on the terminal
265
+ # - Open the browser
266
+
267
+ # Default: "localhost"
268
+ # serverAddress = "localhost"
269
+
270
+ # Whether to send usage statistics to Streamlit.
271
+
272
+ # Default: true
273
+ # gatherUsageStats = true
274
+
275
+ # Port where users should point their browsers in order to connect to the
276
+ # app.
277
+
278
+ # This is used to:
279
+ # - Set the correct URL for CORS and XSRF protection purposes.
280
+ # - Show the URL on the terminal
281
+ # - Open the browser
282
+
283
+ # Default: whatever value is set in server.port.
284
+ # serverPort = 8501
285
+
286
+
287
+ [mapbox]
288
+
289
+ # Configure Streamlit to use a custom Mapbox
290
+ # token for elements like st.pydeck_chart and st.map.
291
+ # To get a token for yourself, create an account at
292
+ # https://mapbox.com. It's free (for moderate usage levels)!
293
+
294
+ # Default: ""
295
+ # token = ""
296
+
297
+
298
+ [deprecation]
299
+
300
+ # Set to false to disable the deprecation warning for the file uploader encoding.
301
+
302
+ # Default: true
303
+ # showfileUploaderEncoding = true
304
+
305
+ # Set to false to disable the deprecation warning for using the global pyplot instance.
306
+
307
+ # Default: true
308
+ # showPyplotGlobalUse = true
309
+
310
+
311
+ [theme]
312
+
313
+ # The preset Streamlit theme that your custom theme inherits from.
314
+ # One of "light" or "dark".
315
+ # base = "light"
316
+
317
+ # Primary accent color for interactive elements.
318
+ # primaryColor =
319
+
320
+ # Background color for the main content area.
321
+ # backgroundColor =
322
+
323
+ # Background color used for the sidebar and most interactive widgets.
324
+ # secondaryBackgroundColor =
325
+
326
+ # Color used for almost all text.
327
+ # textColor =
328
+
329
+ # Font family for all text in the app, except code blocks. One of "sans serif",
330
+ # "serif", or "monospace".
331
+ font = "sans serif"
332
+
README.md CHANGED
@@ -1,10 +1,11 @@
1
  ---
2
- title: Debugging
3
- emoji: 🏆
4
- colorFrom: purple
5
- colorTo: gray
6
  sdk: streamlit
7
  sdk_version: 1.27.2
 
8
  app_file: app.py
9
  pinned: false
10
  license: gpl-3.0
 
1
  ---
2
+ title: B3clf
3
+ emoji: 🏢
4
+ colorFrom: green
5
+ colorTo: pink
6
  sdk: streamlit
7
  sdk_version: 1.27.2
8
+ python_version: 3.8
9
  app_file: app.py
10
  pinned: false
11
  license: gpl-3.0
app.py ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools as it
2
+ import os
3
+ import tempfile
4
+ from io import StringIO
5
+
6
+ import joblib
7
+ import numpy as np
8
+ import pandas as pd
9
+ import pkg_resources
10
+ # page set up
11
+ import streamlit as st
12
+ from b3clf.descriptor_padel import compute_descriptors
13
+ from b3clf.geometry_opt import geometry_optimize
14
+ from b3clf.utils import get_descriptors, scale_descriptors, select_descriptors
15
+ # from PIL import Image
16
+ from streamlit_extras.let_it_rain import rain
17
+ from streamlit_ketcher import st_ketcher
18
+
19
+ from utils import generate_predictions, load_all_models
20
+
21
+ st.cache_data.clear()
22
+
23
+ st.set_page_config(
24
+ page_title="BBB Permeability Prediction with Imbalanced Learning",
25
+ # page_icon="🧊",
26
+ layout="wide",
27
+ # initial_sidebar_state="expanded",
28
+ # menu_items={
29
+ # "Get Help": "https://www.extremelycoolapp.com/help",
30
+ # "Report a bug": "https://www.extremelycoolapp.com/bug",
31
+ # "About": "# This is a header. This is an *extremely* cool app!"
32
+ # }
33
+ )
34
+
35
+
36
+ keep_features = "no"
37
+ keep_sdf = "no"
38
+ classifiers_dict = {
39
+ "decision tree": "dtree",
40
+ "kNN": "knn",
41
+ "logistic regression": "logreg",
42
+ "XGBoost": "xgb",
43
+ }
44
+ resample_methods_dict = {
45
+ "random undersampling": "classic_RandUndersampling",
46
+ "SMOTE": "classic_SMOTE",
47
+ "Borderline SMOTE": "borderline_SMOTE",
48
+ "k-means SMOTE": "kmeans_SMOTE",
49
+ "ADASYN": "classic_ADASYN",
50
+ "no resampling": "common",
51
+ }
52
+
53
+ pandas_display_options = {
54
+ "line_limit": 50,
55
+ }
56
+ mol_features = None
57
+ info_df = None
58
+ results = None
59
+ temp_file_path = None
60
+ all_models = load_all_models()
61
+
62
+ # Create the Streamlit app
63
+ st.title(":blue[BBB Permeability Prediction with Imbalanced Learning]")
64
+ info_column, upload_column = st.columns(2)
65
+
66
+ # initialize the molecule features and info dataframe session state
67
+ if "mol_features" not in st.session_state:
68
+ st.session_state.mol_features = None
69
+ if "info_df" not in st.session_state:
70
+ st.session_state.info_df = None
71
+
72
+
73
+ # download sample files
74
+ with info_column:
75
+ st.subheader("About `B3clf`")
76
+ # fmt: off
77
+ st.markdown(
78
+ """
79
+ `B3clf` is a Python package for predicting the blood-brain barrier (BBB) permeability of small molecules using imbalanced learning. It supports decision tree, XGBoost, kNN, logistical regression and 5 resampling strategies (SMOTE, Borderline SMOTE, k-means SMOTE and ADASYN). The workflow of `B3clf` is summarized as below. The Source code and more details are available at https://github.com/theochem/B3clf. This project is supported by Digital Research Alliance of Canada (originally known as Compute Canada) and NSERC. This project is maintained by QC-Dev comminity. For further information and inquiries please contact us at qcdevs@gmail.com."""
80
+ )
81
+ st.text(" \n")
82
+ # text_body = """
83
+ # `B3clf` is a Python package for predicting the blood-brain barrier (BBB) permeability of small molecules using imbalanced learning. It supports decision tree, XGBoost, kNN, logistical regression and 5 resampling strategies (SMOTE, Borderline SMOTE, k-means SMOTE and ADASYN). The workflow of `B3clf` is summarized as below. The Source code and more details are available at https://github.com/theochem/B3clf.
84
+ # """
85
+ # st.markdown(f"<p align="justify">{text_body}</p>",
86
+ # unsafe_allow_html=True)
87
+
88
+ # image = Image.open("images/b3clf_workflow.png")
89
+ # st.image(image=image, use_column_width=True)
90
+
91
+ # image_path = "images/b3clf_workflow.png"
92
+ # image_width_percent = 80
93
+ # info_column.markdown(
94
+ # f"<img src="{image_path}" style="max-width: {image_width_percent}%; height: auto;">",
95
+ # unsafe_allow_html=True
96
+ # )
97
+
98
+ # fmt: on
99
+ sdf_col, smi_col = st.columns(2)
100
+ with sdf_col:
101
+ # uneven columns
102
+ # st.columns((2, 1, 1, 1))
103
+ # two subcolumns for sample input files
104
+ # download sample sdf
105
+ # st.markdown(" \n \n")
106
+ with open("sample_input.sdf", "r") as file_sdf:
107
+ btn = st.download_button(
108
+ label="Download SDF sample file",
109
+ data=file_sdf,
110
+ file_name="sample_input.sdf",
111
+ )
112
+ with smi_col:
113
+ with open("sample_input_smiles.csv", "r") as file_smi:
114
+ btn = st.download_button(
115
+ label="Download SMILES sample file",
116
+ data=file_smi,
117
+ file_name="sample_input_smiles.csv",
118
+ )
119
+
120
+ # Create a file uploader
121
+ with upload_column:
122
+ st.subheader("Model Selection")
123
+ with st.container():
124
+ algorithm_col, resampler_col = st.columns(2)
125
+ # algorithm and resampling method selection column
126
+ with algorithm_col:
127
+ classifier = st.selectbox(
128
+ label="Classification Algorithm:",
129
+ options=("XGBoost", "kNN", "decision tree", "logistic regression"),
130
+ )
131
+ with resampler_col:
132
+ resampler = st.selectbox(
133
+ label="Resampling Method:",
134
+ options=(
135
+ "ADASYN",
136
+ "random undersampling",
137
+ "Borderline SMOTE",
138
+ "k-means SMOTE",
139
+ "SMOTE",
140
+ "no resampling",
141
+ ),
142
+ )
143
+
144
+ # horizontal line
145
+ st.divider()
146
+ # upload_col, submit_job_col = st.columns((2, 1))
147
+ upload_col, _, submit_job_col, _ = st.columns((4, 0.05, 1, 0.05))
148
+ # upload file column
149
+ with upload_col:
150
+ # session state tracking of the file uploader
151
+ if "uploaded_file" not in st.session_state:
152
+ st.session_state.uploaded_file = None
153
+ if "uploaded_file_changed" not in st.session_state:
154
+ st.session_state.uploaded_file_changed = False
155
+
156
+ # def update_uploader_session_info():
157
+ # """Update the session state of the file uploader."""
158
+ # st.session_state.uploaded_file = uploaded_file
159
+
160
+ uploaded_file = st.file_uploader(
161
+ label="Upload a CSV, SDF, TXT or SMI file",
162
+ type=["csv", "sdf", "txt", "smi"],
163
+ help="Input molecule file only supports *.csv, *.sdf, *.txt and *.smi.",
164
+ accept_multiple_files=False,
165
+ # key="uploaded_file",
166
+ # on_change=update_uploader_session_info,
167
+ )
168
+
169
+ if uploaded_file:
170
+ # st.write(f"the uploaded file: {uploaded_file}")
171
+ # when the newly uploaded file is different from the previous one
172
+ if st.session_state.uploaded_file != uploaded_file:
173
+ st.session_state.uploaded_file_changed = True
174
+ else:
175
+ st.session_state.uploaded_file_changed = False
176
+ st.session_state.uploaded_file = uploaded_file
177
+ # when new file is the same as the previous one
178
+ # else:
179
+ # st.session_state.uploaded_file_changed = False
180
+ # st.session_state.uploaded_file = uploaded_file
181
+
182
+ # set session state for the file uploader
183
+ # st.write(f"the state of uploaded file: {st.session_state.uploaded_file}")
184
+ # st.write(f"the state of uploaded file changed: {st.session_state.uploaded_file_changed}")
185
+
186
+ # submit job column
187
+ with submit_job_col:
188
+ st.text(" \n")
189
+ st.text(" \n")
190
+ st.markdown(
191
+ "<div style='display: flex; justify-content: center;'>",
192
+ unsafe_allow_html=True,
193
+ )
194
+ submit_job_button = st.button(
195
+ label="Submit Job", type="secondary", key="job_button"
196
+ )
197
+ # submit_job_col.markdown("<div style="display: flex; justify-content: center;">",
198
+ # unsafe_allow_html=True)
199
+ # submit_job_button = submit_job_col.button(
200
+ # label="Submit job", key="submit_job_button", type="secondary"
201
+ # )
202
+ # submit_job_col.markdown("</div>", unsafe_allow_html=True)
203
+
204
+
205
+ # st.write("The content of the file will be displayed below once uploaded.")
206
+ # if file:
207
+ # if "csv" in file.name or "txt" in file.name:
208
+ # st.write(file.read().decode("utf-8"))
209
+ # st.write(file)
210
+
211
+
212
+ feature_column, prediction_column = st.columns(2)
213
+ with feature_column:
214
+ st.subheader("Molecular Features")
215
+
216
+ placeholder_features = st.empty()
217
+ # placeholder_features = pd.DataFrame(index=[1, 2, 3, 4],
218
+ # columns=["ID", "nAcid", "ALogP", "Alogp2",
219
+ # "AMR", "naAromAtom", "nH", "nN"])
220
+ # st.dataframe(placeholder_features)
221
+ # placeholder_features.text("molecular features")
222
+
223
+ with prediction_column:
224
+ st.subheader("Predictions")
225
+ # placeholder_predictions = st.empty()
226
+ # placeholder_predictions.text("prediction")
227
+
228
+
229
+ st.write(
230
+ f"the state of uploaded file changed before checking: {st.session_state.uploaded_file_changed}"
231
+ )
232
+ # Generate predictions when the user uploads a file
233
+ # if submit_job_button:
234
+
235
+ # if "job_button" in st.session_state:
236
+ # when new file is uploaded
237
+ # update_uploader_session_info()
238
+ # st.write(
239
+ # f"the state of uploaded file changed after checking: {st.session_state.uploaded_file_changed}"
240
+ # )
241
+ # if st.session_state.uploaded_file_changed:
242
+ # temp_dir = tempfile.mkdtemp()
243
+ # # Create a temporary file path for the uploaded file
244
+ # temp_file_path = os.path.join(temp_dir, uploaded_file.name)
245
+ # # Save the uploaded file to the temporary file path
246
+ # with open(temp_file_path, "wb") as temp_file:
247
+ # temp_file.write(uploaded_file.read())
248
+
249
+ # mol_features, info_df, results = generate_predictions(
250
+ # input_fname=temp_file_path,
251
+ # sep="\s+|\t+",
252
+ # clf=classifiers_dict[classifier],
253
+ # _models_dict=all_models,
254
+ # sampling=resample_methods_dict[resampler],
255
+ # time_per_mol=120,
256
+ # mol_features=None,
257
+ # info_df=None,
258
+ # )
259
+ # st.session_state.mol_features = mol_features
260
+ # st.session_state.info_df = info_df
261
+ # else:
262
+ # mol_features, info_df, results = generate_predictions(
263
+ # input_fname=None,
264
+ # sep="\s+|\t+",
265
+ # clf=classifiers_dict[classifier],
266
+ # _models_dict=all_models,
267
+ # sampling=resample_methods_dict[resampler],
268
+ # time_per_mol=120,
269
+ # mol_features=st.session_state.mol_features,
270
+ # info_df=st.session_state.info_df,
271
+ # )
272
+ if submit_job_button and uploaded_file:
273
+ temp_dir = tempfile.mkdtemp()
274
+ # Create a temporary file path for the uploaded file
275
+ temp_file_path = os.path.join(temp_dir, uploaded_file.name)
276
+ # Save the uploaded file to the temporary file path
277
+ with open(temp_file_path, "wb") as temp_file:
278
+ temp_file.write(uploaded_file.read())
279
+ mol_features, info_df, results = generate_predictions(
280
+ input_fname=temp_file_path,
281
+ sep="\s+|\t+",
282
+ clf=classifiers_dict[classifier],
283
+ _models_dict=all_models,
284
+ sampling=resample_methods_dict[resampler],
285
+ time_per_mol=120,
286
+ mol_features=None,
287
+ info_df=None,
288
+ )
289
+
290
+ # feature table
291
+ with feature_column:
292
+ if mol_features is not None:
293
+ selected_feature_rows = np.min(
294
+ [mol_features.shape[0], pandas_display_options["line_limit"]]
295
+ )
296
+ st.dataframe(mol_features.iloc[:selected_feature_rows, :], hide_index=False)
297
+ # placeholder_features.dataframe(mol_features, hide_index=False)
298
+ feature_file_name = uploaded_file.name.split(".")[0] + "_b3clf_features.csv"
299
+ features_csv = mol_features.to_csv(index=True)
300
+ st.download_button(
301
+ "Download features as CSV",
302
+ data=features_csv,
303
+ file_name=feature_file_name,
304
+ )
305
+ # prediction table
306
+ with prediction_column:
307
+ # st.subheader("Predictions")
308
+ if results is not None:
309
+ # Display the predictions in a table
310
+ selected_result_rows = np.min(
311
+ [results.shape[0], pandas_display_options["line_limit"]]
312
+ )
313
+ results_df_display = results.iloc[:selected_result_rows, :].style.format(
314
+ {"B3clf_predicted_probability": "{:.6f}".format}
315
+ )
316
+ st.dataframe(results_df_display, hide_index=True)
317
+ # Add a button to download the predictions as a CSV file
318
+ predictions_csv = results.to_csv(index=True)
319
+ results_file_name = (
320
+ uploaded_file.name.split(".")[0] + "_b3clf_predictions.csv"
321
+ )
322
+ st.download_button(
323
+ "Download predictions as CSV",
324
+ data=predictions_csv,
325
+ file_name=results_file_name,
326
+ )
327
+ # indicate the success of the job
328
+ # rain(
329
+ # emoji="🎈",
330
+ # font_size=54,
331
+ # falling_speed=5,
332
+ # animation_length=10,
333
+ # )
334
+ st.balloons()
335
+
336
+
337
+ # hide footer
338
+ # https://github.com/streamlit/streamlit/issues/892
339
+ hide_streamlit_style = """
340
+ <style>
341
+ #MainMenu {visibility: hidden;}
342
+ footer {visibility: hidden;}
343
+ </style>
344
+ """
345
+ st.markdown(hide_streamlit_style, unsafe_allow_html=True)
346
+
347
+ # add google analytics
348
+ st.markdown(
349
+ """
350
+ <!-- Google tag (gtag.js) -->
351
+ <script async src="https://www.googletagmanager.com/gtag/js?id=G-WG8QYRELP9"></script>
352
+ <script>
353
+ window.dataLayer = window.dataLayer || [];
354
+ function gtag(){dataLayer.push(arguments);}
355
+ gtag("js", new Date());
356
+
357
+ gtag("config", "G-WG8QYRELP9");
358
+ </script>
359
+ """,
360
+ unsafe_allow_html=True,
361
+ )
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ default-jre
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy==1.24.4
2
+ scipy==1.10.1
3
+ scikit-learn==0.24.2
4
+ joblib==1.3.2
5
+ pandas==2.0.3
6
+ openpyxl==3.1.2
7
+ xgboost==1.4.2
8
+ padelpy>=0.1.11
9
+ rdkit==2023.03.3
10
+ # streamlit-extra==0.3.4
11
+ git+https://github.com/arnaudmiribel/streamlit-extras@v0.3.4
12
+ # for visualization
13
+ streamlit-ketcher
14
+ # for single molecule
15
+ # py3Dmol==2.0.0.post2
16
+ # stmol==0.0.9
17
+ git+https://github.com/theochem/B3clf.git
sample_input.sdf ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ H1_Bepotastine
2
+ RDKit 3D
3
+
4
+ 52 54 0 0 1 0 0 0 0 0999 V2000
5
+ 6.2601 3.8627 -0.7580 Cl 0 0 0 0 0 0 0 0 0 0 0 0
6
+ 0.7350 0.2169 -0.1032 O 0 0 0 0 0 0 0 0 0 0 0 0
7
+ -7.2627 2.0029 -1.7812 O 0 0 0 0 0 0 0 0 0 0 0 0
8
+ -7.8739 -0.0429 -1.1421 O 0 0 0 0 0 0 0 0 0 0 0 0
9
+ -3.2826 0.1387 1.0997 N 0 0 0 0 0 0 0 0 0 0 0 0
10
+ 2.0420 -2.0119 -1.2138 N 0 0 0 0 0 0 0 0 0 0 0 0
11
+ -0.4341 -0.2713 0.5552 C 0 0 0 0 0 0 0 0 0 0 0 0
12
+ -1.5088 -0.5144 -0.4974 C 0 0 0 0 0 0 0 0 0 0 0 0
13
+ -0.9255 0.7694 1.5572 C 0 0 0 0 0 0 0 0 0 0 0 0
14
+ -2.8345 -0.8975 0.1550 C 0 0 0 0 0 0 0 0 0 0 0 0
15
+ -2.2740 0.3674 2.1479 C 0 0 0 0 0 0 0 0 0 0 0 0
16
+ -4.5811 -0.1850 1.7144 C 0 0 0 0 0 0 0 0 0 0 0 0
17
+ -5.7574 -0.2607 0.7330 C 0 0 0 0 0 0 0 0 0 0 0 0
18
+ 1.9672 -0.2099 0.5040 C 0 0 2 0 0 0 0 0 0 0 0 0
19
+ -5.9298 1.0111 -0.0974 C 0 0 0 0 0 0 0 0 0 0 0 0
20
+ 3.0410 0.8232 0.1855 C 0 0 0 0 0 0 0 0 0 0 0 0
21
+ 2.3687 -1.6155 0.0463 C 0 0 0 0 0 0 0 0 0 0 0 0
22
+ 3.9935 1.1819 1.1545 C 0 0 0 0 0 0 0 0 0 0 0 0
23
+ 3.1185 1.4155 -1.0867 C 0 0 0 0 0 0 0 0 0 0 0 0
24
+ -7.1061 0.8976 -1.0266 C 0 0 0 0 0 0 0 0 0 0 0 0
25
+ 3.0746 -2.4482 0.9176 C 0 0 0 0 0 0 0 0 0 0 0 0
26
+ 4.9873 2.1194 0.8610 C 0 0 0 0 0 0 0 0 0 0 0 0
27
+ 4.1084 2.3564 -1.3784 C 0 0 0 0 0 0 0 0 0 0 0 0
28
+ 3.4496 -3.7187 0.4871 C 0 0 0 0 0 0 0 0 0 0 0 0
29
+ 5.0380 2.7045 -0.4026 C 0 0 0 0 0 0 0 0 0 0 0 0
30
+ 2.4252 -3.2455 -1.6060 C 0 0 0 0 0 0 0 0 0 0 0 0
31
+ 3.1214 -4.1271 -0.7990 C 0 0 0 0 0 0 0 0 0 0 0 0
32
+ -0.2263 -1.2199 1.0679 H 0 0 0 0 0 0 0 0 0 0 0 0
33
+ -1.6364 0.3807 -1.1209 H 0 0 0 0 0 0 0 0 0 0 0 0
34
+ -1.1831 -1.3082 -1.1808 H 0 0 0 0 0 0 0 0 0 0 0 0
35
+ -0.1894 0.8975 2.3595 H 0 0 0 0 0 0 0 0 0 0 0 0
36
+ -1.0042 1.7496 1.0680 H 0 0 0 0 0 0 0 0 0 0 0 0
37
+ -3.5642 -1.0250 -0.6514 H 0 0 0 0 0 0 0 0 0 0 0 0
38
+ -2.7343 -1.8665 0.6611 H 0 0 0 0 0 0 0 0 0 0 0 0
39
+ -2.1498 -0.5299 2.7684 H 0 0 0 0 0 0 0 0 0 0 0 0
40
+ -2.6054 1.1766 2.8103 H 0 0 0 0 0 0 0 0 0 0 0 0
41
+ -4.5185 -1.1314 2.2673 H 0 0 0 0 0 0 0 0 0 0 0 0
42
+ -4.8272 0.5917 2.4507 H 0 0 0 0 0 0 0 0 0 0 0 0
43
+ -5.6514 -1.1306 0.0739 H 0 0 0 0 0 0 0 0 0 0 0 0
44
+ -6.6737 -0.4399 1.3108 H 0 0 0 0 0 0 0 0 0 0 0 0
45
+ 1.8204 -0.2159 1.5927 H 0 0 0 0 0 0 0 0 0 0 0 0
46
+ -6.0945 1.8686 0.5639 H 0 0 0 0 0 0 0 0 0 0 0 0
47
+ -5.0396 1.1941 -0.7083 H 0 0 0 0 0 0 0 0 0 0 0 0
48
+ 3.9687 0.7355 2.1458 H 0 0 0 0 0 0 0 0 0 0 0 0
49
+ 2.3964 1.1402 -1.8552 H 0 0 0 0 0 0 0 0 0 0 0 0
50
+ 3.3355 -2.1177 1.9176 H 0 0 0 0 0 0 0 0 0 0 0 0
51
+ 5.7167 2.3889 1.6199 H 0 0 0 0 0 0 0 0 0 0 0 0
52
+ 4.1451 2.8085 -2.3655 H 0 0 0 0 0 0 0 0 0 0 0 0
53
+ 3.9993 -4.3824 1.1485 H 0 0 0 0 0 0 0 0 0 0 0 0
54
+ 2.1492 -3.5132 -2.6219 H 0 0 0 0 0 0 0 0 0 0 0 0
55
+ 3.4047 -5.1069 -1.1664 H 0 0 0 0 0 0 0 0 0 0 0 0
56
+ -8.0410 1.8004 -2.3409 H 0 0 0 0 0 0 0 0 0 0 0 0
57
+ 1 25 1 0
58
+ 2 7 1 0
59
+ 2 14 1 0
60
+ 3 20 1 0
61
+ 3 52 1 0
62
+ 4 20 2 0
63
+ 5 10 1 0
64
+ 5 11 1 0
65
+ 5 12 1 0
66
+ 6 17 2 0
67
+ 6 26 1 0
68
+ 7 8 1 0
69
+ 7 9 1 0
70
+ 7 28 1 0
71
+ 8 10 1 0
72
+ 8 29 1 0
73
+ 8 30 1 0
74
+ 9 11 1 0
75
+ 9 31 1 0
76
+ 9 32 1 0
77
+ 10 33 1 0
78
+ 10 34 1 0
79
+ 11 35 1 0
80
+ 11 36 1 0
81
+ 12 13 1 0
82
+ 12 37 1 0
83
+ 12 38 1 0
84
+ 13 15 1 0
85
+ 13 39 1 0
86
+ 13 40 1 0
87
+ 14 16 1 0
88
+ 14 17 1 0
89
+ 14 41 1 1
90
+ 15 20 1 0
91
+ 15 42 1 0
92
+ 15 43 1 0
93
+ 16 18 2 0
94
+ 16 19 1 0
95
+ 17 21 1 0
96
+ 18 22 1 0
97
+ 18 44 1 0
98
+ 19 23 2 0
99
+ 19 45 1 0
100
+ 21 24 2 0
101
+ 21 46 1 0
102
+ 22 25 2 0
103
+ 22 47 1 0
104
+ 23 25 1 0
105
+ 23 48 1 0
106
+ 24 27 1 0
107
+ 24 49 1 0
108
+ 26 27 2 0
109
+ 26 50 1 0
110
+ 27 51 1 0
111
+ M END
112
+ > <compoud_name> (1)
113
+ H1_Bepotastine
114
+
115
+ > <SMILES> (1)
116
+ [H]OC(=O)C([H])([H])C([H])([H])C([H])([H])N1C([H])([H])C([H])([H])C([H])(OC([H])(c2nc([H])c([H])c([H])c2[H])c2c([H])c([H])c(Cl)c([H])c2[H])C([H])([H])C1([H])[H]
117
+
118
+ > <cid> (1)
119
+ 2350
120
+
121
+ > <category> (1)
122
+ N
123
+
124
+ > <inchi> (1)
125
+ InChI=1S/C21H25ClN2O3/c22-17-8-6-16(7-9-17)21(19-4-1-2-12-23-19)27-18-10-14-24(15-11-18)13-3-5-20(25)26/h1-2,4,6-9,12,18,21H,3,5,10-11,13-15H2,(H,25,26)/t21-/m1/s1
126
+
127
+ > <Energy> (1)
128
+ 49.1758
129
+
130
+ $$$$
131
+ H1_Quifenadine
132
+ RDKit 3D
133
+
134
+ 45 48 0 0 1 0 0 0 0 0999 V2000
135
+ 0.1106 0.2102 -1.7897 O 0 0 0 0 0 0 0 0 0 0 0 0
136
+ 3.4646 1.0770 -0.0854 N 0 0 0 0 0 0 0 0 0 0 0 0
137
+ 2.0931 -1.1209 0.1252 C 0 0 0 0 0 0 0 0 0 0 0 0
138
+ 1.1729 0.1166 0.3820 C 0 0 1 0 0 0 0 0 0 0 0 0
139
+ 2.0299 1.3864 0.1159 C 0 0 0 0 0 0 0 0 0 0 0 0
140
+ 2.7971 -1.0339 -1.2379 C 0 0 0 0 0 0 0 0 0 0 0 0
141
+ 3.2148 -1.0584 1.1848 C 0 0 0 0 0 0 0 0 0 0 0 0
142
+ 3.5902 0.2772 -1.3240 C 0 0 0 0 0 0 0 0 0 0 0 0
143
+ 3.9592 0.2796 1.0561 C 0 0 0 0 0 0 0 0 0 0 0 0
144
+ -0.2029 0.1255 -0.3860 C 0 0 0 0 0 0 0 0 0 0 0 0
145
+ -1.1272 1.3230 -0.0602 C 0 0 0 0 0 0 0 0 0 0 0 0
146
+ -0.9736 -1.1857 -0.1269 C 0 0 0 0 0 0 0 0 0 0 0 0
147
+ -1.0387 2.0636 1.1310 C 0 0 0 0 0 0 0 0 0 0 0 0
148
+ -1.3454 -2.0428 -1.1782 C 0 0 0 0 0 0 0 0 0 0 0 0
149
+ -2.1533 1.6708 -0.9653 C 0 0 0 0 0 0 0 0 0 0 0 0
150
+ -1.3459 -1.5543 1.1811 C 0 0 0 0 0 0 0 0 0 0 0 0
151
+ -1.9065 3.1310 1.3840 C 0 0 0 0 0 0 0 0 0 0 0 0
152
+ -2.0526 -3.2227 -0.9327 C 0 0 0 0 0 0 0 0 0 0 0 0
153
+ -3.0179 2.7377 -0.7134 C 0 0 0 0 0 0 0 0 0 0 0 0
154
+ -2.0493 -2.7364 1.4259 C 0 0 0 0 0 0 0 0 0 0 0 0
155
+ -2.8897 3.4721 0.4604 C 0 0 0 0 0 0 0 0 0 0 0 0
156
+ -2.4022 -3.5700 0.3691 C 0 0 0 0 0 0 0 0 0 0 0 0
157
+ 1.5541 -2.0675 0.2237 H 0 0 0 0 0 0 0 0 0 0 0 0
158
+ 0.9532 0.0967 1.4588 H 0 0 0 0 0 0 0 0 0 0 0 0
159
+ 1.6691 1.9630 -0.7430 H 0 0 0 0 0 0 0 0 0 0 0 0
160
+ 1.9423 2.0685 0.9712 H 0 0 0 0 0 0 0 0 0 0 0 0
161
+ 2.0851 -1.1104 -2.0638 H 0 0 0 0 0 0 0 0 0 0 0 0
162
+ 3.4846 -1.8820 -1.3506 H 0 0 0 0 0 0 0 0 0 0 0 0
163
+ 3.9137 -1.8918 1.0436 H 0 0 0 0 0 0 0 0 0 0 0 0
164
+ 2.7942 -1.1596 2.1923 H 0 0 0 0 0 0 0 0 0 0 0 0
165
+ 4.6485 0.0638 -1.5199 H 0 0 0 0 0 0 0 0 0 0 0 0
166
+ 3.2467 0.8670 -2.1831 H 0 0 0 0 0 0 0 0 0 0 0 0
167
+ 3.8541 0.8576 1.9828 H 0 0 0 0 0 0 0 0 0 0 0 0
168
+ 5.0353 0.0986 0.9430 H 0 0 0 0 0 0 0 0 0 0 0 0
169
+ 0.1304 1.1516 -2.0295 H 0 0 0 0 0 0 0 0 0 0 0 0
170
+ -0.3059 1.8245 1.8958 H 0 0 0 0 0 0 0 0 0 0 0 0
171
+ -1.0856 -1.7976 -2.2061 H 0 0 0 0 0 0 0 0 0 0 0 0
172
+ -2.2926 1.0941 -1.8795 H 0 0 0 0 0 0 0 0 0 0 0 0
173
+ -1.0974 -0.9178 2.0267 H 0 0 0 0 0 0 0 0 0 0 0 0
174
+ -1.8179 3.6927 2.3110 H 0 0 0 0 0 0 0 0 0 0 0 0
175
+ -2.3308 -3.8683 -1.7614 H 0 0 0 0 0 0 0 0 0 0 0 0
176
+ -3.7962 2.9864 -1.4300 H 0 0 0 0 0 0 0 0 0 0 0 0
177
+ -2.3260 -3.0022 2.4429 H 0 0 0 0 0 0 0 0 0 0 0 0
178
+ -3.5643 4.2999 0.6616 H 0 0 0 0 0 0 0 0 0 0 0 0
179
+ -2.9530 -4.4872 0.5586 H 0 0 0 0 0 0 0 0 0 0 0 0
180
+ 1 10 1 0
181
+ 1 35 1 0
182
+ 2 5 1 0
183
+ 2 8 1 0
184
+ 2 9 1 0
185
+ 3 4 1 0
186
+ 3 6 1 0
187
+ 3 7 1 0
188
+ 3 23 1 0
189
+ 4 5 1 0
190
+ 4 10 1 0
191
+ 4 24 1 1
192
+ 5 25 1 0
193
+ 5 26 1 0
194
+ 6 8 1 0
195
+ 6 27 1 0
196
+ 6 28 1 0
197
+ 7 9 1 0
198
+ 7 29 1 0
199
+ 7 30 1 0
200
+ 8 31 1 0
201
+ 8 32 1 0
202
+ 9 33 1 0
203
+ 9 34 1 0
204
+ 10 11 1 0
205
+ 10 12 1 0
206
+ 11 13 2 0
207
+ 11 15 1 0
208
+ 12 14 2 0
209
+ 12 16 1 0
210
+ 13 17 1 0
211
+ 13 36 1 0
212
+ 14 18 1 0
213
+ 14 37 1 0
214
+ 15 19 2 0
215
+ 15 38 1 0
216
+ 16 20 2 0
217
+ 16 39 1 0
218
+ 17 21 2 0
219
+ 17 40 1 0
220
+ 18 22 2 0
221
+ 18 41 1 0
222
+ 19 21 1 0
223
+ 19 42 1 0
224
+ 20 22 1 0
225
+ 20 43 1 0
226
+ 21 44 1 0
227
+ 22 45 1 0
228
+ M END
229
+ > <compoud_name> (2)
230
+ H1_Quifenadine
231
+
232
+ > <SMILES> (2)
233
+ [H]OC(c1c([H])c([H])c([H])c([H])c1[H])(c1c([H])c([H])c([H])c([H])c1[H])C1([H])C([H])([H])N2C([H])([H])C([H])([H])C1([H])C([H])([H])C2([H])[H]
234
+
235
+ > <cid> (2)
236
+ 65600
237
+
238
+ > <category> (2)
239
+ N
240
+
241
+ > <inchi> (2)
242
+ InChI=1S/C20H23NO/c22-20(17-7-3-1-4-8-17,18-9-5-2-6-10-18)19-15-21-13-11-16(19)12-14-21/h1-10,16,19,22H,11-15H2/t19-/m1/s1
243
+
244
+ > <Energy> (2)
245
+ 84.891
246
+
247
+ $$$$
248
+ H1_Rupatadine
249
+ RDKit 3D
250
+
251
+ 56 60 0 0 0 0 0 0 0 0999 V2000
252
+ 6.5298 3.3080 0.0562 Cl 0 0 0 0 0 0 0 0 0 0 0 0
253
+ -2.1780 1.1440 -0.1081 N 0 0 0 0 0 0 0 0 0 0 0 0
254
+ 1.8055 -2.5028 1.6263 N 0 0 0 0 0 0 0 0 0 0 0 0
255
+ -6.5347 -0.2932 -1.5666 N 0 0 0 0 0 0 0 0 0 0 0 0
256
+ 0.4984 0.2017 0.7391 C 0 0 0 0 0 0 0 0 0 0 0 0
257
+ -0.7596 -0.6401 0.9176 C 0 0 0 0 0 0 0 0 0 0 0 0
258
+ 0.1325 1.6779 0.6992 C 0 0 0 0 0 0 0 0 0 0 0 0
259
+ -1.8276 -0.2907 -0.1321 C 0 0 0 0 0 0 0 0 0 0 0 0
260
+ -0.9697 1.9571 -0.3378 C 0 0 0 0 0 0 0 0 0 0 0 0
261
+ 1.7535 -0.3064 0.5966 C 0 0 0 0 0 0 0 0 0 0 0 0
262
+ -3.2065 1.4670 -1.1132 C 0 0 0 0 0 0 0 0 0 0 0 0
263
+ 2.9347 0.5760 0.4016 C 0 0 0 0 0 0 0 0 0 0 0 0
264
+ 1.9383 -1.7730 0.4937 C 0 0 0 0 0 0 0 0 0 0 0 0
265
+ 3.7669 0.4917 -0.7359 C 0 0 0 0 0 0 0 0 0 0 0 0
266
+ 3.6248 -0.5108 -1.8705 C 0 0 0 0 0 0 0 0 0 0 0 0
267
+ 2.3939 -1.4219 -1.9523 C 0 0 0 0 0 0 0 0 0 0 0 0
268
+ 2.2514 -2.3194 -0.7533 C 0 0 0 0 0 0 0 0 0 0 0 0
269
+ -4.5656 0.8945 -0.7963 C 0 0 0 0 0 0 0 0 0 0 0 0
270
+ 3.2715 1.4705 1.4385 C 0 0 0 0 0 0 0 0 0 0 0 0
271
+ 4.8769 1.3617 -0.8210 C 0 0 0 0 0 0 0 0 0 0 0 0
272
+ 2.4290 -3.7014 -0.8308 C 0 0 0 0 0 0 0 0 0 0 0 0
273
+ 4.3729 2.3200 1.3344 C 0 0 0 0 0 0 0 0 0 0 0 0
274
+ 5.1670 2.2679 0.1982 C 0 0 0 0 0 0 0 0 0 0 0 0
275
+ -5.1566 1.0467 0.4633 C 0 0 0 0 0 0 0 0 0 0 0 0
276
+ -5.3042 0.2290 -1.7686 C 0 0 0 0 0 0 0 0 0 0 0 0
277
+ 2.2947 -4.4730 0.3198 C 0 0 0 0 0 0 0 0 0 0 0 0
278
+ 1.9875 -3.8347 1.5112 C 0 0 0 0 0 0 0 0 0 0 0 0
279
+ -6.4311 0.5316 0.7094 C 0 0 0 0 0 0 0 0 0 0 0 0
280
+ -7.0633 -0.1364 -0.3325 C 0 0 0 0 0 0 0 0 0 0 0 0
281
+ -7.0626 0.6338 2.0605 C 0 0 0 0 0 0 0 0 0 0 0 0
282
+ -0.5731 -1.7154 0.8560 H 0 0 0 0 0 0 0 0 0 0 0 0
283
+ -1.1596 -0.4557 1.9235 H 0 0 0 0 0 0 0 0 0 0 0 0
284
+ -0.2119 1.9818 1.6961 H 0 0 0 0 0 0 0 0 0 0 0 0
285
+ 0.9793 2.3217 0.4489 H 0 0 0 0 0 0 0 0 0 0 0 0
286
+ -1.4699 -0.5848 -1.1284 H 0 0 0 0 0 0 0 0 0 0 0 0
287
+ -2.7127 -0.8992 0.0866 H 0 0 0 0 0 0 0 0 0 0 0 0
288
+ -1.2287 3.0211 -0.2712 H 0 0 0 0 0 0 0 0 0 0 0 0
289
+ -0.5727 1.7824 -1.3473 H 0 0 0 0 0 0 0 0 0 0 0 0
290
+ -2.8776 1.1445 -2.1102 H 0 0 0 0 0 0 0 0 0 0 0 0
291
+ -3.3405 2.5558 -1.1674 H 0 0 0 0 0 0 0 0 0 0 0 0
292
+ 3.6660 0.0536 -2.8120 H 0 0 0 0 0 0 0 0 0 0 0 0
293
+ 4.5182 -1.1506 -1.8447 H 0 0 0 0 0 0 0 0 0 0 0 0
294
+ 2.4771 -2.0361 -2.8582 H 0 0 0 0 0 0 0 0 0 0 0 0
295
+ 1.4795 -0.8292 -2.0837 H 0 0 0 0 0 0 0 0 0 0 0 0
296
+ 2.6674 1.5029 2.3444 H 0 0 0 0 0 0 0 0 0 0 0 0
297
+ 5.5326 1.3154 -1.6888 H 0 0 0 0 0 0 0 0 0 0 0 0
298
+ 2.6741 -4.1805 -1.7747 H 0 0 0 0 0 0 0 0 0 0 0 0
299
+ 4.6043 3.0064 2.1437 H 0 0 0 0 0 0 0 0 0 0 0 0
300
+ -4.6110 1.5606 1.2526 H 0 0 0 0 0 0 0 0 0 0 0 0
301
+ -4.9162 0.0859 -2.7735 H 0 0 0 0 0 0 0 0 0 0 0 0
302
+ 2.4295 -5.5486 0.2902 H 0 0 0 0 0 0 0 0 0 0 0 0
303
+ 1.8762 -4.3969 2.4339 H 0 0 0 0 0 0 0 0 0 0 0 0
304
+ -8.0471 -0.5796 -0.2022 H 0 0 0 0 0 0 0 0 0 0 0 0
305
+ -8.1536 0.6818 1.9793 H 0 0 0 0 0 0 0 0 0 0 0 0
306
+ -6.7913 -0.2348 2.6683 H 0 0 0 0 0 0 0 0 0 0 0 0
307
+ -6.7355 1.5422 2.5773 H 0 0 0 0 0 0 0 0 0 0 0 0
308
+ 1 23 1 0
309
+ 2 8 1 0
310
+ 2 9 1 0
311
+ 2 11 1 0
312
+ 3 13 2 0
313
+ 3 27 1 0
314
+ 4 25 2 0
315
+ 4 29 1 0
316
+ 5 6 1 0
317
+ 5 7 1 0
318
+ 5 10 2 3
319
+ 6 8 1 0
320
+ 6 31 1 0
321
+ 6 32 1 0
322
+ 7 9 1 0
323
+ 7 33 1 0
324
+ 7 34 1 0
325
+ 8 35 1 0
326
+ 8 36 1 0
327
+ 9 37 1 0
328
+ 9 38 1 0
329
+ 10 12 1 0
330
+ 10 13 1 0
331
+ 11 18 1 0
332
+ 11 39 1 0
333
+ 11 40 1 0
334
+ 12 14 2 0
335
+ 12 19 1 0
336
+ 13 17 1 0
337
+ 14 15 1 0
338
+ 14 20 1 0
339
+ 15 16 1 0
340
+ 15 41 1 0
341
+ 15 42 1 0
342
+ 16 17 1 0
343
+ 16 43 1 0
344
+ 16 44 1 0
345
+ 17 21 2 0
346
+ 18 24 2 0
347
+ 18 25 1 0
348
+ 19 22 2 0
349
+ 19 45 1 0
350
+ 20 23 2 0
351
+ 20 46 1 0
352
+ 21 26 1 0
353
+ 21 47 1 0
354
+ 22 23 1 0
355
+ 22 48 1 0
356
+ 24 28 1 0
357
+ 24 49 1 0
358
+ 25 50 1 0
359
+ 26 27 2 0
360
+ 26 51 1 0
361
+ 27 52 1 0
362
+ 28 29 2 0
363
+ 28 30 1 0
364
+ 29 53 1 0
365
+ 30 54 1 0
366
+ 30 55 1 0
367
+ 30 56 1 0
368
+ M END
369
+ > <compoud_name> (3)
370
+ H1_Rupatadine
371
+
372
+ > <SMILES> (3)
373
+ [H]c1nc2c(c([H])c1[H])C([H])([H])C([H])([H])c1c([H])c(Cl)c([H])c([H])c1C2=C1C([H])([H])C([H])([H])N(C([H])([H])c2c([H])nc([H])c(C([H])([H])[H])c2[H])C([H])([H])C1([H])[H]
374
+
375
+ > <cid> (3)
376
+ 133017
377
+
378
+ > <category> (3)
379
+ N
380
+
381
+ > <inchi> (3)
382
+ InChI=1S/C26H26ClN3/c1-18-13-19(16-28-15-18)17-30-11-8-20(9-12-30)25-24-7-6-23(27)14-22(24)5-4-21-3-2-10-29-26(21)25/h2-3,6-7,10,13-16H,4-5,8-9,11-12,17H2,1H3
383
+
384
+ > <Energy> (3)
385
+ 119.976
386
+
387
+ $$$$
sample_input_smiles.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ OC(=O)CCCN1CCC(OC(c2ncccc2)c2ccc(Cl)cc2)CC1
2
+ OC(c1ccccc1)(c1ccccc1)C1CN2CCC1CC2
3
+ c1nc2c(cc1)CCc1cc(Cl)ccc1C2=C1CCN(Cc2cncc(C)c2)CC1
4
+ C1=CC=C2C(=C1)C=CC3=CC=CC=C3N2C(=O)N
5
+ CC(=O)Oc1ccccc1C(=O)O
6
+ CC(=O)Oc1c(cc(cc1)Cl)C(=O)OC(=O)c1c(ccc(c1)Cl)OC(=O)C
utils.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools as it
2
+ import os
3
+
4
+ import joblib
5
+ import numpy as np
6
+ import pandas as pd
7
+ import pkg_resources
8
+ import streamlit as st
9
+ from b3clf.descriptor_padel import compute_descriptors
10
+ from b3clf.geometry_opt import geometry_optimize
11
+ from b3clf.utils import get_descriptors, scale_descriptors, select_descriptors
12
+
13
+
14
@st.cache_resource()
def load_all_models():
    """Load every pre-trained B3clf model shipped inside the ``b3clf`` package.

    Returns
    -------
    dict
        Maps ``"<classifier>_<sampling>"`` (for example
        ``"xgb_classic_ADASYN"``) to the object that ``joblib.load`` returns
        for the packaged resource
        ``pre_trained/b3clf_<classifier>_<sampling>.joblib``.
    """
    classifiers = ("dtree", "knn", "logreg", "xgb")
    sampling_schemes = (
        "borderline_SMOTE",
        "classic_ADASYN",
        "classic_RandUndersampling",
        "classic_SMOTE",
        "kmeans_SMOTE",
        "common",
    )

    def _load(model_key):
        # Read the serialized model straight from the installed package data.
        resource = f"pre_trained/b3clf_{model_key}.joblib"
        with pkg_resources.resource_stream("b3clf", resource) as stream:
            return joblib.load(stream)

    # Classifier varies slowest, sampling scheme fastest.
    model_keys = (
        f"{clf_name}_{sampling_name}"
        for clf_name, sampling_name in it.product(classifiers, sampling_schemes)
    )
    return {model_key: _load(model_key) for model_key in model_keys}
41
+
42
+
43
@st.cache_resource
def predict_permeability(
    clf_str, sampling_str, _models_dict, mol_features, info_df, threshold="none"
):
    """Add predicted probabilities and labels to ``info_df`` and return it.

    Parameters
    ----------
    clf_str : str
        Classifier name, e.g. ``"xgb"``.
    sampling_str : str
        Resampling strategy name, e.g. ``"classic_ADASYN"``.
    _models_dict : dict
        Fitted models keyed by ``"<clf_str>_<sampling_str>"``.
    mol_features : pandas.DataFrame or array-like
        Feature matrix handed to the model's ``predict_proba``; must expose
        ``shape``.
    info_df : pandas.DataFrame
        Per-molecule information. It is modified IN PLACE: the columns
        ``B3clf_predicted_probability`` and ``B3clf_predicted_label`` are
        added and its index is reset into a column.
    threshold : str, optional
        Column of the packaged ``data/B3clf_thresholds.xlsx`` table that
        supplies the probability cut-off.

    Returns
    -------
    pandas.DataFrame
        The same ``info_df`` object that was passed in, after modification.

    Raises
    ------
    KeyError
        If the requested model is not in ``_models_dict`` or ``threshold`` is
        not a column of the thresholds table.
    ValueError
        If ``mol_features`` is a DataFrame whose index differs from that of
        ``info_df``.
    """
    pred_model = _models_dict[f"{clf_str}_{sampling_str}"]

    # Load the threshold table bundled with the b3clf package.
    with pkg_resources.resource_stream("b3clf", "data/B3clf_thresholds.xlsx") as f:
        df_thres = pd.read_excel(f, index_col=0, engine="openpyxl")

    # Use the cut-off that belongs to the selected model. The row used to be
    # hard-coded to "xgb-classic_ADASYN" for every model; that row is kept as
    # the fallback when the table has no entry for the requested combination.
    threshold_key = f"{clf_str}-{sampling_str}"
    if threshold_key not in df_thres.index:
        threshold_key = "xgb-classic_ADASYN"
    cutoff = df_thres.loc[threshold_key, threshold]

    # Labels start at 0 and are flipped to 1 where the probability reaches
    # the cut-off (the original comment states the default threshold is 0.5;
    # presumably that is the "none" column -- confirm against the table).
    label_pool = np.zeros(mol_features.shape[0], dtype=int)

    if isinstance(mol_features, pd.DataFrame):
        if mol_features.index.tolist() != info_df.index.tolist():
            raise ValueError("Features_df and Info_df do not have the same index.")

    # Probability of the class in column 1 of predict_proba's output.
    info_df.loc[:, "B3clf_predicted_probability"] = pred_model.predict_proba(
        mol_features
    )[:, 1]

    mask = np.greater_equal(
        info_df["B3clf_predicted_probability"].to_numpy(),
        cutoff,
    )
    label_pool[mask] = 1

    # Store the labels and expose the former index as a regular column.
    info_df["B3clf_predicted_label"] = label_pool
    info_df.reset_index(inplace=True)

    return info_df
81
+
82
+
83
@st.cache_resource
def generate_predictions(
    input_fname: str = None,
    # Same value as the former "\s+|\t+" literal, spelled with an escaped
    # backslash so Python no longer warns about the invalid "\s" escape.
    sep: str = "\\s+|\t+",
    clf: str = "xgb",
    _models_dict: dict = None,
    keep_sdf: str = "no",
    sampling: str = "classic_ADASYN",
    time_per_mol: int = 120,
    mol_features: pd.DataFrame = None,
    info_df: pd.DataFrame = None,
):
    """Generate B3clf predictions from an input file or from cached features.

    When ``mol_features`` and ``info_df`` are both supplied they are used
    directly. Otherwise the input file is geometry-optimized, descriptors are
    computed, selected and scaled, and the intermediate SDF file is removed
    again unless ``keep_sdf`` is something other than ``"no"``.

    Parameters
    ----------
    input_fname : str, optional
        Path of the input file; required whenever features must be computed.
    sep : str, optional
        Separator forwarded to ``geometry_optimize``.
    clf : str, optional
        Classifier name used to pick the model.
    _models_dict : dict, optional
        Pre-loaded models keyed by ``"<clf>_<sampling>"``.
    keep_sdf : str, optional
        The intermediate ``<tag>_optimized_3d.sdf`` file is deleted only when
        this equals ``"no"``.
    sampling : str, optional
        Resampling strategy name used to pick the model.
    time_per_mol : int, optional
        Forwarded to ``compute_descriptors`` as ``time_per_molecule``.
    mol_features : pandas.DataFrame, optional
        Previously computed, selected and scaled descriptors.
    info_df : pandas.DataFrame, optional
        Previously computed molecule information matching ``mol_features``.

    Returns
    -------
    tuple
        ``(mol_features, info_df, result_df)``. ``result_df`` holds whichever
        of the columns ``ID``, ``SMILES``, ``B3clf_predicted_probability`` and
        ``B3clf_predicted_label`` are present. ``info_df`` is the object that
        ``predict_permeability`` modified in place.

    Raises
    ------
    TypeError
        If features have to be computed but ``input_fname`` is ``None``.
    """
    # Recompute when either cached piece is missing; a lone mol_features or
    # info_df cannot be used on its own for prediction.
    if mol_features is None or info_df is None:
        if input_fname is None:
            raise TypeError(
                "input_fname is required when mol_features and info_df "
                "are not both provided."
            )

        mol_tag = os.path.basename(input_fname).split(".")[0]
        # NOTE: the name is derived from the input file name only, so two runs
        # on identically named inputs share this path.
        internal_sdf = f"{mol_tag}_optimized_3d.sdf"

        try:
            # Geometry optimization. The input is either an SDF file with
            # molecular geometries or a text file with SMILES strings.
            geometry_optimize(input_fname=input_fname, output_sdf=internal_sdf, sep=sep)

            df_features = compute_descriptors(
                sdf_file=internal_sdf,
                excel_out=None,
                output_csv=None,
                timeout=None,
                time_per_molecule=time_per_mol,
            )

            # Split the computed table into descriptors and molecule info.
            mol_features, info_df = get_descriptors(df=df_features)

            # Keep only the descriptors the models expect.
            mol_features = select_descriptors(df=mol_features)

            # Scale in place so the index and column labels are preserved.
            mol_features.iloc[:, :] = scale_descriptors(df=mol_features)
        finally:
            # Clean up the intermediate file even when a step above failed.
            if keep_sdf == "no" and os.path.exists(internal_sdf):
                os.remove(internal_sdf)

    # Predict probabilities and labels with the selected model.
    # NOTE(review): predict_permeability resets info_df's index in place, so
    # passing a previously returned info_df back in may not satisfy its index
    # check against mol_features -- confirm against the caller.
    result_df = predict_permeability(
        clf_str=clf,
        sampling_str=sampling,
        _models_dict=_models_dict,
        mol_features=mol_features,
        info_df=info_df,
        threshold="none",
    )

    # Restrict the result to the columns shown to the user, in table order.
    display_cols = [
        "ID",
        "SMILES",
        "B3clf_predicted_probability",
        "B3clf_predicted_label",
    ]
    result_df = result_df[[col for col in result_df.columns if col in display_cols]]

    return mol_features, info_df, result_df