Tschoui committed on
Commit
cf004a6
1 Parent(s): 7971ae3

move project from private to public space

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +2 -0
  2. .streamlit/config.toml +19 -0
  3. README.md +108 -8
  4. app.py +65 -0
  5. assets/data_preprocessing_objects/ecdfs.pkl +3 -0
  6. assets/data_preprocessing_objects/scaler_fitted.pkl +3 -0
  7. assets/example_csv/.~lock.known_inactive_molecules.csv# +1 -0
  8. assets/example_csv/known_active_molecules.csv +3 -0
  9. assets/example_csv/known_inactive_molecules.csv +3 -0
  10. assets/example_csv/molecules_for_prediction.csv +3 -0
  11. assets/example_csv/predictions/nottrustworthy_example.csv +3 -0
  12. assets/example_csv/predictions/nottrustworthy_example.png +3 -0
  13. assets/example_csv/predictions/trustworthy_example.csv +3 -0
  14. assets/example_csv/predictions/trustworthy_example.png +3 -0
  15. assets/header.png +3 -0
  16. assets/logo.png +3 -0
  17. assets/mhnfs_data/cfg.yaml +42 -0
  18. assets/mhnfs_data/full_context_set.npy +3 -0
  19. assets/mhnfs_data/mhnfs_checkpoint.ckpt +3 -0
  20. assets/mhnfs_overview.png +3 -0
  21. assets/test_reference_data/ecfps.npy +3 -0
  22. assets/test_reference_data/model_input_query.pt +3 -0
  23. assets/test_reference_data/model_input_support_actives.pt +3 -0
  24. assets/test_reference_data/model_input_support_inactives.pt +3 -0
  25. assets/test_reference_data/model_predictions.pt +3 -0
  26. assets/test_reference_data/preprocessed_features.npy +3 -0
  27. assets/test_reference_data/rdkit_descr_quantils.npy +3 -0
  28. assets/test_reference_data/rdkit_descrs.npy +3 -0
  29. assets/test_reference_data/smiles.pkl +3 -0
  30. requirements.txt +10 -0
  31. src/__init__.py +0 -0
  32. src/__pycache__/__init__.cpython-37.pyc +0 -0
  33. src/__pycache__/prediction_pipeline.cpython-37.pyc +0 -0
  34. src/app/__pycache__/constants.cpython-37.pyc +0 -0
  35. src/app/__pycache__/layout.cpython-37.pyc +0 -0
  36. src/app/__pycache__/prediction_utils.cpython-37.pyc +0 -0
  37. src/app/constants.py +269 -0
  38. src/app/layout.py +439 -0
  39. src/app/prediction_utils.py +33 -0
  40. src/data_preprocessing/__init__.py +0 -0
  41. src/data_preprocessing/__pycache__/__init__.cpython-36.pyc +0 -0
  42. src/data_preprocessing/__pycache__/__init__.cpython-37.pyc +0 -0
  43. src/data_preprocessing/__pycache__/constants.cpython-37.pyc +0 -0
  44. src/data_preprocessing/__pycache__/create_descriptors.cpython-36.pyc +0 -0
  45. src/data_preprocessing/__pycache__/create_descriptors.cpython-37.pyc +0 -0
  46. src/data_preprocessing/__pycache__/create_model_inputs.cpython-37.pyc +0 -0
  47. src/data_preprocessing/__pycache__/utils.cpython-37.pyc +0 -0
  48. src/data_preprocessing/constants.py +11 -0
  49. src/data_preprocessing/create_descriptors.py +148 -0
  50. src/data_preprocessing/create_model_inputs.py +46 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.csv filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
.streamlit/config.toml ADDED
@@ -0,0 +1,19 @@
+ [theme]
+ base="light"
+
+ # Primary accent for interactive elements
+ primaryColor = '#0078aa'
+
+ # Background color for the main content area
+ # backgroundColor = '#273346'
+
+ # Background color for sidebar and most interactive widgets
+ # secondaryBackgroundColor = '#7d828c'
+
+ # Color used for almost all text
+ # textColor = '#4bc9ff'
+
+ # Font family for all text in the app, except code blocks
+ # Accepted values (serif | sans serif | monospace)
+ # Default: "sans serif"
+ # font = "sans serif"
README.md CHANGED
@@ -1,13 +1,113 @@
  ---
- title: Mhnfs
- emoji: 🚀
- colorFrom: yellow
- colorTo: purple
  sdk: streamlit
- sdk_version: 1.32.2
  app_file: app.py
- pinned: false
- license: gpl-3.0
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
  ---
+ title: MHNfs
+ emoji: 🔬
+ short_description: Activity prediction for low-data scenarios
+ colorFrom: gray
+ colorTo: gray
  sdk: streamlit
+ sdk_version: 1.29.0
  app_file: app.py
+ pinned: true
  ---

+ # Activity Predictions with MHNfs for low-data scenarios
+
+ ## ⚙️ Under the hood
+ <div style="text-align: justify">
+ The predictive model (MHNfs) used in this application was specifically designed and
+ trained for low-data scenarios. The model predicts whether a molecule is active or
+ inactive. The predicted activity value is a continuous value between 0 and 1, and,
+ similar to a probability, the higher/lower the value, the more confident the model
+ is that the molecule is active/inactive.<br>
+ <br>
+ The model was trained on the FS-Mol dataset, which
+ includes 5120 tasks (roughly 5000 tasks were used for training, the rest for evaluation).
+ The training tasks are listed here:
+ <a href="https://github.com/microsoft/FS-Mol/tree/main/datasets/targets"
+ target="_blank">https://github.com/microsoft/FS-Mol/tree/main/datasets/targets</a>.
+ </div>
+
+ ## 🎯 About few-shot learning and the model MHNfs
+ <div style="text-align: justify">
+ <b>Few-shot learning</b> is a machine learning sub-field which aims to provide
+ predictive models for scenarios in which only very little data is available.<br>
+ <br>
+ <b>MHNfs</b> is a few-shot learning model which is specifically designed for drug
+ discovery applications. It is built to use the input prompt in such a way that
+ the provided knowledge, i.e. the known active and inactive molecules,
+ functions as context to predict the activity of the newly requested molecules.
+ Precisely, the provided active and inactive molecules are associated with a
+ large set of general molecules - called context molecules - to enrich the
+ provided information and to remove spurious correlations arising from the
+ decoration of molecules. This is analogous to a Large Language Model which would
+ not only use the information provided in the current prompt as context but would
+ also have access to far more information, e.g., a prompting history.
+ </div>
+
+ ## 💻 Run the prediction pipeline locally for larger screening chunks
+
+ ### Get started:
+ ```bash
+ # Copied from Hugging Face
+ # Make sure you have git-lfs installed (https://git-lfs.com)
+ git lfs install
+
+ # Clone the repo
+ git clone https://huggingface.co/spaces/tschouis/mhnfs
+
+ # Alternatively, if you want to clone without large files
+ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/spaces/tschouis/mhnfs
+ ```
+
+ ### Install requirements
+ ```bash
+ pip install -r requirements.txt
+ ```
+ Notably, this command was tested inside a conda environment with Python 3.7.
+
+ ### Run the prediction pipeline:
+ For your screening, load the model, i.e. the **Activity Predictor**, into your Python file or notebook and simply run it:
+ ```python
+ from src.prediction_pipeline import ActivityPredictor
+
+ # Load the model
+ predictor = ActivityPredictor()
+
+ # Define inputs
+ query_smiles = ["C1CCCCC1", "C1CCCCC1", "C1CCCCC1", "C1CCCCC1"] # Replace with your data
+ support_actives_smiles = ["C1CCCCC1", "C1CCCCC1"] # Replace with your data
+ support_inactives_smiles = ["C1CCCCC1", "C1CCCCC1"] # Replace with your data
+
+ # Make predictions
+ predictions = predictor.predict(query_smiles, support_actives_smiles, support_inactives_smiles)
+ ```
+
+ * Provide molecules in SMILES notation.
+ * Make sure that the inputs to the Activity Predictor are either comma-separated lists, flattened numpy arrays, or pandas DataFrames. In the latter case, there should be a "smiles" column (both upper- and lower-case "SMILES" are accepted); all other columns are ignored. A short DataFrame sketch follows below.
+
+
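The DataFrame input path mentioned above can be sketched as follows. This is an illustrative, hedged example rather than part of the commit: it assumes the example files under `assets/example_csv/` each contain a `smiles` column, as described in the app's Additional Information tab.

```python
import pandas as pd

from src.prediction_pipeline import ActivityPredictor

# Example CSVs shipped with this Space (assumed to contain a "smiles" column)
query_df = pd.read_csv("assets/example_csv/molecules_for_prediction.csv")
actives_df = pd.read_csv("assets/example_csv/known_active_molecules.csv")
inactives_df = pd.read_csv("assets/example_csv/known_inactive_molecules.csv")

# DataFrames with a "smiles"/"SMILES" column are accepted directly;
# all other columns are ignored during preprocessing.
predictor = ActivityPredictor()
predictions = predictor.predict(query_df, actives_df, inactives_df)
print(predictions)
```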
+ ### Run the app locally with streamlit:
+ ```bash
+ # Navigate into the root directory of this project
+ cd .../whatever_your_dir_name_is/ # Replace with your path
+
+ # Run the streamlit app
+ python -m streamlit run app.py
+ ```
+
+
+ ## 🤗 Hugging Face app
+ Explore our Hugging Face app here: https://huggingface.co/spaces/tschouis/mhnfs
+
+ ## 📚 Cite us
+
+ ```
+ @inproceedings{
+ schimunek2023contextenriched,
+ title={Context-enriched molecule representations improve few-shot drug discovery},
+ author={Johannes Schimunek and Philipp Seidl and Lukas Friedrich and Daniel Kuhn and Friedrich Rippmann and Sepp Hochreiter and Günter Klambauer},
+ booktitle={The Eleventh International Conference on Learning Representations},
+ year={2023},
+ url={https://openreview.net/forum?id=XrMWUuEevr}
+ }
+ ```
+
app.py ADDED
@@ -0,0 +1,65 @@
+ """
+ This script runs the streamlit app for MHNfs
+
+ MHNfs: Few-shot method for drug discovery activity predictions
+ (https://openreview.net/pdf?id=XrMWUuEevr)
+ """
+
+ # --------------------------------------------------------------------------------------
+ # Imports
+ import streamlit as st
+
+ from src.app.layout import LayoutMaker
+ from src.app.prediction_utils import (create_prediction_df,
+                                       create_molecule_grid_plot)
+ from src.prediction_pipeline import ActivityPredictor
+
+ # --------------------------------------------------------------------------------------
+ # Functions
+ class App():
+     def __init__(self):
+         # Set page configuration to wide
+         st.set_page_config(layout="wide", page_title="MHNfs", page_icon="🔬")
+
+         # Layout maker
+         self.layoutMaker = LayoutMaker()
+
+         # Load mhnfs model
+         self.predictor = ActivityPredictor()
+
+     def define_layout(self):
+
+         # Define sidebar width
+         css = '''
+         <style>
+         [data-testid="stSidebar"]{
+             min-width: 500px;
+             max-width: 500px;
+         }
+         </style>
+         '''
+         st.markdown(css, unsafe_allow_html=True)
+
+         # Sidebar
+         self.inputs, self.buttons = self.layoutMaker.make_sidebar()
+
+         # Main page
+         # - header
+         self.layoutMaker.make_header()
+
+         # - main body
+         self.layoutMaker.make_main_content_area(self.predictor,
+                                                 self.inputs,
+                                                 self.buttons,
+                                                 create_prediction_df,
+                                                 create_molecule_grid_plot)
+
+ def run_app():
+     app = App()
+     app.define_layout()
+
+
+ # --------------------------------------------------------------------------------------
+ # Run script
+ if __name__ == "__main__":
+     run_app()
assets/data_preprocessing_objects/ecdfs.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eeec12688fd9e0bb0bbd68d5203e2fb46c45d30a07417f0883adbfc133d48e9f
3
+ size 520417347
assets/data_preprocessing_objects/scaler_fitted.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4538c1c1d9b5b50d29a14c14134f66a563c3a0f4022ce77b8eb2959c3eff51ea
3
+ size 54501
assets/example_csv/.~lock.known_inactive_molecules.csv# ADDED
@@ -0,0 +1 @@
 
 
1
+ ,johannes,Latitude-5501,02.01.2024 15:57,file:///home/johannes/.config/libreoffice/4;
assets/example_csv/known_active_molecules.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc98c05246b42d84c6833d191efa32c7c6473d76c5f2719c8ff3310cfe22df04
3
+ size 353
assets/example_csv/known_inactive_molecules.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6e183c33b7445ae0c00bea4a7cdae52bfce14da2829f6827e20dda162df23af
3
+ size 363
assets/example_csv/molecules_for_prediction.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:497adfdbd026c7ab7d1564b685a246fcb7eb6eabb2442918862b31ccd0b32369
3
+ size 460
assets/example_csv/predictions/nottrustworthy_example.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3f8b5e017175b8d62982b1fc4138a4348f51b6a0469c32df991f5d2576a679d
3
+ size 588
assets/example_csv/predictions/nottrustworthy_example.png ADDED

Git LFS Details

  • SHA256: ae7aff2e2cd2e68bdcb4a5563be38c13d7780453443657b36f01333ab57a949c
  • Pointer size: 130 Bytes
  • Size of remote file: 25.5 kB
assets/example_csv/predictions/trustworthy_example.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3517bcef4a9998975b031d1b4f2b4aa29679669079100230f84e27bc06f80c02
3
+ size 889
assets/example_csv/predictions/trustworthy_example.png ADDED

Git LFS Details

  • SHA256: df2a73cdf527546e8b078cb45618b4554a77f11fdd48367ef25939e0a6a2b518
  • Pointer size: 130 Bytes
  • Size of remote file: 28.3 kB
assets/header.png ADDED

Git LFS Details

  • SHA256: 1d355c5fc158281371a09759584110e611c810d2442e8aad30551998aa728f0a
  • Pointer size: 131 Bytes
  • Size of remote file: 123 kB
assets/logo.png ADDED

Git LFS Details

  • SHA256: 505cc795dcaac622e2af6bf2ed118d7ab28d3eab27fd421755844c042ed7646a
  • Pointer size: 130 Bytes
  • Size of remote file: 40.9 kB
assets/mhnfs_data/cfg.yaml ADDED
@@ -0,0 +1,42 @@
1
+ model:
2
+ encoder:
3
+ activation: selu
4
+ input_dim: 2248
5
+ number_hidden_layers: 0
6
+ number_hidden_neurons: 1024
7
+ regularization:
8
+ input_dropout: 0.1
9
+ dropout: 0.5
10
+ layerNormBlock:
11
+ affine: False
12
+ usage: True
13
+ transformer:
14
+ activity_embedding_dim: 64
15
+ number_heads: 8
16
+ dim_forward: 567
17
+ dropout: 0.5
18
+ num_layers: 1
19
+ ss_dropout: 0.1
20
+ hopfield:
21
+ dim_QK: 512
22
+ heads: 8
23
+ beta: 0.044194173824159216
24
+ dropout: 0.5
25
+ prediction_scaling: 0.044194173824159216
26
+ associationSpace_dim: 1024
27
+ similarityModule:
28
+ type: cosineSim
29
+ l2Norm: False
30
+ scaling: 1/N
31
+ training:
32
+ optimizer: AdamW
33
+ batch_size: 512
34
+ lr: 0.0001
35
+ weightDecay: 0.0
36
+ lrScheduler:
37
+ usage: True
38
+ context:
39
+ ratio_training_molecules: 0.05
40
+ system:
41
+ ressources:
42
+ device: cpu
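A minimal, hedged sketch of loading this configuration with omegaconf (pinned in requirements.txt). The key nesting is flattened in the diff view above, so the printed YAML is the authoritative structure; the `cfg.model.encoder.input_dim` access assumes the encoder block sits under the top-level `model` key.

```python
from omegaconf import OmegaConf

# Load the model/training configuration shipped with the MHNfs checkpoint
cfg = OmegaConf.load("assets/mhnfs_data/cfg.yaml")

# Print the resolved structure; indentation is not visible in the diff above
print(OmegaConf.to_yaml(cfg))

# Assuming the encoder block sits under the top-level "model" key
print(cfg.model.encoder.input_dim)  # 2248
```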
assets/mhnfs_data/full_context_set.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ed40b8d9cc39859772af0d32ed69c7f2467b9235f83f37ff42611bc22828e52
3
+ size 3899416896
assets/mhnfs_data/mhnfs_checkpoint.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25fcfdb7c6355b7781edaefc9ec56351f012356b17e4087f72b0a78c6d8e2300
3
+ size 313588174
assets/mhnfs_overview.png ADDED

Git LFS Details

  • SHA256: f89731eaf842e6018b4153d60193ea57442fb5933774135a653d4b70ac48afe2
  • Pointer size: 131 Bytes
  • Size of remote file: 467 kB
assets/test_reference_data/ecfps.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:056a628c308cf69e647f2c86090f8f93c2aedcd719845f57f11e653ce6d9d70b
3
+ size 24704
assets/test_reference_data/model_input_query.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e889558eb3300355b5c6ea0ce1518bb949141238b8d26b257ec1bd496baeda18
3
+ size 36715
assets/test_reference_data/model_input_support_actives.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5e55816e09597d267fb91297a56f58a4f4420ed32340650be4c1dd37efe1656
3
+ size 72683
assets/test_reference_data/model_input_support_inactives.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e62e8b18da47d1c9475c18bc2ad50a563f10f0d0bced247d848e453321a13ced
3
+ size 72683
assets/test_reference_data/model_predictions.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7e63ad2ad9b664e3301479427f8d5cf005c979d7cc9e4bce033f18640eb4df0
3
+ size 747
assets/test_reference_data/preprocessed_features.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e97dc7eb85509c6b07156292b57a1bee4eaa8d60fbdb40c7e2e5738c8c6a460
3
+ size 54080
assets/test_reference_data/rdkit_descr_quantils.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cde4d2fd8658cdbcd55e75f14cb360cfa1b239f99d281c1f7296449636e94c6a
3
+ size 4928
assets/test_reference_data/rdkit_descrs.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1b06153004b3f2ac02f0cefd16b0f17225527bbf53f8efe6e43c035b3d21690
3
+ size 2528
assets/test_reference_data/smiles.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0168a7aaa6f7f3eca611a42d70782bae9eb970194449320d37b64f5a8c264f9
3
+ size 179
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ rdkit==2022.3.3
+ pytorch-lightning==1.6.1
+ torch==1.13.1
+ numpy==1.21.5
+ pandas==1.3.5
+ omegaconf==2.1.2
+ mols2grid==1.1.1
+ scikit-learn
+ statsmodels==0.13.5
+ streamlit
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (154 Bytes).
 
src/__pycache__/prediction_pipeline.cpython-37.pyc ADDED
Binary file (2.73 kB).
 
src/app/__pycache__/constants.cpython-37.pyc ADDED
Binary file (13.1 kB).
 
src/app/__pycache__/layout.cpython-37.pyc ADDED
Binary file (13.3 kB).
 
src/app/__pycache__/prediction_utils.cpython-37.pyc ADDED
Binary file (1.05 kB).
 
src/app/constants.py ADDED
@@ -0,0 +1,269 @@
1
+ """
2
+ This file includes all the constant content shown in the app
3
+ """
4
+
5
+ # --------------------------------------------------------------------------------------
6
+
7
+ summary_text = ('''
8
+ This application allows you to make **activity predictions** for
9
+ **biological targets** for which you have only a **little knowledge** in
10
+ terms of known active and inactive molecules.
11
+
12
+ **Provide** via the sidebar:\n
13
+ - some active molecules,
14
+ - some inactive molecules, and
15
+ - molecules you want to predict.
16
+
17
+ Hit **Predict** and explore the predictions!
18
+
19
+ For more **information** about the **model** and **how to provide the
20
+ molecules**, please visit the **Additional Information** tab.
21
+ ''')
22
+
23
+ mhnfs_text =('''
24
+ <div style="text-align: justify">
25
+ <b>MHNfs</b> is a few-shot drug discovery model which consists of a <b>context
26
+ module</b> , a <b>cross-attention module</b> , and a <b>similarity module</b>
27
+ as described here: <a href="https://openreview.net/pdf?id=XrMWUuEevr"
28
+ target="_blank">https://openreview.net/pdf?id=XrMWUuEevr</a>.
29
+ </div>
30
+ <br>
31
+
32
+ <div style="text-align: justify">
33
+ <b>Abstract</b>. A central task in computational drug discovery is to construct
34
+ models from known active molecules to find further promising molecules for
35
+ subsequent screening. However, typically only very few active molecules are
36
+ known. Therefore, few-shot learning methods have the potential to improve the
37
+ effectiveness of this critical phase of the drug discovery process. We introduce
38
+ a new method for few-shot drug discovery. Its main idea is to enrich a molecule
39
+ representation by knowledge about known context or reference molecules. Our
40
+ novel concept for molecule representation enrichment is to associate molecules
41
+ from both the support set and the query set with a large set of reference
42
+ (context) molecules through a modern Hopfield network. Intuitively, this
43
+ enrichment step is analogous to a human expert who would associate a given
44
+ molecule with familiar molecules whose properties are known. The enrichment step
45
+ reinforces and amplifies the covariance structure of the data, while
46
+ simultaneously removing spurious correlations arising from the decoration of
47
+ molecules. Our approach is compared with other few-shot methods for drug
48
+ discovery on the FS-Mol benchmark dataset. On FS-Mol, our approach outperforms
49
+ all compared methods and therefore sets a new state-of-the art for few-shot
50
+ learning in drug discovery. An ablation study shows that the enrichment step of
51
+ our method is the key to improve the predictive quality. In a domain shift
52
+ experiment, we further demonstrate the robustness of our method. Code is
53
+ available at <a href="https://github.com/ml-jku/MHNfs"
54
+ target="_blank">https://github.com/ml-jku/MHNfs</a>.
55
+ </div>
56
+ <br>
57
+ <br>
58
+ ''')
59
+
60
+ citation_text = '''
61
+ ###
62
+ @inproceedings{
63
+ schimunek2023contextenriched,
64
+ title={Context-enriched molecule representations improve few-shot drug discovery},
65
+ author={Johannes Schimunek and Philipp Seidl and Lukas Friedrich and Daniel Kuhn and Friedrich Rippmann and Sepp Hochreiter and Günter
66
+ Klambauer},
67
+ booktitle={The Eleventh International Conference on Learning Representations},
68
+ year={2023},
69
+ url={https://openreview.net/forum?id=XrMWUuEevr}
70
+ }
71
+ '''
72
+
73
+ few_shot_learning_text = (
74
+ '''
75
+ <div style="text-align: justify">
76
+ <b>Few-shot learning</b> is a machine learning sub-field which aims to provide
77
+ predictive models for scenarios in which only little data is known/available.<br>
78
+ <br>
79
+
80
+ <b>MHNfs</b> is a few-shot learning model which is specifically designed for drug
81
+ discovery applications. It is built to use the input prompts in a way such that
82
+ the provided available knowledge, i.e. the known active and inactive molecules,
83
+ functions as context to predict the activity of the new requested molecules.
84
+ Precisely, the provided active and inactive molecules are associated with a
85
+ large set of general molecules - called context molecules - to enrich the
86
+ provided information and to remove spurious correlations arising from the
87
+ decoration of molecules. This is analogous to a Large Language Model which would
88
+ not only use the provided information in the current prompt as context but would
89
+ also have access to way more information, e.g., a prompting history.
90
+ </div>
91
+ ''')
92
+
93
+ under_the_hood_text = ('''
94
+ <div style="text-align: justify">
95
+ The predictive model (MHNfs) used in this application was specifically designed and
96
+ trained for low-data scenarios. The model predicts whether a molecule is active or
97
+ inactive. The predicted activity value is a continuous value between 0 and 1, and,
98
+ similar to a probability, the higher/lower the value, the more confident the model
99
+ is that the molecule is active/inactive.
100
+
101
+ The model was trained on the FS-Mol dataset which
102
+ includes 5120 tasks (roughly 5000 tasks were used for training, rest for evaluation).
103
+ The training tasks are listed here:
104
+ <a href="https://github.com/microsoft/FS-Mol/tree/main/datasets/targets"
105
+ target="_blank">https://github.com/microsoft/FS-Mol/tree/main/datasets/targets</a>.
106
+ </div>
107
+ ''')
108
+
109
+ usage_text = ('''
110
+ <div style="text-align: justify">
111
+ To use this application, you need to provide <b>3 different sets of molecules</b>:
112
+ <ol>
113
+ <li><b>active</b> molecules: set of known active molecules,</li>
114
+ <li><b>inactive</b> molecules: set of known inactive molecules, and</li>
115
+ <li>molecules to <b>predict</b>: set of molecules you want to predict.</li>
116
+ </ol>
117
+ These three sets can be provided via the <b>sidebar</b>. The sidebar also includes two
118
+ buttons <b>predict</b> and <b>reset</b> to run the prediction pipeline and to
119
+ reset it.
120
+ </div>
121
+ ''')
122
+
123
+ data_text = ('''
124
+ <div style="text-align: justify">
125
+ <ul>
126
+ <li> Molecules have to be provided in SMILES format</li>
127
+ <li> For each input, the maximum number of molecules which can be provided is
128
+ restricted to 20 </li>
129
+ <li> You can provide the molecules via the text boxes or via CSV upload
130
+ <ul>
131
+ <li> Text box
132
+ <ul>
133
+ <li> Replace the pseudo input by directly typing your molecules
134
+ into
135
+ the text box </li>
136
+ <li> Separate the molecules by comma </li>
137
+ </ul>
138
+ </li>
139
+ <li> CSV upload
140
+ <ul>
141
+ <li> The CSV file should include a "smiles" column (both upper
142
+ and lower case "SMILES" are accepted) </li>
143
+ <li> All other columns will be ignored </li>
144
+ <li> Examples are provided here:
145
+ <div style="background-color: #efefef">
146
+ assets/example_csv/ </li>
147
+ </div>
148
+ </ul>
149
+ </li>
150
+ </ul>
151
+ </li>
152
+ </ul>
153
+ </div>
154
+ ''')
155
+
156
+ trust_text = ('''
157
+ <div style="text-align: justify">
158
+ Just like all other machine learning models, the performance of MHNfs varies
159
+ and, generally, the model works well if the task is somehow close to tasks which
160
+ were used to train the model. The model performance for very different tasks is
161
+ unclear and might be poor.<br>
162
+ <br>
163
+
164
+ MHNfs was trained on the FS-Mol dataset which includes 5120 tasks (roughly
165
+ 5000 tasks were used for training, rest for evaluation). The training tasks are
166
+ listed here: <a href= https://github.com/microsoft/FS-Mol/tree/main/datasets/targets
167
+ target="_blank">https://github.com/microsoft/FS-Mol/tree/main/datasets/targets</a>.
168
+ </div>
169
+ ''')
170
+
171
+ example_trustworthy_text = ('''
172
+ <div style="text-align: justify">
173
+ Since the predictive model has seen a lot of kinase-related tasks during training,
174
+ the model is expected to generally perform well on kinase targets. For this example,
175
+ we use data for the target
176
+ <a href=https://www.ebi.ac.uk/chembl/target_report_card/CHEMBL5914/
177
+ target="_blank">CHEMBL5914</a>. Notably, this specific kinase has not been seen
178
+ during training. Precisely, we use the available inhibition data while molecules
179
+ with an inhibition value greater (smaller) than 50 % are considered as active
180
+ (inactive).<br>
181
+
182
+ From the known available data, we have selected 4 "known" active molecules,
183
+ 8 "known" inactive molecules, and 11 molecules to predict.<br>
184
+
185
+ <b>Molecules to predict</b>:
186
+ <div style="background-color: #efefef">
187
+ FC(F)(F)c1ccc(Cl)cc1CN1CCNc2ncc(-c3ccnc(N4CCNCC4)c3)cc21,<br>
188
+ CS(=O)(=O)c1ccc(-n2nc(-c3cnc4[nH]ccc4c3)c3c(N)ncnc32)cc1,<br>
189
+ O=C(Nc1ccccc1Cl)c1cnc2ccc(C3CCNCC3)cn12.O=C(O)C(=O)O,<br>
190
+ CC(C)n1cnc2c(Nc3cccc(Cl)c3)nc(N[C@@H]3CCCC[C@@H]3N)nc21,<br>
191
+ Nc1ncc(-c2ccc(NS(=O)(=O)C3CC3)cc2F)cc1-c1ccc2c(c1)CCNC2=O,<br>
192
+ CCN1CCN(Cc2ccc(NC(=O)c3ccc(C)c(C#Cc4cccnc4)c3)cc2C(F)(F)F)CC1,<br>
193
+ CN1CCN(c2ccc(-c3cnc4c(c3)N(Cc3cc(Cl)ccc3C(F)(F)F)CCN4)cn2)CC1,<br>
194
+ CC(C)n1nc(-c2cnc(N)c(OC(F)(F)F)c2)cc1[C@H]1[C@@H]2CN(C3COC3)C[C@@H]21,<br>
195
+ Nc1ncc(-c2cc([C@H]3[C@@H]4CN(C5COC5)C[C@@H]43)n(CC3CC3)n2)cc1C(F)(F)F,<br>
196
+ Cc1ccc(NC(=O)C2(C(=O)Nc3ccc(Nc4ncc(F)c(-c5cc(F)c6nc(C)n(C(C)C)c6c5)n4)cc3)CC2)cc1,<br>
197
+ C[C@@H](Oc1cc(-c2cnn(C3CCNCC3)c2)cnc1N)c1c(Cl)ccc(F)c1Cl
198
+ </div><br>
199
+
200
+ <b>Known active molecules</b>:
201
+ <div style="background-color: #efefef">
202
+ CC(=O)N1CCN(c2cc(-c3cnc4c(c3)N(Cc3cc(Cl)ccc3C(F)(F)F)CCN4)ccn2)CC1,<br>
203
+ CS(=O)(=O)c1cccc(Nc2nccc(-c3sc(N4CCOCC4)nc3-c3cccc(NS(=O)(=O)c4c(F)cccc4F)c3)n2)c1,<br>
204
+ COc1cnccc1Nc1nc(-c2nn(Cc3c(F)cc(OCCO)cc3F)c3ccccc23)ncc1OC,<br>
205
+ CN(C)[C@@H]1CC[C@@]2(C)[C@@H](CC[C@@H]3[C@@H]2CC[C@]2(C)C(c4cccc5cnccc45)=CC[C@@H]32)C1<br>
206
+ </div><br>
207
+
208
+ <b>Known inactive molecules</b>:
209
+ <div style="background-color: #efefef">
210
+ c1cc(-c2c[nH]c3cnccc23)ccn1,<br>
211
+ COc1ccc2c3ccnc(C(F)(F)F)c3n(CCCCN)c2c1,<br>
212
+ CNS(=O)(=O)c1ccc(N(C)C)c(Nc2ncnc3cc(OC)c(OC)cc23)c1,<br>
213
+ CN(C1CC1)S(=O)(=O)c1ccc(-c2cnc(N)c(-c3ccc4c(c3)CCNC4=O)c2)c(F)c1,<br>
214
+ CCN1CCN(Cc2ccc(NC(=O)c3ccc(C)c(C#Cc4cnc5[nH]ccc5c4)c3)cc2C(F)(F)F)CC1,<br>
215
+ CC(C)n1cc(-c2cc(-c3ccc(CN4CCOCC4)cc3)cnc2N)nn1,<br>
216
+ CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c(N3CCOCC3)cc2C1=O,<br>
217
+ [2H]C([2H])([2H])C1(C([2H])([2H])[2H])Cn2nc(-c3ccc(F)cn3)c(-c3ccnc4[nH]ncc34)c2CO1<br>
218
+ </div><br>
219
+
220
+ <b>Predictions</b>:<br>
221
+
222
+ </div>
223
+ ''')
224
+
225
+ example_nottrustworthy_text = ('''
226
+ <div style="text-align: justify">
227
+ For this example, we use data for the auxiliary transport protein target
228
+ <a href=https://www.ebi.ac.uk/chembl/target_report_card/CHEMBL5738/
229
+ target="_blank">CHEMBL5738</a>. Precisely, we use the available Ki data
230
+ while molecules with a pCHEMBL value greater (smaller) than 5 are considered
231
+ as active (inactive).<br>
232
+
233
+ From the known available data, we have selected 4 "known" active molecules,
234
+ 3 "known" inactive molecules, and 10 molecules to predict.<br>
235
+
236
+ <b>Molecules to predict</b>:
237
+ <div style="background-color: #efefef">
238
+ CC(C(=O)O)c1ccc(-c2ccccc2)c(F)c1,<br>
239
+ O=S(=O)(O)Oc1cccc2cccc(Nc3ccccc3)c12,<br>
240
+ CCCCCCCC/C=C\CCCCCCCC(=O)O,<br>
241
+ C[C@]12C=CC(=O)C=C1CC[C@@H]1[C@@H]2[C@@H](O)C[C@@]2(C)[C@H]1CC[C@]2(O)C(=O)CO,<br>
242
+ CCOC(=O)C(C)(C)Oc1ccc(Cl)cc1,<br>
243
+ Cc1ccc(Cl)c(Nc2ccccc2C(=O)O)c1Cl,<br>
244
+ O=C(O)Cc1ccccc1Nc1c(Cl)cccc1Cl,<br>
245
+ CC(C)(Oc1ccc(CCNC(=O)c2ccc(Cl)cc2)cc1)C(=O)O,<br>
246
+ O=C(c1ccccc1)c1ccc2n1CCC2C(=O)O,<br>
247
+ CC(C)OC(=O)C(C)(C)Oc1ccc(C(=O)c2ccc(Cl)cc2)cc1<br>
248
+ </div><br>
249
+
250
+ <b>Known active molecules</b>:
251
+ <div style="background-color: #efefef">
252
+ CC(C(=O)O)c1ccc(N2Cc3ccccc3C2=O)cc1,<br>
253
+ CN1C(=O)CN=C(c2ccccc2)c2cc(Cl)ccc21,<br>
254
+ CC(C)(Oc1ccc(C(=O)c2ccc(Cl)cc2)cc1)C(=O)O,<br>
255
+ CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(C)[C@H]3CC[C@]12C
256
+
257
+ </div><br>
258
+
259
+ <b>Known inactive molecules</b>:
260
+ <div style="background-color: #efefef">
261
+ CC(C)Cc1ccc(C(C)C(=O)O)cc1,<br>
262
+ O=C1Nc2ccc(Cl)cc2C(c2ccccc2Cl)=NC1O,<br>
263
+ C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C@@]3(F)[C@@H](O)C[C@]2(C)[C@@]1(O)C(=O)CO
264
+ </div><br>
265
+
266
+ <b>Predictions</b>:<br>
267
+
268
+ </div>
269
+ ''')
src/app/layout.py ADDED
@@ -0,0 +1,439 @@
1
+ """
2
+ This file defines the layout of the app including the header, sidebar, and tabs in the
3
+ main content area.
4
+ """
5
+
6
+ #---------------------------------------------------------------------------------------
7
+ # Imports
8
+ import streamlit as st
9
+ import streamlit.components.v1 as components
10
+ from PIL import Image
11
+ import pandas as pd
12
+ import yaml
13
+
14
+ from src.data_preprocessing.create_descriptors import handle_inputs
15
+ from src.app.constants import (summary_text,
16
+ mhnfs_text,
17
+ citation_text,
18
+ few_shot_learning_text,
19
+ under_the_hood_text,
20
+ usage_text,
21
+ data_text,
22
+ trust_text,
23
+ example_trustworthy_text,
24
+ example_nottrustworthy_text)
25
+ #---------------------------------------------------------------------------------------
26
+ # Global variables
27
+ MAX_INPUT_LENGTH = 20
28
+
29
+ #---------------------------------------------------------------------------------------
30
+ # Functions
31
+
32
+ class LayoutMaker():
33
+ """
34
+ This class includes all the design choices regarding the layout of the app. This
35
+ class can be used in the main file to define header, sidebar, and main content area.
36
+ """
37
+
38
+ def __init__(self):
39
+
40
+ # Initialize the inputs dictionary
41
+ self.inputs = dict() # this will be the storage for query and support set inputs
42
+ self.inputs_lists = dict()
43
+
44
+ # Initialize prediction storage
45
+ self.predictions = None
46
+
47
+ # Buttons
48
+ self.buttons = dict() # this will be the storage for buttons
49
+
50
+ # content
51
+ self.summary_text = summary_text
52
+ self.mhnfs_text = mhnfs_text
53
+ self.citation_text = citation_text
54
+ self.few_shot_learning_text = few_shot_learning_text
55
+ self.under_the_hood_text = under_the_hood_text
56
+ self.usage_text = usage_text
57
+ self.data_text = data_text
58
+ self.trust_text = trust_text
59
+ self.example_trustworthy_text = example_trustworthy_text
60
+ self.example_nottrustworthy_text = example_nottrustworthy_text
61
+
62
+ self.df_trustworthy = pd.read_csv("./assets/example_csv/predictions/"
63
+ "trustworthy_example.csv")
64
+ self.df_nottrustworthy = pd.read_csv("./assets/example_csv/predictions/"
65
+ "nottrustworthy_example.csv")
66
+
67
+ self.max_input_length = MAX_INPUT_LENGTH
68
+
69
+ def make_sidebar(self):
70
+ """
71
+ This function defines the sidebar of the app. It includes the logo, query box,
72
+ support set boxes, and predict buttons.
73
+ It returns the stored inputs (for query and support set) and the buttons which
74
+ allow for user interactions.
75
+ """
76
+ with st.sidebar:
77
+ # Logo
78
+ logo = Image.open("./assets/logo.png")
79
+ st.image(logo)
80
+ st.divider()
81
+
82
+ # Query box
83
+ self._make_query_box()
84
+ st.divider()
85
+
86
+ # Support set actives box
87
+ self._make_active_support_set_box()
88
+ st.divider()
89
+
90
+ # Support set inactives box
91
+ self._make_inactive_support_set_box()
92
+ st.divider()
93
+
94
+ # Predict buttons
95
+ self.buttons["predict"] = st.button("Predict...")
96
+ self.buttons["reset"] = st.button("Reset")
97
+
98
+ return self.inputs, self.buttons
99
+
100
+ def make_header(self):
101
+ """
102
+ This function defines the header of the app. It consists only of a png image
103
+ in which the title and an overview is given.
104
+ """
105
+
106
+ header_container = st.container()
107
+ with header_container:
108
+ header = Image.open("./assets/header.png")
109
+ st.image(header)
110
+
111
+ def make_main_content_area(self,
112
+ predictor,
113
+ inputs,
114
+ buttons,
115
+ create_prediction_df: callable,
116
+ create_molecule_grid_plot: callable):
117
+
118
+
119
+ tab1, tab2, tab3, tab4 = st.tabs(["Predictions",
120
+ "Paper / Cite",
121
+ "Additional Information",
122
+ "Examples"])
123
+
124
+ # Results tab
125
+ with tab1:
126
+ self._fill_tab_with_results_content(predictor,
127
+ inputs,
128
+ buttons,
129
+ create_prediction_df,
130
+ create_molecule_grid_plot)
131
+
132
+ # Paper tab
133
+ with tab2:
134
+ self._fill_paper_and_citation_tab()
135
+
136
+ # More explanations tab
137
+ with tab3:
138
+ self._fill_more_explanations_tab()
139
+
140
+ with tab4:
141
+ self._fill_examples_tab()
142
+
143
+ def _make_query_box(self):
144
+ """
145
+ This function
146
+ a) defines the query box and
147
+ b) stores the query input in the inputs dictionary
148
+ """
149
+
150
+ st.info(":blue[Molecules to predict:]", icon="❓")
151
+
152
+ query_container = st.container()
153
+ with query_container:
154
+ input_choice = st.radio(
155
+ "Input your data in SMILES notation via:", ["Text box", "CSV upload"]
156
+ )
157
+ if input_choice == "Text box":
158
+ query_input = st.text_area(
159
+ label="SMILES input for query molecules",
160
+ label_visibility="hidden",
161
+ key="query_textbox",
162
+ value="CC(C)Sc1nc(C(C)(C)C)nc(OCC(=O)O)c1C#N, "
163
+ "Cc1nc(NCc2cccnc2)cc(=O)n1CC(=O)O",
164
+ )
165
+ elif input_choice == "CSV upload":
166
+ query_file = st.file_uploader(key="query_csv",
167
+ label = "CSV upload for query mols",
168
+ label_visibility="hidden")
169
+ if query_file is not None:
170
+ query_input = pd.read_csv(query_file)
171
+ else: query_input = None
172
+
173
+ # Update storage
174
+ self.inputs["query"] = query_input
175
+
176
+ def _make_active_support_set_box(self):
177
+ """
178
+ This function
179
+ a) defines the active support set box and
180
+ b) stores the active support set input in the inputs dictionary
181
+ """
182
+
183
+ st.info(":blue[Known active molecules:]", icon="✨")
184
+ active_container = st.container()
185
+ with active_container:
186
+ active_input_choice = st.radio(
187
+ "Input your data in SMILES notation via:",
188
+ ["Text box", "CSV upload"],
189
+ key="active_input_choice",
190
+ )
191
+
192
+ if active_input_choice == "Text box":
193
+ support_active_input = st.text_area(
194
+ label="SMILES input for active support set molecules",
195
+ label_visibility="hidden",
196
+ key="active_textbox",
197
+ value="Cc1nc(NCC2CCCCC2)c(C#N)c(=O)n1CC(=O)O, "
198
+ "CSc1nc(C(C)C)nc(OCC(=O)O)c1C#N"
199
+ )
200
+ elif active_input_choice == "CSV upload":
201
+ support_active_file = st.file_uploader(
202
+ key="support_active_csv",
203
+ label = "CSV upload for active support set molecules",
204
+ label_visibility="hidden"
205
+ )
206
+ if support_active_file is not None:
207
+ support_active_input = pd.read_csv(support_active_file)
208
+ else: support_active_input = None
209
+
210
+ # Update storage
211
+ self.inputs["support_active"] = support_active_input
212
+
213
+ def _make_inactive_support_set_box(self):
214
+ st.info(":blue[Known inactive molecules:]", icon="✨")
215
+ inactive_container = st.container()
216
+ with inactive_container:
217
+ inactive_input_choice = st.radio(
218
+ "Input your data in SMILES notation via:",
219
+ ["Text box", "CSV upload"],
220
+ key="inactive_input_choice",
221
+ )
222
+ if inactive_input_choice == "Text box":
223
+ support_inactive_input = st.text_area(
224
+ label="SMILES input for inactive support set molecules",
225
+ label_visibility="hidden",
226
+ key="inactive_textbox",
227
+ value="CSc1nc(C)nc(OCC(=O)O)c1C#N, "
228
+ "CSc1nc(C)n(CC(=O)O)c(=O)c1C#N"
229
+ )
230
+ elif inactive_input_choice == "CSV upload":
231
+ support_inactive_file = st.file_uploader(
232
+ key="support_inactive_csv",
233
+ label = "CSV upload for inactive support set molecules",
234
+ label_visibility="hidden"
235
+ )
236
+ if support_inactive_file is not None:
237
+ support_inactive_input = pd.read_csv(
238
+ support_inactive_file
239
+ )
240
+ else: support_inactive_input = None
241
+
242
+ # Update storage
243
+ self.inputs["support_inactive"] = support_inactive_input
244
+
245
+ def _fill_tab_with_results_content(self, predictor, inputs, buttons,
246
+ create_prediction_df, create_molecule_grid_plot):
247
+ tab_container = st.container()
248
+ with tab_container:
249
+ # Info
250
+ st.info(":blue[Summary:]", icon="🚀")
251
+ st.markdown(self.summary_text)
252
+
253
+ # Results
254
+ st.info(":blue[Results:]",icon="👨‍💻")
255
+
256
+ if buttons['predict']:
257
+
258
+ # Check 1: Are all inputs provided?
259
+ if (inputs['query'] is None or
260
+ inputs['support_active'] is None or
261
+ inputs['support_inactive'] is None):
262
+ st.error("You didn't provide all necessary inputs.\n\n"
263
+ "Please provide all three necessary inputs via the "
264
+ "sidebar and hit the predict button again.")
265
+ else:
266
+ # Check 2: Less than max allowed molecules provided?
267
+ max_input_length = 0
268
+ for key, input in inputs.items():
269
+ input_list = handle_inputs(input)
270
+ self.inputs_lists[key] = input_list
271
+ max_input_length = max(max_input_length, len(input_list))
272
+
273
+ if max_input_length > self.max_input_length:
274
+ st.error("You provided too many molecules. The number of "
275
+ "molecules for each input is restricted to "
276
+ f"{self.max_input_length}.\n\n"
277
+ "For larger screenings, we suggest to clone the repo "
278
+ "and to run the model locally.")
279
+ else:
280
+ # Progress bar
281
+ progress_bar_text = ("I'm predicting activities. This might "
282
+ "need some minutes. Please wait...")
283
+ progress_bar = st.progress(50, text=progress_bar_text)
284
+
285
+ # Results table
286
+ df = self._predict_and_create_results_table(predictor,
287
+ inputs,
288
+ create_prediction_df)
289
+
290
+ progress_bar_text = ("Done. Here are the results:")
291
+ progress_bar = progress_bar.progress(100, text=progress_bar_text)
292
+ st.dataframe(df, use_container_width=True)
293
+
294
+ col1, col2, col3, col4 = st.columns([1,1,1,1])
295
+ # Provide download button for predictions
296
+ with col2:
297
+ self.buttons["download_results"] = st.download_button(
298
+ "Download predictions as CSV",
299
+ self._convert_df_to_binary(df),
300
+ file_name="predictions.csv",
301
+ )
302
+
303
+ # Provide download button for inputs
304
+ with col3:
305
+ with open("inputs.yml", 'w') as fl:
306
+ self.buttons["download_inputs"] = st.download_button(
307
+ "Download inputs as YML",
308
+ self._convert_to_yml(self.inputs_lists),
309
+ file_name="inputs.yml",
310
+ )
311
+ st.divider()
312
+
313
+ # Results grid
314
+ st.info(":blue[Grid plot of the predicted molecules:]",
315
+ icon="📊")
316
+ mol_html_grid = create_molecule_grid_plot(df)
317
+ components.html(mol_html_grid, height=1000, scrolling=True)
318
+
319
+ elif buttons['reset']:
320
+ self._reset()
321
+
322
+ def _fill_paper_and_citation_tab(self):
323
+ st.info(":blue[**Paper: Context-enriched molecule representations improve "
324
+ "few-shot drug discovery**]", icon="📄")
325
+ st.markdown(self.mhnfs_text, unsafe_allow_html=True)
326
+ st.image("./assets/mhnfs_overview.png")
327
+ st.write("")
328
+ st.write("")
329
+ st.write("")
330
+ st.info(":blue[**Cite us / BibTex**]", icon="📚")
331
+ st.markdown(self.citation_text)
332
+
333
+ def _fill_more_explanations_tab(self):
334
+ st.info(":blue[**Under the hood**]", icon="⚙️")
335
+ st.markdown(self.under_the_hood_text, unsafe_allow_html=True)
336
+ st.write("")
337
+ st.write("")
338
+
339
+ st.info(":blue[**About few-shot learning and the model MHNfs**]", icon="🎯")
340
+ st.markdown(self.few_shot_learning_text, unsafe_allow_html=True)
341
+ st.write("")
342
+ st.write("")
343
+
344
+ st.info(":blue[**Usage**]", icon="🎛️")
345
+ st.markdown(self.usage_text, unsafe_allow_html=True)
346
+ st.write("")
347
+ st.write("")
348
+
349
+ st.info(":blue[**How to provide the data**]", icon="📀")
350
+ st.markdown(self.data_text, unsafe_allow_html=True)
351
+ st.write("")
352
+ st.write("")
353
+
354
+ st.info(":blue[**When to trust the predictions**]", icon="🔍")
355
+ st.markdown(self.trust_text, unsafe_allow_html=True)
356
+
357
+ def _fill_examples_tab(self):
358
+ st.info(":blue[**Example for trustworthy predictions**]", icon="✅")
359
+ st.markdown(self.example_trustworthy_text, unsafe_allow_html=True)
360
+ st.dataframe(self.df_trustworthy, use_container_width=True)
361
+ st.markdown("**Plot: Predictions for active and inactive molecules (model AUC="
362
+ "0.96**)")
363
+ prediction_plot_tw = Image.open("./assets/example_csv/predictions/"
364
+ "trustworthy_example.png")
365
+ st.image(prediction_plot_tw)
366
+ st.write("")
367
+ st.write("")
368
+
369
+ st.info(":blue[**Example for not trustworthy predictions**]", icon="⛔️")
370
+ st.markdown(self.example_nottrustworthy_text, unsafe_allow_html=True)
371
+ st.dataframe(self.df_nottrustworthy, use_container_width=True)
372
+ st.markdown("**Plot: Predictions for active and inactive molecules (model AUC="
373
+ "0.42**)")
374
+ prediction_plot_ntw = Image.open("./assets/example_csv/predictions/"
375
+ "nottrustworthy_example.png")
376
+ st.image(prediction_plot_ntw)
377
+
378
+ def _predict_and_create_results_table(self,
379
+ predictor,
380
+ inputs,
381
+ create_prediction_df: callable):
382
+
383
+ df = create_prediction_df(predictor,
384
+ inputs['query'],
385
+ inputs['support_active'],
386
+ inputs['support_inactive'])
387
+ return df
388
+
389
+ def _reset(self):
390
+ keys = list(st.session_state.keys())
391
+ for key in keys:
392
+ st.session_state.pop(key)
393
+
394
+ def _convert_df_to_binary(_self, df):
395
+ return df.to_csv(index=False).encode('utf-8')
396
+
397
+ def _convert_to_yml(_self, inputs):
398
+ return yaml.dump(inputs)
399
+ content = """
400
+ # Usage
401
+ As soon as you have a few active and inactive molecules for your task, you can
402
+ provide them here and make predictions for new molecules.
403
+
404
+ ## About few-shot learning and the model MHNfs
405
+ **Few-shot learning** is a machine learning sub-field which aims to provide
406
+ predictive models for scenarios in which only little data is known/available.
407
+
408
+ **MHNfs** is a few-shot learning model which is specifically designed for drug
409
+ discovery applications. It is built to use the input prompts in a way such that
410
+ the provided available knowledge - i.e. the known active and inactive molecules -
411
+ functions as context to predict the activity of the new requested molecules.
412
+ Precisely, the provided active and inactive molecules are associated with a
413
+ large set of general molecules - called context molecules - to enrich the
414
+ provided information and to remove spurious correlations arising from the
415
+ decoration of molecules. This is analogous to a Large Language Model which would
416
+ not only use the provided information in the current prompt as context but would
417
+ also have access to way more information, e.g. a prompting history.
418
+
419
+ ## How to provide the data
420
+ * Molecules have to be provided in SMILES format.
421
+ * You can provide the molecules via the text boxes or via CSV upload.
422
+ - Text box: Replace the pseudo input by directly typing your molecules into
423
+ the text box. Please separate the molecules by comma.
424
+ - CSV upload: Upload a CSV file with the molecules.
425
+ * The CSV file should include a smiles column (both upper and lower
426
+ case "SMILES" are accepted).
427
+ * All other columns will be ignored.
428
+
429
+ ## When to trust the predictions
430
+ Just like all other machine learning models, the performance of MHNfs varies
431
+ and, generally, the model works well if the task is somehow close to tasks which
432
+ were used to train the model. The model performance for very different tasks is
433
+ unclear and might be poor.
434
+
435
+ MHNfs was trained on the FS-Mol dataset which includes 5120 tasks (roughly
436
+ 5000 tasks were used for training, rest for evaluation). The training tasks are
437
+ listed here: https://github.com/microsoft/FS-Mol/tree/main/datasets/targets.
438
+ """
439
+ return content
src/app/prediction_utils.py ADDED
@@ -0,0 +1,33 @@
1
+ """
2
+ This module includes all functions which are called from the main app and are needed to
3
+ make activity predictions and to output the results.
4
+ """
5
+
6
+ #---------------------------------------------------------------------------------------
7
+ # Dependencies
8
+ import pandas as pd
9
+ import mols2grid
10
+ #---------------------------------------------------------------------------------------
11
+ # Define functions
12
+
13
+ def create_prediction_df(predictor, query_smiles, support_actives_smiles,
14
+ support_inactives_smiles):
15
+ """
16
+ This function creates a dataframe with the query molecules and the corresponding
17
+ predictions.
18
+ """
19
+ # Make predictions
20
+ predictions = predictor.predict(query_smiles, support_actives_smiles,
21
+ support_inactives_smiles)
22
+
23
+ smiles = predictor._return_query_mols_as_list()
24
+
25
+ # Create dataframe
26
+ prediction_df = pd.DataFrame({"Molecule": smiles,
27
+ "Predicted activity": predictions.astype('str')})
28
+
29
+ return prediction_df
30
+
31
+ def create_molecule_grid_plot(df, smiles_col="Molecule"):
32
+ mol_html_grid = mols2grid.display(df,smiles_col=smiles_col)._repr_html_()
33
+ return mol_html_grid
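A hedged usage sketch of these two helpers outside the Streamlit app; the SMILES below are the app's default text-box examples, and `ActivityPredictor` lives in `src/prediction_pipeline.py`, which is outside this 50-file view.

```python
from src.prediction_pipeline import ActivityPredictor
from src.app.prediction_utils import create_prediction_df, create_molecule_grid_plot

predictor = ActivityPredictor()

# Default example SMILES from the app's text boxes; replace with your own molecules
query = ["CC(C)Sc1nc(C(C)(C)C)nc(OCC(=O)O)c1C#N"]
actives = ["Cc1nc(NCC2CCCCC2)c(C#N)c(=O)n1CC(=O)O"]
inactives = ["CSc1nc(C)nc(OCC(=O)O)c1C#N"]

# One row per query molecule with its predicted activity
df = create_prediction_df(predictor, query, actives, inactives)

# HTML grid of the query molecules, renderable via streamlit.components.v1.html
html_grid = create_molecule_grid_plot(df)
```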
src/data_preprocessing/__init__.py ADDED
File without changes
src/data_preprocessing/__pycache__/__init__.cpython-36.pyc ADDED
Binary file (167 Bytes).
 
src/data_preprocessing/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (173 Bytes).
 
src/data_preprocessing/__pycache__/constants.cpython-37.pyc ADDED
Binary file (1.61 kB).
 
src/data_preprocessing/__pycache__/create_descriptors.cpython-36.pyc ADDED
Binary file (2.39 kB).
 
src/data_preprocessing/__pycache__/create_descriptors.cpython-37.pyc ADDED
Binary file (4.19 kB).
 
src/data_preprocessing/__pycache__/create_model_inputs.cpython-37.pyc ADDED
Binary file (1.29 kB).
 
src/data_preprocessing/__pycache__/utils.cpython-37.pyc ADDED
Binary file (7.49 kB).
 
src/data_preprocessing/constants.py ADDED
@@ -0,0 +1,11 @@
1
+ USED_200_DESCR = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,25,26,27,28,29,30, 31,32,33,
2
+ 34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,
3
+ 57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
4
+ 80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,
5
+ 102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,
6
+ 119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,
7
+ 136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,
8
+ 153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,
9
+ 170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,
10
+ 187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,
11
+ 204,205,206,207]
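A short, hedged sketch of how these indices are meant to be used, mirroring create_descriptors.py below, which selects these 200 positions from RDKit's full descriptor list; the exact descriptor count depends on the pinned RDKit version.

```python
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors

from src.data_preprocessing.constants import USED_200_DESCR

mol = Chem.MolFromSmiles("C1CCCCC1")  # placeholder molecule

# Compute all RDKit descriptors, then keep the 200 indices used by MHNfs
all_descrs = np.array([fn(mol) for _, fn in Descriptors._descList])
selected = all_descrs[USED_200_DESCR]
print(selected.shape)  # (200,)
```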
src/data_preprocessing/create_descriptors.py ADDED
@@ -0,0 +1,148 @@
1
+ """
2
+ This file includes all necessary code to preprocess molecules (assumed to be in SMILES
3
+ format) and create descriptors which can be fed into MHNfs.
4
+ """
5
+
6
+ #---------------------------------------------------------------------------------------
7
+ # Dependencies
8
+ import numpy as np
9
+ import pandas as pd
10
+ import pickle
11
+ from typing import List, Union
12
+ from rdkit import Chem, DataStructs
13
+ from rdkit.Chem.rdchem import Mol
14
+ from rdkit.Chem import Descriptors, rdFingerprintGenerator
15
+
16
+ from src.data_preprocessing.constants import USED_200_DESCR
17
+ from src.data_preprocessing.utils import Standardizer
18
+
19
+ #---------------------------------------------------------------------------------------
20
+ # Define main function
21
+
22
+ def preprocess_molecules(input_molecules: Union[str, List[str], pd.DataFrame]):
23
+ """
24
+ This function preprocesses molecules (assumed to be in SMILES format) and creates
25
+ descriptors which can be fed into MHNfs.
26
+ """
27
+
28
+ # Load needed objects
29
+ current_loc = __file__.rsplit("/",3)[0]
30
+ with open(current_loc + "/assets/data_preprocessing_objects/scaler_fitted.pkl",
31
+ "rb") as fl:
32
+ scaler = pickle.load(fl)
33
+
34
+ with open(current_loc + "/assets/data_preprocessing_objects/ecdfs.pkl", "rb") as fl:
35
+ ecdfs = pickle.load(fl)
36
+
37
+ # Ensure that input_molecules is an Iterable with strs
38
+ input_smiles = handle_inputs(input_molecules)
39
+
40
+ # Create cleaned RDKit mol objects
41
+ input_molecules = create_cleaned_mol_objects(input_smiles)
42
+
43
+ # Create fingerprints and descriptors
44
+ ecfps = create_ecfp_fps(input_molecules)
45
+ rdkit_descrs = create_rdkit_descriptors(input_molecules)
46
+
47
+ # Create quantils
48
+ rdkit_descr_quantils = create_quantils(rdkit_descrs, ecdfs)
49
+
50
+ # Concatenate features
51
+ raw_features = np.concatenate((ecfps, rdkit_descr_quantils), axis=1)
52
+
53
+ # Normalize feature vectors
54
+ normalized_features = scaler.transform(raw_features)
55
+
56
+ # Return feature vectors
57
+ return normalized_features
58
+
59
+ #---------------------------------------------------------------------------------------
60
+ # Define helper functions
61
+ def handle_inputs(input_molecules: Union[str, List[str], pd.DataFrame]):
62
+ """
63
+ This function handles the input molecules.
64
+ """
65
+
66
+ if isinstance(input_molecules, list):
67
+ return input_molecules
68
+
69
+ elif isinstance(input_molecules, pd.DataFrame):
70
+ input_molecules.columns = [c.lower() for c in input_molecules.columns]
71
+ if "smiles" not in input_molecules.columns:
72
+ raise ValueError(("Input DataFrame must have a column named 'Smiles'."))
73
+ iterable = list(input_molecules["smiles"].values)
74
+ return iterable
75
+
76
+ elif isinstance(input_molecules, str):
77
+ smiles_list = input_molecules.split(",")
78
+ smiles_list_cleaned = [smiles.strip() for smiles in smiles_list]
79
+
80
+ smiles_list_cleaned = [smiles for smiles in smiles_list_cleaned if smiles != ""]
81
+ return smiles_list_cleaned
82
+ else:
83
+ raise TypeError(("Input molecules must be a string,a list of strings or a "
84
+ "pandas DataFrame."))
85
+
86
+ def create_ecfp_fps(mols: List[Mol]) -> np.ndarray:
87
+ """
88
+ This function creates ECFP fingerprints for a list of molecules.
89
+ """
90
+ ecfps = list()
91
+
92
+ for mol in mols:
93
+ fp_sparse_vec = rdFingerprintGenerator.GetCountFPs(
94
+ [mol], fpType=rdFingerprintGenerator.MorganFP
95
+ )[0]
96
+ fp = np.zeros((0,), np.int8)
97
+ DataStructs.ConvertToNumpyArray(fp_sparse_vec, fp)
98
+
99
+ ecfps.append(fp)
100
+
101
+ return np.array(ecfps)
102
+
103
+ def create_rdkit_descriptors(mols: List[Mol]) -> np.ndarray:
104
+ """
105
+ This function creates RDKit descriptors for a list of molecules.
106
+ """
107
+ rdkit_descriptors = list()
108
+
109
+ for mol in mols:
110
+ descrs = []
111
+ for _, descr_calc_fn in Descriptors._descList:
112
+ descrs.append(descr_calc_fn(mol))
113
+
114
+ descrs = np.array(descrs)
115
+ descrs = descrs[USED_200_DESCR]
116
+ rdkit_descriptors.append(descrs)
117
+
118
+ return np.array(rdkit_descriptors)
119
+
120
+ def create_quantils(raw_features: np.ndarray, ecdfs: list) -> np.ndarray:
121
+
122
+ quantils = np.zeros_like(raw_features)
123
+
124
+ for column in range(raw_features.shape[1]):
125
+ raw_values = raw_features[:, column].reshape(-1)
126
+ ecdf = ecdfs[column]
127
+ q = ecdf(raw_values)
128
+ quantils[:, column] = q
129
+
130
+ return quantils
131
+
132
+ def create_cleaned_mol_objects(smiles: List[str]) -> List[Mol]:
133
+ """
134
+ This function creates cleaned RDKit mol objects from a list of SMILES.
135
+ """
136
+ sm = Standardizer(canon_taut=True)
137
+
138
+ mols = list()
139
+ for smile in smiles:
140
+ #try:
141
+ mol = Chem.MolFromSmiles(smile)
142
+ standardized_mol, _ = sm.standardize_mol(mol)
143
+ can_mol = Chem.MolFromSmiles(Chem.MolToSmiles(standardized_mol))
144
+ mols.append(can_mol)
145
+ return mols
146
+
147
+ #---------------------------------------------------------------------------------------
148
+
src/data_preprocessing/create_model_inputs.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ In this file, the input functions for query and support set molecules are defined.
3
+ Input is assumed to be either a SMILES string, a list of SMILES strings, or a pandas
4
+ dataframe.
5
+ """
6
+
7
+ #---------------------------------------------------------------------------------------
8
+ # Dependencies
9
+ import pandas as pd
10
+ from typing import List, Union
11
+ import torch
12
+
13
+ from src.data_preprocessing.create_descriptors import preprocess_molecules
14
+
15
+ #---------------------------------------------------------------------------------------
16
+ # Define main functions
17
+ def create_query_input(smiles_input: Union[str, List[str], pd.DataFrame]):
18
+ """
19
+ This function creates the input for the query molecules.
20
+ """
21
+
22
+ # Create vector representation
23
+ numpy_vector_representation = preprocess_molecules(smiles_input)
24
+ assert len(numpy_vector_representation.shape) == 2
25
+
26
+ # Create pytorch tensor
27
+ tensor = torch.from_numpy(numpy_vector_representation).unsqueeze(1).float()
28
+
29
+ return tensor
30
+
31
+ def create_support_set_input(smiles_input: Union[str, List[str], pd.DataFrame]):
32
+ """
33
+ This function creates the input for the support set molecules.
34
+ """
35
+
36
+ # Create vector representation
37
+ numpy_vector_representation = preprocess_molecules(smiles_input)
38
+ assert len(numpy_vector_representation.shape) == 2
39
+
40
+ size = numpy_vector_representation.shape[0]
41
+
42
+ # Create pytorch tensors
43
+ tensor = torch.from_numpy(numpy_vector_representation).unsqueeze(0).float()
44
+ size = torch.tensor(size)
45
+
46
+ return tensor, size
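A hedged end-to-end sketch of the shapes these helpers produce; they follow from the `unsqueeze` calls above, and the feature dimension of 2248 matches `input_dim` in assets/mhnfs_data/cfg.yaml. Running this locally requires the LFS preprocessing assets (scaler and ECDF pickles) to be present.

```python
from src.data_preprocessing.create_model_inputs import (create_query_input,
                                                        create_support_set_input)

# Placeholder SMILES strings; replace with your own molecules
query = ["C1CCCCC1", "c1ccccc1"]
actives = ["CC(=O)Oc1ccccc1C(=O)O"]

query_tensor = create_query_input(query)
support_tensor, support_size = create_support_set_input(actives)

# Query molecules get a singleton set axis, support sets a singleton batch axis
print(query_tensor.shape)    # torch.Size([2, 1, 2248])
print(support_tensor.shape)  # torch.Size([1, 1, 2248])
print(support_size)          # tensor(1)
```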