jannisborn committed on
Commit
480220c
0 Parent(s):

Duplicate from GT4SD/molecular_properties

Browse files
Files changed (10) hide show
  1. .gitattributes +34 -0
  2. .gitignore +1 -0
  3. LICENSE +21 -0
  4. README.md +16 -0
  5. app.py +99 -0
  6. model_cards/article.md +68 -0
  7. model_cards/description.md +7 -0
  8. model_cards/examples.smi +13 -0
  9. requirements.txt +29 -0
  10. utils.py +57 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__/
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Generative Toolkit 4 Scientific Discovery
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Molecular properties
3
+ emoji: 💡
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 3.9.1
8
+ app_file: app.py
9
+ pinned: false
10
+ python_version: 3.8.13
11
+ pypi_version: 20.2.4
12
+ duplicated_from: GT4SD/molecular_properties
13
+ ---
14
+
15
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
16
+
app.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import pathlib
3
+
4
+ import gradio as gr
5
+ import numpy as np
6
+ import pandas as pd
7
+ from gt4sd.properties.molecules import MOLECULE_PROPERTY_PREDICTOR_FACTORY
8
+
9
+ from utils import draw_grid_predict
10
+
11
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())

# Keys of MOLECULE_PROPERTY_PREDICTOR_FACTORY that are filtered out of the
# dropdown when the app is launched (see the script section at the bottom).
REMOVE = [
    "docking",
    "docking_tdc",
    "molecule_one",
    "askcos",
    "plogp",
    "similarity_seed",
    "activity_against_target",
    "organtox",
]

# Comma-separated output labels for the properties that yield several values
# per molecule. Keys are the capitalized property names offered in the
# dropdown; `main` splits each value on "," to obtain the column names.
MODEL_PROP_DESCRIPTION = {
    "Tox21": "NR-AR, NR-AR-LBD, NR-AhR, NR-Aromatase, NR-ER, NR-ER-LBD, NR-PPAR-gamma, SR-ARE, SR-ATAD5, SR-HSE, SR-MMP, SR-p53",
    "Sider": "Hepatobiliary disorders,Metabolism and nutrition disorders,Product issues,Eye disorders,Investigations,Musculoskeletal disorders,Gastrointestinal disorders,Social circumstances,Immune system disorders,Reproductive system and breast disorders,Benign & malignant,General disorders,Endocrine disorders,Surgical & medical procedures,Vascular disorders,Blood & lymphatic disorders,Skin & subcutaneous disorders,Congenital & genetic disorders,Infections,Respiratory & thoracic disorders,Psychiatric disorders,Renal & urinary disorders,Pregnancy conditions,Ear disorders,Cardiac disorders,Nervous system disorders,Injury & procedural complications",
    "Clintox": "FDA approval, Clinical trial failure",
}
22
+
23
+
24
def main(property: str, smiles: str, smiles_file: str):
    """
    Predict a molecular property for one SMILES or for a file of SMILES.

    Args:
        property: Name of the property as shown in the dropdown. Its
            lower-cased form is the key into MOLECULE_PROPERTY_PREDICTOR_FACTORY.
        smiles: A single SMILES string, or None / blank if a file is used.
        smiles_file: Uploaded file object (its ``.name`` attribute is read as
            a tab-separated file without header; the first column holds the
            SMILES), or None if a single SMILES is used.

    Returns:
        Whatever ``draw_grid_predict`` returns for the predictions (HTML).

    Raises:
        ValueError: If both or none of ``smiles`` and ``smiles_file`` are given.
    """
    # A blank textbox value is treated like a missing one; otherwise a file
    # upload with an empty textbox would be rejected as "both given".
    # NOTE(review): assumes the UI may deliver "" instead of None — confirm.
    if smiles is not None and not smiles.strip():
        smiles = None

    # Validate before building the model so bad input fails fast.
    if smiles is not None and smiles_file is not None:
        raise ValueError("Pass either smiles or smiles_file, not both.")
    if smiles is None and smiles_file is None:
        raise ValueError("Pass either smiles or smiles_file.")

    has_multiple_outputs = property in MODEL_PROP_DESCRIPTION

    algo, config = MOLECULE_PROPERTY_PREDICTOR_FACTORY[property.lower()]
    # Properties listed in MODEL_PROP_DESCRIPTION are configured with an
    # explicit algorithm version; all others use the default configuration.
    kwargs = {"algorithm_version": "v0"} if has_multiple_outputs else {}
    model = algo(config(**kwargs))

    if smiles is not None:
        smiles = [smiles.strip()]
    else:
        smiles = pd.read_csv(smiles_file.name, header=None, sep="\t")[0].tolist()
    props = np.array(list(map(model, smiles))).round(2)

    # Expand to 2D array if needed
    if props.ndim == 1:
        props = np.expand_dims(props, -1)

    if has_multiple_outputs:
        # Some label lists use ", " as separator; strip so that the column
        # names do not carry a leading space.
        property_names = [
            name.strip() for name in MODEL_PROP_DESCRIPTION[property].split(",")
        ]
    else:
        property_names = [property]

    return draw_grid_predict(
        smiles, props, property_names=property_names, domain="Molecules"
    )
51
+
52
+
53
if __name__ == "__main__":

    # Preparation: list every registered property (in reversed registry
    # order), drop the ones in REMOVE and capitalize the rest for display.
    # Filtering (instead of popping by index) tolerates REMOVE entries that
    # are absent from the installed factory.
    hidden = set(REMOVE)
    properties = [
        name.capitalize()
        for name in list(MOLECULE_PROPERTY_PREDICTOR_FACTORY.keys())[::-1]
        if name not in hidden
    ]

    # Load metadata
    metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")

    # Each example row is [property, single SMILES, SMILES file].
    examples = [
        ["Qed", None, metadata_root.joinpath("examples.smi")],
        [
            "Esol",
            "CN1CCN(CCCOc2ccc(N3C(=O)C(=Cc4ccc(Oc5ccc([N+](=O)[O-])cc5)cc4)SC3=S)cc2)CC1",
            None,
        ],
    ]

    article = metadata_root.joinpath("article.md").read_text(encoding="utf-8")
    description = metadata_root.joinpath("description.md").read_text(
        encoding="utf-8"
    )

    demo = gr.Interface(
        fn=main,
        title="Molecular properties",
        inputs=[
            # The default must match the capitalized entries in `properties`.
            gr.Dropdown(properties, label="Property", value="Qed"),
            gr.Textbox(
                label="Single SMILES",
                placeholder="CC(C#C)N(C)C(=O)NC1=CC=C(Cl)C=C1",
                lines=1,
            ),
            gr.File(
                file_types=[".smi"],
                label="Multiple SMILES (tab-separated, `.smi` file)",
            ),
        ],
        outputs=gr.HTML(label="Output"),
        article=article,
        description=description,
        examples=examples,
    )
    demo.launch(debug=True, show_error=True)
model_cards/article.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Supported molecular properties
2
+
3
+
4
+ ### ClinTox
5
+ A [ToxSmi model](https://github.com/PaccMann/toxsmi) trained on the [ClinTox](https://moleculenet.org/datasets-1) dataset, which has two endpoints: probability of FDA approval and probability of failure in clinical trials. When using this model, please cite *Born et al. (2023)* (citation below).
6
+
7
+ ### SIDER
8
+ A [ToxSmi model](https://github.com/PaccMann/toxsmi) trained on the [SIDER](https://moleculenet.org/datasets-1) dataset for 27 different types of side effects of drugs. When using this model, please cite *Born et al. (2023)* (citation below).
9
+
10
+ ### Tox21
11
+ A [ToxSmi model](https://github.com/PaccMann/toxsmi) trained on the [Tox21](https://tripod.nih.gov/tox/) dataset with 12 different types of environmental toxicities. When using this model, please cite *Born et al. (2023)* (citation below).
12
+
13
+ ### SCScore
14
+ Predict the synthetic complexity score (SCScore) as presented in [Coley et al. (*J. Chem. Inf. Model.*; 2018)](https://pubs.acs.org/doi/full/10.1021/acs.jcim.7b00622).
15
+
16
+ ### SAS
17
+ Estimate the synthetic accessibility score (SAS) as presented in [Ertl et al. (*Journal of Cheminformatics*; 2009)](https://jcheminf.biomedcentral.com/articles/10.1186/1758-2946-1-8).
18
+
19
+ ### Lipinski
20
+ Measure whether a molecule conforms to the Lipinski-rule-of-five as presented in [Lipinski et al. (*Advanced Drug Delivery Reviews*; 2001)](https://www.sciencedirect.com/science/article/abs/pii/S0169409X00001290?via%3Dihub).
21
+
22
+ ### Penalized logP
23
+ Measure the penalized logP (partition coefficient) score as presented in [Gomez-Bombarelli et al. (*ACS Central Science*; 2018)](https://arxiv.org/abs/1610.02415v1). This is the logP minus the number of rings with > 6 atoms minus the SAS.
24
+
25
+ ### QED
26
+ Measure the drug-likeness as presented in [Bickerton et al. (*Nature Chemistry*; 2012)](https://www.nature.com/articles/nchem.1243).
27
+
28
+ ### LogP
29
+ Measure the logP (partition coefficient) of a molecule as presented in [Wildman et al. (*J. Chem. Inf. Comput. Sci.*; 1999)](https://pubs.acs.org/doi/full/10.1021/ci990307l).
30
+
31
+ ### Bertz
32
+ Calculate the first general index of molecular complexity as presented in [Bertz (*Journal of the American Chemical Society*; 1981)](https://pubs.acs.org/doi/pdf/10.1021/ja00402a071).
33
+
34
+ ### TPSA
35
+ Calculate the total polar surface area of a molecule as presented in [Ertl et al. (*Journal of Medicinal Chemistry*; 2000)](https://pubs.acs.org/doi/full/10.1021/jm000942e).
36
+
37
+ ### Is-Scaffold
38
+ Whether the molecule is identical to its [Murcko scaffold](https://rdkit.org/docs/source/rdkit.Chem.Scaffolds.MurckoScaffold.html).
39
+
40
+ ### Number-Of-X
41
+ Calculated with [RDKit](https://www.rdkit.org/docs/source/rdkit.Chem.rdchem.html).
42
+
43
+ ### Molecular Weight
44
+ Calculated with [RDKit](https://www.rdkit.org/docs/source/rdkit.Chem.rdchem.html).
45
+
46
+
47
+ ### ToxSmi citation
48
+ ```bib
49
+ @article{born2023chemical,
50
+ title={Chemical representation learning for toxicity prediction},
51
+ author={Born, Jannis and Markert, Greta and Janakarajan, Nikita and Kimber, Talia B. and Volkamer, Andrea and Rodriguez Martinez, Maria and Manica, Matteo},
52
+ journal={Under review at Digital Discovery},
53
+ year={2023}
54
+ }
55
+ ```
56
+
57
+
58
+ ### Unsupported properties
59
+ The following molecular properties are available via the GT4SD API but not in this UI:
60
+ - [MoleculeOne](https://tdcommons.ai/functions/oracles/#moleculeone) endpoint for retrosynthesis
61
+ - [ASKCOS](https://tdcommons.ai/functions/oracles/#askcos) endpoint for retrosynthesis
62
+ - [TDC-Docking](https://tdcommons.ai/functions/oracles/#docking-scores) endpoint for docking against a user-provided target
63
+ - [TDC-Docking](https://tdcommons.ai/functions/oracles/#docking-scores) endpoint for docking against *3pbl*.
64
+ - [Protein-ligand binding](https://tdcommons.ai/functions/oracles/#dopamine-receptor-d2-drd2) against one of the targets *drd2*, *gsk3b*, *jnk3*, *fpscores*, *cyp3a4_veith*, *drd2_current*, *gsk3b_current* or *jnk3_current*.
65
+ - [Tanimoto similarity](https://tdcommons.ai/functions/oracles/#similaritydissimilarity) to a seed molecule.
66
+
67
+
68
+ Moreover, GT4SD also includes properties on other entities such as [proteins](https://gt4sd.github.io/gt4sd-core/api/gt4sd.properties.proteins.html) and [crystals](https://gt4sd.github.io/gt4sd-core/api/gt4sd.properties.crystals.html).
model_cards/description.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+
2
+
3
+ <img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
4
+
5
+ ### Molecular property prediction
6
+
7
+ This is the GT4SD web-app for prediction of various molecular properties. For **examples** and **documentation** of the supported properties, please see below. Please note that this API does not expose **all** properties that are supported in GT4SD (a list of the non-supported ones can be found at the bottom).
model_cards/examples.smi ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Cc1cc2c(c3oc(CCCC#N)cc13)C(=O)c1c(O)cccc1C2=O
2
+ C=CCN1C(=O)C(=NNC(=S)NC2OC(COC(C)=O)C(OC(C)=O)C(OC(C)=O)C2OC(C)=O)c2ccccc21
3
+ O=C1C(=Cc2ccc(F)cc2)CCOc2c1ccc1ccccc21
4
+ CC(C)CNc1cc(NCC(C)C)nc(NCC(C)C)n1
5
+ CN1CCN(CCCOc2ccc(N3C(=O)C(=Cc4ccc(Oc5ccc([N+](=O)[O-])cc5)cc4)SC3=S)cc2)CC1
6
+ COc1ccc2ccccc2c1C1CC1NC(C)=O
7
+ Cc1ccc(-n2c(=O)[nH]cc(C(=O)Nc3ccc4c(c3)OCCO4)c2=O)cc1
8
+ Cc1ccc(NCc2nnc(SCC(=O)NCCc3ccccc3)n2C)cc1
9
+ CCCNC(=O)c1ccc2c(c1)N=C(C)c1c(C)ccc(C)c1S2
10
+ COc1ccc(Cn2ccn(CC(=O)Nc3cc(C)ccc3C)c(=O)c2=O)cc1
11
+ Cn1nccc1C(=O)NN=Cc1c(O)ccc2ccccc12
12
+ CCOC(=O)Nc1cc(N)c2c(n1)NC(C)C(c1ccccc1)=N2
13
+ Cn1nc(N)c2ncc(C(Cl)(Cl)Cl)nc21
requirements.txt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -f https://download.pytorch.org/whl/cpu/torch_stable.html
2
+ -f https://data.pyg.org/whl/torch-1.12.1+cpu.html
3
+ # pip==20.2.4
4
+ torch==1.12.1
5
+ torch-scatter
6
+ torch-spline-conv
7
+ torch-sparse
8
+ torch-geometric
9
+ torchvision==0.13.1
10
+ torchaudio==0.12.1
11
+ gt4sd>=1.1.1
12
+ molgx>=0.22.0a1
13
+ molecule_generation
14
+ nglview
15
+ PyTDC==0.3.7
16
+ gradio>=3.9
17
+ markdown-it-py>=2.1.0
18
+ mols2grid>=0.2.0
19
+ pandas>=1.0.0
20
+ pymatgen>=2023.1.9
21
+ terminator @ git+https://github.com/IBM/regression-transformer@gt4sd
22
+ guacamol_baselines @ git+https://github.com/GT4SD/guacamol_baselines.git@v0.0.2
23
+ moses @ git+https://github.com/GT4SD/moses.git@v0.1.0
24
+ paccmann_chemistry @ git+https://github.com/PaccMann/paccmann_chemistry@0.0.4
25
+ paccmann_generator @ git+https://github.com/PaccMann/paccmann_generator@0.0.2
26
+ paccmann_gp @ git+https://github.com/PaccMann/paccmann_gp@0.1.1
27
+ paccmann_omics @ git+https://github.com/PaccMann/paccmann_omics@0.0.1.1
28
+ paccmann_predictor @ git+https://github.com/PaccMann/paccmann_predictor@sarscov2
29
+ reinvent_models @ git+https://github.com/GT4SD/reinvent_models@v0.0.1
utils.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import List
3
+ import numpy as np
4
+ import mols2grid
5
+ import pandas as pd
6
+ from rdkit import Chem
7
+
8
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


def _fasta_to_smiles(sequence: str) -> str:
    """Convert a FASTA sequence to a SMILES string using RDKit."""
    return Chem.MolToSmiles(Chem.MolFromFASTA(sequence))


def _keep_as_is(sequence: str) -> str:
    """Return the sequence unchanged (molecules are already SMILES)."""
    return sequence


def draw_grid_predict(
    sequences: List[str],
    properties: np.ndarray,
    property_names: List[str],
    domain: str,
) -> str:
    """
    Uses mols2grid to draw a HTML grid for the prediction

    Args:
        sequences: Sequences for which properties are predicted.
        properties: Predicted properties. Array of shape (n_samples, n_properties).
        property_names: List of property names, one per column of `properties`.
        domain: Domain of the prediction ("Molecules" or "Proteins").

    Returns:
        HTML to display

    Raises:
        ValueError: If `domain` is neither "Molecules" nor "Proteins".
    """

    if domain not in ["Molecules", "Proteins"]:
        raise ValueError(f"Unsupported domain {domain}")

    converter = _fasta_to_smiles if domain == "Proteins" else _keep_as_is

    smiles = []
    kept_rows = []  # indices of the sequences that could be converted
    for row, sequence in enumerate(sequences):
        try:
            converted = converter(sequence)
        except Exception:
            # Best effort: skip what cannot be drawn. Log the *input*
            # sequence; the converted value does not exist on failure.
            logger.warning("Could not draw sequence %s", sequence)
            continue
        smiles.append(converted)
        kept_rows.append(row)

    if len(kept_rows) != len(sequences):
        # Drop the property rows of skipped sequences so that every column
        # has the same length as the SMILES column.
        properties = np.asarray(properties)[np.asarray(kept_rows, dtype=int)]

    result = pd.DataFrame({"SMILES": smiles})
    for i, name in enumerate(property_names):
        result[name] = properties[:, i]

    # Up to three molecules are shown large in a single row; more are shown
    # as small tiles in a three-column grid.
    n_cols = min(3, len(result))
    size = (140, 200) if len(result) > 3 else (600, 700)
    obj = mols2grid.display(
        result,
        tooltip=list(result.keys()),
        height=1100,
        n_cols=n_cols,
        name="Results",
        size=size,
    )
    return obj.data