cankoban committed on
Commit
9ef0f40
1 Parent(s): 58eeafe

Upload 4 files


Added filtering and new features.

About.py ADDED
@@ -0,0 +1,84 @@
+ # Copyright 2018-2022 Streamlit Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import streamlit as st
+ from streamlit.logger import get_logger
+
+ LOGGER = get_logger(__name__)
+
+
+ def run():
+     st.set_page_config(
+         page_title="About WarmMolGen",
+         page_icon="🚀",
+         layout='wide'
+     )
+
+     st.write("## [Exploiting Pretrained Biochemical Language Models for Targeted Drug Design](https://arxiv.org/abs/2209.00981)")
+     st.sidebar.success("Select a model demo above.")
+
+     st.markdown(
+         """
+         This application demonstrates the generation capabilities of the models trained as part of the study below,
+         published in *Bioinformatics* by Oxford University Press. The available models are:
+         * WarmMolGen
+             - WarmMolGenOne (i.e. EncDecBase)
+             - WarmMolGenTwo (i.e. EncDecLM)
+         * ChemBERTaLM
+
+         👈 Select a model demo from the sidebar to generate molecules right away 🚀
+
+         ### Abstract
+         **Motivation:** The development of novel compounds targeting proteins of interest is one of the most important tasks in
+         the pharmaceutical industry. Deep generative models have been applied to targeted molecular design and have shown
+         promising results. Recently, target-specific molecule generation has been viewed as a translation between the protein
+         language and the chemical language. However, such a model is limited by the availability of interacting protein–ligand
+         pairs. On the other hand, large amounts of unlabelled protein sequences and chemical compounds are available and
+         have been used to train language models that learn useful representations. In this study, we propose exploiting pretrained
+         biochemical language models to initialize (i.e. warm start) targeted molecule generation models. We investigate
+         two warm start strategies: (i) a one-stage strategy where the initialized model is trained on targeted molecule generation
+         and (ii) a two-stage strategy containing a pre-finetuning on molecular generation followed by target-specific training. We
+         also compare two decoding strategies to generate compounds: beam search and sampling.
+
+         **Results:** The results show that the warm-started models perform better than a baseline model trained from scratch.
+         The two proposed warm-start strategies achieve similar results to each other with respect to widely used metrics
+         from benchmarks. However, docking evaluation of the generated compounds for a number of novel proteins suggests
+         that the one-stage strategy generalizes better than the two-stage strategy. Additionally, we observe that
+         beam search outperforms sampling in both docking evaluation and benchmark metrics for assessing compound
+         quality.
+
+         **Availability and implementation:** The source code is available at https://github.com/boun-tabi/biochemical-lms-for-drug-design and the materials (i.e., data, models, and outputs) are archived in Zenodo at https://doi.org/10.5281/zenodo.6832145.
+
+         ### Citation
+         ```bibtex
+         @article{10.1093/bioinformatics/btac482,
+             author = {Uludoğan, Gökçe and Ozkirimli, Elif and Ulgen, Kutlu O and Karalı, Nilgün and Özgür, Arzucan},
+             title = "{Exploiting pretrained biochemical language models for targeted drug design}",
+             journal = {Bioinformatics},
+             volume = {38},
+             number = {Supplement_2},
+             pages = {ii155-ii161},
+             year = {2022},
+             doi = {10.1093/bioinformatics/btac482},
+             url = {https://doi.org/10.1093/bioinformatics/btac482},
+         }
+         ```
+         """
+     )
+
+
+ if __name__ == "__main__":
+     run()
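
The abstract's comparison of the two decoding strategies maps directly onto Hugging Face `generate()` flags. Below is a minimal sketch of both modes, assuming the Hub checkpoints referenced in the demo pages are available; the target sequence is simply the demo's default input, used here as a placeholder:

```python
from transformers import EncoderDecoderModel, RobertaTokenizer

protein_tokenizer = RobertaTokenizer.from_pretrained("gokceuludogan/WarmMolGenOne")
mol_tokenizer = RobertaTokenizer.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")
model = EncoderDecoderModel.from_pretrained("gokceuludogan/WarmMolGenOne")

inputs = protein_tokenizer(
    "MENTENSVDSKSIKNLEPKIIHGSESMDSGISLDNSYKMDYPEMGLCIIINNKNFHKSTG",
    return_tensors="pt",
)
common = dict(
    decoder_start_token_id=mol_tokenizer.bos_token_id,
    eos_token_id=mol_tokenizer.eos_token_id,
    pad_token_id=mol_tokenizer.eos_token_id,
    max_length=128,
    num_return_sequences=5,
)

# (i) Beam search: deterministic; requires num_beams >= num_return_sequences.
beam_smiles = mol_tokenizer.batch_decode(
    model.generate(**inputs, num_beams=5, **common), skip_special_tokens=True)

# (ii) Sampling: stochastic; each returned sequence is an independent draw.
sampled_smiles = mol_tokenizer.batch_decode(
    model.generate(**inputs, do_sample=True, **common), skip_special_tokens=True)
```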
pages/1_🔥_WarmMolGen.py ADDED
@@ -0,0 +1,193 @@
+ import textwrap
+ from typing import List
+
+ import streamlit as st
+ import streamlit.components.v1 as components
+ import pandas as pd
+ import mols2grid
+ from transformers import EncoderDecoderModel, RobertaTokenizer
+
+ from moses.metrics.utils import QED, SA, logP, NP, weight
+ from moses.utils import mapper, get_mol
+
+ from util import filter_dataframe
+
+
+ @st.cache(suppress_st_warning=True)
+ def load_models():
+     # Cached so the checkpoints are downloaded and loaded only once per session.
+     model1 = EncoderDecoderModel.from_pretrained("gokceuludogan/WarmMolGenOne")
+     model2 = EncoderDecoderModel.from_pretrained("gokceuludogan/WarmMolGenTwo")
+     return model1, model2
+
+
+ def count(smiles_list: List[str]):
+     # Character length of each SMILES string.
+     return [len(smiles) for smiles in smiles_list]
+
+
+ def remove_none_elements(mol_list, smiles_list):
+     # Drop SMILES that could not be parsed into molecules, keeping the
+     # molecule and SMILES lists aligned; also report how many were removed.
+     filtered_mol_list = []
+     filtered_smiles_list = []
+     removed_indices = []
+     for i, element in enumerate(mol_list):
+         if element is not None:
+             filtered_mol_list.append(element)
+         else:
+             removed_indices.append(i)
+
+     for i, smiles in enumerate(smiles_list):
+         if i not in removed_indices:
+             filtered_smiles_list.append(smiles)
+
+     return filtered_mol_list, filtered_smiles_list, len(removed_indices)
+
+
+ def format_list_numbers(lst):
+     # Round each score to three decimals, in place.
+     for i, value in enumerate(lst):
+         lst[i] = float(f"{value:.3f}")
+
+
+ def generate_molecules(model_name, num_mols, max_new_tokens, do_sample, num_beams, target, pool):
+     protein_tokenizer = RobertaTokenizer.from_pretrained("gokceuludogan/WarmMolGenTwo")
+     mol_tokenizer = RobertaTokenizer.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")
+     model1, model2 = load_models()
+     inputs = protein_tokenizer(target, return_tensors="pt")
+
+     model = model1 if model_name == 'WarmMolGenOne' else model2
+     outputs = model.generate(**inputs, decoder_start_token_id=mol_tokenizer.bos_token_id,
+                              eos_token_id=mol_tokenizer.eos_token_id, pad_token_id=mol_tokenizer.eos_token_id,
+                              max_length=int(max_new_tokens), num_return_sequences=int(num_mols),
+                              do_sample=do_sample, num_beams=num_beams)
+     output_smiles = mol_tokenizer.batch_decode(outputs, skip_special_tokens=True)
+     st.write("### Generated Molecules")
+     mol_list = mapper(pool)(get_mol, output_smiles)
+     mol_list, output_smiles, removed_len = remove_none_elements(mol_list, output_smiles)
+     if removed_len != 0:
+         st.write(f"#### Note: {removed_len} invalid generated molecules were discarded.")
+
+     QED_scores = mapper(pool)(QED, mol_list)
+     SA_scores = mapper(pool)(SA, mol_list)
+     logP_scores = mapper(pool)(logP, mol_list)
+     NP_scores = mapper(pool)(NP, mol_list)
+     weight_scores = mapper(pool)(weight, mol_list)
+
+     for scores in (QED_scores, SA_scores, logP_scores, NP_scores, weight_scores):
+         format_list_numbers(scores)
+
+     df_smiles = pd.DataFrame(
+         {'SMILES': output_smiles, "QED": QED_scores, "SA": SA_scores, "logP": logP_scores, "NP": NP_scores,
+          "Weight": weight_scores})
+
+     return df_smiles
+
+
+ def warm_molgen_demo():
+     with st.form("my_form"):
+         with st.sidebar:
+             st.sidebar.subheader("Configurable parameters")
+
+             model_name = st.sidebar.selectbox(
+                 "Model Selector",
+                 options=[
+                     "WarmMolGenOne",
+                     "WarmMolGenTwo",
+                 ],
+                 index=0,
+             )
+
+             num_mols = st.sidebar.number_input(
+                 "Number of generated molecules",
+                 min_value=0,
+                 max_value=20,
+                 value=10,
+                 help="The number of molecules to be generated.",
+             )
+
+             max_new_tokens = st.sidebar.number_input(
+                 "Maximum length",
+                 min_value=0,
+                 max_value=1024,
+                 value=128,
+                 help="The maximum length of the sequence to be generated.",
+             )
+             do_sample = st.sidebar.selectbox(
+                 "Sampling?",
+                 (True, False),
+                 help="Whether or not to use sampling; beam search decoding is used otherwise.",
+             )
+         target = st.text_area(
+             "Target Sequence",
+             "MENTENSVDSKSIKNLEPKIIHGSESMDSGISLDNSYKMDYPEMGLCIIINNKNFHKSTG",
+         )
+         generate_new_molecules = st.form_submit_button("Generate Molecules")
+
+     # Beam search needs an explicit beam count; sampling does not.
+     num_beams = None if do_sample is True else int(num_mols)
+     pool = 1
+
+     if generate_new_molecules:
+         st.session_state.df = generate_molecules(model_name, num_mols, max_new_tokens, do_sample, num_beams,
+                                                  target, pool)
+     if 'df' not in st.session_state:
+         st.session_state.df = generate_molecules(model_name, num_mols, max_new_tokens, do_sample, num_beams,
+                                                  target, pool)
+     df = st.session_state.df
+
+     filtered_df = filter_dataframe(df)
+     if filtered_df.empty:
+         st.markdown(
+             """
+             <span style='color: blue; font-size: 30px;'>No molecules were found with the specified properties.</span>
+             """,
+             unsafe_allow_html=True
+         )
+     else:
+         raw_html = mols2grid.display(filtered_df, height=1000)._repr_html_()
+         components.html(raw_html, width=900, height=450, scrolling=True)
+
+     st.markdown("## How to Generate")
+     generation_code = f"""
+     from transformers import EncoderDecoderModel, RobertaTokenizer
+     protein_tokenizer = RobertaTokenizer.from_pretrained("gokceuludogan/{model_name}")
+     mol_tokenizer = RobertaTokenizer.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")
+     model = EncoderDecoderModel.from_pretrained("gokceuludogan/{model_name}")
+     inputs = protein_tokenizer("{target}", return_tensors="pt")
+     outputs = model.generate(**inputs, decoder_start_token_id=mol_tokenizer.bos_token_id,
+         eos_token_id=mol_tokenizer.eos_token_id, pad_token_id=mol_tokenizer.eos_token_id,
+         max_length={max_new_tokens}, num_return_sequences={num_mols}, do_sample={do_sample}, num_beams={num_beams})
+     mol_tokenizer.batch_decode(outputs, skip_special_tokens=True)
+     """
+     st.code(textwrap.dedent(generation_code))
+
+
+ st.set_page_config(page_title="WarmMolGen Demo", page_icon="🔥", layout='wide')
+ st.markdown("# WarmMolGen Demo")
+ st.sidebar.header("WarmMolGen Demo")
+ st.markdown(
+     """
+     This demo illustrates the generation capabilities of the WarmMolGen models.
+     Given a target sequence and a set of parameters, the models generate molecules targeting the given protein sequence.
+     Please enter an input sequence below 👇 and configure parameters from the sidebar 👈 to generate molecules!
+     See below for saving the output molecules and for the code snippet that generates them!
+     """
+ )
+
+ warm_molgen_demo()
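
The page text mentions saving the output molecules. One lightweight way to expose that from the same filtered dataframe is a standard Streamlit download button; this is a sketch, not part of the committed file, and its placement next to the mols2grid view is an assumption:

```python
# Hypothetical addition (not in this commit): offer the filtered molecules
# as a CSV download alongside the mols2grid view.
csv_bytes = filtered_df.to_csv(index=False).encode("utf-8")
st.download_button(
    label="Download molecules as CSV",
    data=csv_bytes,
    file_name="warmmolgen_molecules.csv",
    mime="text/csv",
)
```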
pages/__pycache__/util.cpython-37.pyc ADDED
Binary file (2.04 kB).
 
pages/util.py ADDED
@@ -0,0 +1,87 @@
+ import pandas as pd
+ import streamlit as st
+ from pandas.api.types import (
+     is_categorical_dtype,
+     is_datetime64_any_dtype,
+     is_numeric_dtype,
+     is_object_dtype,
+ )
+
+
+ def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
+     """
+     Adds a UI on top of a dataframe to let viewers filter columns.
+
+     Args:
+         df (pd.DataFrame): Original dataframe
+
+     Returns:
+         pd.DataFrame: Filtered dataframe
+     """
+     modify = st.checkbox("Add filters")
+
+     if not modify:
+         return df
+
+     df = df.copy()
+
+     # Try to convert datetimes into a standard format (datetime, no timezone)
+     for col in df.columns:
+         if is_object_dtype(df[col]):
+             try:
+                 df[col] = pd.to_datetime(df[col])
+             except Exception:
+                 pass
+
+         if is_datetime64_any_dtype(df[col]):
+             df[col] = df[col].dt.tz_localize(None)
+
+     modification_container = st.container()
+
+     with modification_container:
+         limit_non_unique = 1
+         to_filter_columns = st.multiselect("Filter dataframe on", df.columns)
+         for column in to_filter_columns:
+             if df[column].dtype == 'O':  # cast string (object) columns to categorical
+                 df[column] = df[column].astype(pd.CategoricalDtype())
+             left, right = st.columns((1, 20))
+             # Categorical columns (and those with fewer than `limit_non_unique`
+             # unique values) are filtered with a multiselect
+             if is_categorical_dtype(df[column]) or df[column].nunique() < limit_non_unique:
+                 user_cat_input = right.multiselect(
+                     f"Values for {column}",
+                     df[column].unique(),
+                     default=list(df[column].unique()),
+                 )
+                 df = df[df[column].isin(user_cat_input)]
+             elif is_numeric_dtype(df[column]):
+                 _min = float(df[column].min())
+                 _max = float(df[column].max())
+                 step = (_max - _min) / 100
+                 user_num_input = right.slider(
+                     f"Values for {column}",
+                     min_value=_min,
+                     max_value=_max,
+                     value=(_min, _max),
+                     step=step,
+                 )
+                 df = df[df[column].between(*user_num_input)]
+             elif is_datetime64_any_dtype(df[column]):
+                 user_date_input = right.date_input(
+                     f"Values for {column}",
+                     value=(
+                         df[column].min(),
+                         df[column].max(),
+                     ),
+                 )
+                 if len(user_date_input) == 2:
+                     user_date_input = tuple(map(pd.to_datetime, user_date_input))
+                     start_date, end_date = user_date_input
+                     df = df.loc[df[column].between(start_date, end_date)]
+             else:
+                 user_text_input = right.text_input(
+                     f"Substring or regex in {column}",
+                 )
+                 if user_text_input:
+                     df = df[df[column].astype(str).str.contains(user_text_input)]
+
+     return df
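
A minimal usage sketch of `filter_dataframe`, mirroring how the WarmMolGen page wires it in; the sample values below are made up for illustration only (the real app passes the generated-molecule table):

```python
import pandas as pd
import streamlit as st

from util import filter_dataframe

# Illustrative data only; column names match the demo's property table.
df = pd.DataFrame({
    "SMILES": ["CCO", "c1ccccc1", "CC(=O)O"],
    "QED": [0.407, 0.443, 0.438],
    "Weight": [46.069, 78.114, 60.052],
})

# Renders the "Add filters" checkbox and per-column widgets,
# then shows only the rows that pass the active filters.
st.dataframe(filter_dataframe(df))
```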