matanninio
committed on
Commit
·
71382c0
1
Parent(s):
81fb8a8
first attempt at unified test - the actual use case needs to be clearer
Browse files- .pre-commit-config.yaml +49 -0
- README.md +2 -2
- app.py +173 -41
- requirements.txt +1 -0
.pre-commit-config.yaml
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
exclude: .*\.pdb$
|
2 |
+
|
3 |
+
repos:
|
4 |
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
5 |
+
rev: v4.6.0
|
6 |
+
hooks:
|
7 |
+
- id: check-case-conflict
|
8 |
+
- id: end-of-file-fixer
|
9 |
+
- id: mixed-line-ending
|
10 |
+
- id: trailing-whitespace
|
11 |
+
- repo: https://github.com/psf/black
|
12 |
+
rev: 24.8.0
|
13 |
+
hooks:
|
14 |
+
- id: black
|
15 |
+
- repo: https://github.com/PyCQA/flake8
|
16 |
+
rev: 5.0.4
|
17 |
+
hooks:
|
18 |
+
- id: flake8
|
19 |
+
args:
|
20 |
+
- "--ignore=E203,E266,E501,F405,F403,W503"
|
21 |
+
- "--statistics"
|
22 |
+
|
23 |
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
24 |
+
# Ruff version.
|
25 |
+
rev: v0.6.5
|
26 |
+
hooks:
|
27 |
+
- id: ruff
|
28 |
+
args:
|
29 |
+
- "--fix"
|
30 |
+
- "--select"
|
31 |
+
- "UP,PT,I,E"  #,F,W,C90,I,N,F405,E402" # Specify the rules to select
|
32 |
+
- "--line-length"
|
33 |
+
- "88"
|
34 |
+
- "--exit-non-zero-on-fix"
|
35 |
+
- "--ignore"
|
36 |
+
- "F405,F403,E501,E402,PT018,PT015,E722,E741"
|
37 |
+
types_or: [ python, pyi] #, jupyter ]
|
38 |
+
- repo: https://github.com/pre-commit/mirrors-mypy
|
39 |
+
rev: v1.13.0
|
40 |
+
hooks:
|
41 |
+
- id: mypy
|
42 |
+
|
43 |
+
- repo: https://github.com/srstevenson/nb-clean
|
44 |
+
rev: "2.4.0"
|
45 |
+
hooks:
|
46 |
+
- id: nb-clean
|
47 |
+
args:
|
48 |
+
- --remove-empty-cells
|
49 |
+
- --preserve-cell-outputs
|
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title: Biomed-multi-alignment
|
3 |
emoji: 🐁
|
4 |
colorFrom: gray
|
5 |
colorTo: purple
|
@@ -8,7 +8,7 @@ sdk_version: 5.4.0
|
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
11 |
-
short_description: Demo for MAMMAL approch Protein-Protein Interaction
|
12 |
---
|
13 |
|
14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: Biomed-multi-alignment (PPI and DTI)
|
3 |
emoji: 🐁
|
4 |
colorFrom: gray
|
5 |
colorTo: purple
|
|
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
11 |
+
short_description: Demo for MAMMAL approach Protein-Protein Interaction and Drug-Target Binding Affinity
|
12 |
---
|
13 |
|
14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
@@ -1,112 +1,244 @@
|
|
1 |
import gradio as gr
|
2 |
-
|
3 |
import torch
|
4 |
from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp
|
5 |
-
from mammal.
|
6 |
from mammal.keys import *
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
|
|
|
|
|
|
8 |
|
9 |
|
10 |
-
|
11 |
-
# Load Model
|
12 |
-
model = Mammal.from_pretrained(model_path)
|
13 |
-
model.eval()
|
14 |
|
15 |
-
|
16 |
-
tokenizer_op =
|
17 |
|
18 |
-
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
# Default input proteins
|
22 |
protein_calmodulin = "MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMISELDQDGFIDKEDLHDGDGKISFEEFLNLVNKEMTADVDGDGQVNYEEFVTMMTSK"
|
23 |
protein_calcineurin = "MSSKLLLAGLDIERVLAEKNFYKEWDTWIIEAMNVGDEEVDRIKEFKEDEIFEEAKTLGTAEMQEYKKQKLEEAIEGAFDIFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIRQMWDQNGDWDRIKELKFGEIKKLSAKDTRGTIFIKVFENLGTGVDSEYEDVSKYMLKHQ"
|
24 |
|
25 |
|
26 |
-
def
|
27 |
# Formatting prompt to match pre-training syntax
|
28 |
return f"<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0><MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN><SEQUENCE_NATURAL_START>{prot1}<SEQUENCE_NATURAL_END><MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN><SEQUENCE_NATURAL_START>{prot2}<SEQUENCE_NATURAL_END><EOS>"
|
29 |
|
|
|
30 |
def run_prompt(prompt):
|
31 |
# Create and load sample
|
32 |
sample_dict = dict()
|
33 |
sample_dict[ENCODER_INPUTS_STR] = prompt
|
34 |
|
35 |
# Tokenize
|
36 |
-
sample_dict=tokenizer_op(
|
37 |
sample_dict=sample_dict,
|
38 |
key_in=ENCODER_INPUTS_STR,
|
39 |
key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
|
40 |
key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
|
41 |
)
|
42 |
-
sample_dict[ENCODER_INPUTS_TOKENS] = torch.tensor(
|
43 |
-
|
44 |
-
|
|
|
|
|
|
|
45 |
|
46 |
# Generate Prediction
|
47 |
-
batch_dict =
|
48 |
[sample_dict],
|
49 |
output_scores=True,
|
50 |
return_dict_in_generate=True,
|
51 |
max_new_tokens=5,
|
52 |
-
)
|
53 |
-
|
54 |
|
55 |
# Get output
|
56 |
-
generated_output = tokenizer_op._tokenizer.decode(batch_dict[CLS_PRED][0])
|
57 |
-
score = batch_dict[
|
58 |
-
|
59 |
-
return generated_output,score
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
|
|
64 |
return res
|
65 |
|
66 |
-
|
|
|
67 |
markup_text = f"""
|
68 |
# Mammal based Protein-Protein Interaction (PPI) demonstration
|
69 |
|
70 |
Given two protein sequences, estimate if the proteins interact or not.
|
71 |
|
72 |
-
### Using the model from
|
73 |
|
74 |
-
```{
|
75 |
"""
|
76 |
-
|
77 |
-
with gr.Blocks() as demo:
|
78 |
gr.Markdown(markup_text)
|
79 |
with gr.Row():
|
80 |
prot1 = gr.Textbox(
|
81 |
label="Protein 1 sequence",
|
82 |
# info="standard",
|
83 |
interactive=True,
|
84 |
-
lines=
|
85 |
value=protein_calmodulin,
|
86 |
)
|
87 |
prot2 = gr.Textbox(
|
88 |
label="Protein 2 sequence",
|
89 |
# info="standard",
|
90 |
interactive=True,
|
91 |
-
lines=
|
92 |
value=protein_calcineurin,
|
93 |
)
|
94 |
with gr.Row():
|
95 |
-
run_mammal = gr.Button(
|
|
|
|
|
96 |
with gr.Row():
|
97 |
-
prompt_box = gr.Textbox(label="Mammal prompt",lines=5)
|
98 |
-
|
99 |
with gr.Row():
|
100 |
decoded = gr.Textbox(label="Mammal output")
|
101 |
run_mammal.click(
|
102 |
fn=create_and_run_prompt,
|
103 |
-
inputs=[prot1,prot2],
|
104 |
-
outputs=[prompt_box,decoded,gr.Number(label=
|
105 |
)
|
106 |
with gr.Row():
|
107 |
-
gr.Markdown(
|
108 |
-
|
109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
|
111 |
def main():
|
112 |
demo = create_application()
|
|
|
1 |
import gradio as gr
|
|
|
2 |
import torch
|
3 |
from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp
|
4 |
+
from mammal.examples.dti_bindingdb_kd.task import DtiBindingdbKdTask
|
5 |
from mammal.keys import *
|
6 |
+
from mammal.model import Mammal
|
7 |
+
|
8 |
+
model_paths = dict()
|
9 |
+
|
10 |
+
# Protein protein interaction:
|
11 |
+
ppi = "Protein-Protein Interaction (PPI)"
|
12 |
+
model_paths[ppi] = "ibm/biomed.omics.bl.sm.ma-ted-458m"
|
13 |
|
14 |
+
#
|
15 |
+
dti = "Drug-Target Binding Affinity"
|
16 |
+
model_paths[dti] = "ibm/biomed.omics.bl.sm.ma-ted-458m.dti_bindingdb_pkd"
|
17 |
|
18 |
|
19 |
+
# load models (should probably be lazy)
|
|
|
|
|
|
|
20 |
|
21 |
+
models = dict()
|
22 |
+
tokenizer_op = dict()
|
23 |
|
24 |
+
|
25 |
+
for task, model_path in model_paths.items():
|
26 |
+
if task not in models:
|
27 |
+
models[task] = Mammal.from_pretrained(model_path)
|
28 |
+
models[task].eval()
|
29 |
+
# Load Tokenizer
|
30 |
+
tokenizer_op[task] = ModularTokenizerOp.from_pretrained(model_path)
|
31 |
+
|
32 |
+
|
33 |
+
### PPI:
|
34 |
+
# token for positive binding
|
35 |
+
positive_token_id = tokenizer_op[ppi].get_token_id("<1>")
|
36 |
|
37 |
# Default input proteins
|
38 |
protein_calmodulin = "MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMISELDQDGFIDKEDLHDGDGKISFEEFLNLVNKEMTADVDGDGQVNYEEFVTMMTSK"
|
39 |
protein_calcineurin = "MSSKLLLAGLDIERVLAEKNFYKEWDTWIIEAMNVGDEEVDRIKEFKEDEIFEEAKTLGTAEMQEYKKQKLEEAIEGAFDIFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIRQMWDQNGDWDRIKELKFGEIKKLSAKDTRGTIFIKVFENLGTGVDSEYEDVSKYMLKHQ"
|
40 |
|
41 |
|
42 |
+
def format_prompt_ppi(prot1, prot2):
    """Build the PPI prompt string for two protein sequences.

    Args:
        prot1: First protein sequence.
        prot2: Second protein sequence.

    Returns:
        The prompt string, formatted to match the pre-training syntax.
    """
    # Both proteins are wrapped in the same entity markers.
    entity_open = "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN><SEQUENCE_NATURAL_START>"
    entity_close = "<SEQUENCE_NATURAL_END>"
    first_entity = f"{entity_open}{prot1}{entity_close}"
    second_entity = f"{entity_open}{prot2}{entity_close}"
    return f"<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>{first_entity}{second_entity}<EOS>"
|
45 |
|
46 |
+
|
47 |
def run_prompt(prompt):
    """Run a PPI prompt through the PPI model and return its output and score.

    Args:
        prompt: Prompt string in the pre-training syntax (for example the
            value produced by ``format_prompt_ppi``).

    Returns:
        A ``(generated_output, score)`` tuple: the decoded prediction string
        and the score entry for the ``<1>`` token as a Python float.
    """
    ppi_tokenizer = tokenizer_op[ppi]

    # Create and load sample
    sample_dict = dict()
    sample_dict[ENCODER_INPUTS_STR] = prompt

    # Tokenize
    sample_dict = ppi_tokenizer(
        sample_dict=sample_dict,
        key_in=ENCODER_INPUTS_STR,
        key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
        key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
    )
    sample_dict[ENCODER_INPUTS_TOKENS] = torch.tensor(
        sample_dict[ENCODER_INPUTS_TOKENS]
    )
    sample_dict[ENCODER_INPUTS_ATTENTION_MASK] = torch.tensor(
        sample_dict[ENCODER_INPUTS_ATTENTION_MASK]
    )

    # Generate Prediction
    batch_dict = models[ppi].generate(
        [sample_dict],
        output_scores=True,
        return_dict_in_generate=True,
        max_new_tokens=5,
    )

    # Get output
    generated_output = ppi_tokenizer._tokenizer.decode(batch_dict[CLS_PRED][0])

    # Look the positive-class token id up on the PPI tokenizer here rather than
    # reading the module-level ``positive_token_id``: that global is reassigned
    # further down the module from the DTI tokenizer, so by the time this
    # function is called it no longer refers to the PPI tokenizer's id.
    ppi_positive_token_id = ppi_tokenizer.get_token_id("<1>")
    score = batch_dict["model.out.scores"][0][1][ppi_positive_token_id].item()

    return generated_output, score
|
79 |
+
|
80 |
+
|
81 |
+
def create_and_run_prompt(protein1, protein2):
    """Format the PPI prompt for two proteins, run it, and return the results.

    Args:
        protein1: First protein sequence.
        protein2: Second protein sequence.

    Returns:
        A tuple of the prompt followed by the values returned by
        ``run_prompt`` for that prompt.
    """
    ppi_prompt = format_prompt_ppi(protein1, protein2)
    model_results = run_prompt(prompt=ppi_prompt)
    return (ppi_prompt, *model_results)
|
85 |
|
86 |
+
|
87 |
+
def create_ppi_demo():
|
88 |
markup_text = f"""
|
89 |
# Mammal based Protein-Protein Interaction (PPI) demonstration
|
90 |
|
91 |
Given two protein sequences, estimate if the proteins interact or not.
|
92 |
|
93 |
+
### Using the model from
|
94 |
|
95 |
+
```{model_paths[ppi]} ```
|
96 |
"""
|
97 |
+
with gr.Group() as ppi_demo:
|
|
|
98 |
gr.Markdown(markup_text)
|
99 |
with gr.Row():
|
100 |
prot1 = gr.Textbox(
|
101 |
label="Protein 1 sequence",
|
102 |
# info="standard",
|
103 |
interactive=True,
|
104 |
+
lines=3,
|
105 |
value=protein_calmodulin,
|
106 |
)
|
107 |
prot2 = gr.Textbox(
|
108 |
label="Protein 2 sequence",
|
109 |
# info="standard",
|
110 |
interactive=True,
|
111 |
+
lines=3,
|
112 |
value=protein_calcineurin,
|
113 |
)
|
114 |
with gr.Row():
|
115 |
+
run_mammal = gr.Button(
|
116 |
+
"Run Mammal prompt for Protein-Protein Interaction", variant="primary"
|
117 |
+
)
|
118 |
with gr.Row():
|
119 |
+
prompt_box = gr.Textbox(label="Mammal prompt", lines=5)
|
120 |
+
|
121 |
with gr.Row():
|
122 |
decoded = gr.Textbox(label="Mammal output")
|
123 |
run_mammal.click(
|
124 |
fn=create_and_run_prompt,
|
125 |
+
inputs=[prot1, prot2],
|
126 |
+
outputs=[prompt_box, decoded, gr.Number(label="PPI score")],
|
127 |
)
|
128 |
with gr.Row():
|
129 |
+
gr.Markdown(
|
130 |
+
"```<SENTINEL_ID_0>``` contains the binding affinity class, which is ```<1>``` for interacting and ```<0>``` for non-interacting"
|
131 |
+
)
|
132 |
+
ppi_demo.visible = False
|
133 |
+
return ppi_demo
|
134 |
+
|
135 |
+
|
136 |
+
### DTI:
|
137 |
+
# input
|
138 |
+
target_seq = "NLMKRCTRGFRKLGKCTTLEEEKCKTLYPRGQCTCSDSKMNTHSCDCKSC"
|
139 |
+
drug_seq = "CC(=O)NCCC1=CNc2c1cc(OC)cc2"
|
140 |
+
|
141 |
+
|
142 |
+
# token for positive binding
|
143 |
+
positive_token_id = tokenizer_op[dti].get_token_id("<1>")
|
144 |
+
|
145 |
+
|
146 |
+
def format_prompt_dti(prot, drug):
    """Build the preprocessed DTI sample for a target protein and a drug.

    Args:
        prot: Target protein sequence.
        drug: Drug sequence (SMILES).

    Returns:
        The sample dict returned by ``DtiBindingdbKdTask.data_preprocessing``.
    """
    # Use the caller-supplied values. The module-level ``target_seq`` and
    # ``drug_seq`` are only the default inputs; reading them here would make
    # the function ignore whatever the caller passed in.
    sample_dict = {"target_seq": prot, "drug_seq": drug}
    sample_dict = DtiBindingdbKdTask.data_preprocessing(
        sample_dict=sample_dict,
        tokenizer_op=tokenizer_op[dti],
        target_sequence_key="target_seq",
        drug_sequence_key="drug_seq",
        norm_y_mean=None,
        norm_y_std=None,
        device=models[dti].device,
    )
    return sample_dict
|
158 |
+
|
159 |
+
|
160 |
+
def create_and_run_prompt_dtb(prot, drug):
    """Run the DTI model on a protein/drug pair and return prompt and result.

    Args:
        prot: Target protein sequence.
        drug: Drug sequence (SMILES).

    Returns:
        A 3-tuple: the sample's ``"data.query.encoder_input"`` entry, the
        prediction key name, and the first processed prediction as a float.
    """
    prediction_key = "model.out.dti_bindingdb_kd"

    sample = format_prompt_dti(prot, drug)

    # Encoder-only forward pass, then post-process the model's output
    model_output = models[dti].forward_encoder_only([sample])
    model_output = DtiBindingdbKdTask.process_model_output(
        model_output,
        scalars_preds_processed_key=prediction_key,
        norm_y_mean=5.79384684128215,
        norm_y_std=1.33808027428196,
    )

    predicted_value = float(model_output[prediction_key][0])
    return sample["data.query.encoder_input"], prediction_key, predicted_value
|
177 |
+
|
178 |
+
|
179 |
+
def create_tdb_demo():
|
180 |
+
markup_text = f"""
|
181 |
+
# Mammal based Target-Drug binding affinity demonstration
|
182 |
+
|
183 |
+
Given a protein sequence and a drug (in SMILES), estimate the binding affinity.
|
184 |
+
|
185 |
+
### Using the model from
|
186 |
+
|
187 |
+
```{model_paths[dti]} ```
|
188 |
+
"""
|
189 |
+
with gr.Group() as tdb_demo:
|
190 |
+
gr.Markdown(markup_text)
|
191 |
+
with gr.Row():
|
192 |
+
prot = gr.Textbox(
|
193 |
+
label="Protein sequence",
|
194 |
+
# info="standard",
|
195 |
+
interactive=True,
|
196 |
+
lines=3,
|
197 |
+
value=target_seq,
|
198 |
+
)
|
199 |
+
drug = gr.Textbox(
|
200 |
+
label="drug sequence (SMILES)",
|
201 |
+
# info="standard",
|
202 |
+
interactive=True,
|
203 |
+
lines=3,
|
204 |
+
value=drug_seq,
|
205 |
+
)
|
206 |
+
with gr.Row():
|
207 |
+
run_mammal = gr.Button(
|
208 |
+
"Run Mammal prompt for Target Drug Affinity", variant="primary"
|
209 |
+
)
|
210 |
+
with gr.Row():
|
211 |
+
prompt_box = gr.Textbox(label="Mammal prompt", lines=5)
|
212 |
+
|
213 |
+
with gr.Row():
|
214 |
+
decoded = gr.Textbox(label="Mammal output")
|
215 |
+
run_mammal.click(
|
216 |
+
fn=create_and_run_prompt_dtb,
|
217 |
+
inputs=[prot, drug],
|
218 |
+
outputs=[prompt_box, decoded, gr.Number(label="DTI score")],
|
219 |
+
)
|
220 |
+
tdb_demo.visible = False
|
221 |
+
return tdb_demo
|
222 |
+
|
223 |
+
|
224 |
+
def create_application():
|
225 |
+
|
226 |
+
with gr.Blocks() as demo:
|
227 |
+
main_dropdown = gr.Dropdown(choices=["select demo", ppi, dti])
|
228 |
+
main_dropdown.interactive = True
|
229 |
+
ppi_demo = create_ppi_demo()
|
230 |
+
dtb_demo = create_tdb_demo()
|
231 |
+
|
232 |
+
def set_ppi_vis(main_text):
|
233 |
+
return gr.Group(visible=main_text == ppi), gr.Group(
|
234 |
+
visible=main_text == dti
|
235 |
+
)
|
236 |
+
|
237 |
+
main_dropdown.change(
|
238 |
+
set_ppi_vis, inputs=main_dropdown, outputs=[ppi_demo, dtb_demo]
|
239 |
+
)
|
240 |
+
return demo
|
241 |
+
|
242 |
|
243 |
def main():
|
244 |
demo = create_application()
|
requirements.txt
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
# for the mammal demo app
|
2 |
mammal @ git+https://github.com/BiomedSciAI/biomed-multi-alignment.git
|
|
|
|
1 |
# for the mammal demo app
|
2 |
mammal @ git+https://github.com/BiomedSciAI/biomed-multi-alignment.git
|
3 |
+
pytdc
|