Spaces:
Sleeping
Sleeping
matanninio
commited on
Commit
·
b93c8a7
1
Parent(s):
e3cb71b
cleanup and normalization of tasks
Browse files- mammal_demo/demo_framework.py +49 -19
- mammal_demo/ppi_task.py +21 -31
- mammal_demo/ps_task.py +7 -4
- mammal_demo/tcr_task.py +18 -38
mammal_demo/demo_framework.py
CHANGED
@@ -11,6 +11,8 @@ class MammalObjectBroker:
|
|
11 |
model_path: str,
|
12 |
name: str | None = None,
|
13 |
task_list: list[str] | None = None,
|
|
|
|
|
14 |
) -> None:
|
15 |
self.model_path = model_path
|
16 |
if name is None:
|
@@ -22,12 +24,14 @@ class MammalObjectBroker:
|
|
22 |
self.tasks = task_list
|
23 |
self._model: Mammal | None = None
|
24 |
self._tokenizer_op = None
|
|
|
|
|
25 |
|
26 |
@property
|
27 |
def model(self) -> Mammal:
|
28 |
if self._model is None:
|
29 |
self._model = Mammal.from_pretrained(self.model_path)
|
30 |
-
|
31 |
return self._model
|
32 |
|
33 |
@property
|
@@ -36,6 +40,11 @@ class MammalObjectBroker:
|
|
36 |
self._tokenizer_op = ModularTokenizerOp.from_pretrained(self.model_path)
|
37 |
return self._tokenizer_op
|
38 |
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
class MammalTask(ABC):
|
41 |
def __init__(self, name: str, model_dict: dict[str, MammalObjectBroker]) -> None:
|
@@ -44,19 +53,6 @@ class MammalTask(ABC):
|
|
44 |
self._demo = None
|
45 |
self.model_dict = model_dict
|
46 |
|
47 |
-
# @abstractmethod
|
48 |
-
# def _generate_prompt(self, **kwargs) -> str:
|
49 |
-
# """Formatting prompt to match pre-training syntax
|
50 |
-
|
51 |
-
# Args:
|
52 |
-
# prot1 (_type_): _description_
|
53 |
-
# prot2 (_type_): _description_
|
54 |
-
|
55 |
-
# Raises:
|
56 |
-
# No: _description_
|
57 |
-
# """
|
58 |
-
# raise NotImplementedError()
|
59 |
-
|
60 |
@abstractmethod
|
61 |
def crate_sample_dict(
|
62 |
self, sample_inputs: dict, model_holder: MammalObjectBroker
|
@@ -97,10 +93,39 @@ class MammalTask(ABC):
|
|
97 |
def decode_output(self, batch_dict, model: Mammal) -> list:
|
98 |
raise NotImplementedError()
|
99 |
|
100 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
|
102 |
-
|
103 |
-
|
|
|
|
|
|
|
104 |
|
105 |
|
106 |
class TaskRegistry(dict[str, MammalTask]):
|
@@ -114,7 +139,9 @@ class TaskRegistry(dict[str, MammalTask]):
|
|
114 |
class ModelRegistry(dict[str, MammalObjectBroker]):
|
115 |
"""just a dictionary with a register models"""
|
116 |
|
117 |
-
def register_model(
|
|
|
|
|
118 |
"""register a model and return the name of the model
|
119 |
Args:
|
120 |
model_path (_type_): _description_
|
@@ -124,7 +151,10 @@ class ModelRegistry(dict[str, MammalObjectBroker]):
|
|
124 |
str: model name
|
125 |
"""
|
126 |
model_holder = MammalObjectBroker(
|
127 |
-
model_path=model_path,
|
|
|
|
|
|
|
128 |
)
|
129 |
self[model_holder.name] = model_holder
|
130 |
return model_holder.name
|
|
|
11 |
model_path: str,
|
12 |
name: str | None = None,
|
13 |
task_list: list[str] | None = None,
|
14 |
+
*,
|
15 |
+
force_preload=False,
|
16 |
) -> None:
|
17 |
self.model_path = model_path
|
18 |
if name is None:
|
|
|
24 |
self.tasks = task_list
|
25 |
self._model: Mammal | None = None
|
26 |
self._tokenizer_op = None
|
27 |
+
if force_preload:
|
28 |
+
self.force_preload()
|
29 |
|
30 |
@property
|
31 |
def model(self) -> Mammal:
|
32 |
if self._model is None:
|
33 |
self._model = Mammal.from_pretrained(self.model_path)
|
34 |
+
self._model.eval()
|
35 |
return self._model
|
36 |
|
37 |
@property
|
|
|
40 |
self._tokenizer_op = ModularTokenizerOp.from_pretrained(self.model_path)
|
41 |
return self._tokenizer_op
|
42 |
|
43 |
+
def force_preload(self):
|
44 |
+
"""pre-load the model and tokenizer (in this order)"""
|
45 |
+
_ = self.model
|
46 |
+
_ = self.tokenizer_op
|
47 |
+
|
48 |
|
49 |
class MammalTask(ABC):
|
50 |
def __init__(self, name: str, model_dict: dict[str, MammalObjectBroker]) -> None:
|
|
|
53 |
self._demo = None
|
54 |
self.model_dict = model_dict
|
55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
@abstractmethod
|
57 |
def crate_sample_dict(
|
58 |
self, sample_inputs: dict, model_holder: MammalObjectBroker
|
|
|
93 |
def decode_output(self, batch_dict, model: Mammal) -> list:
|
94 |
raise NotImplementedError()
|
95 |
|
96 |
+
# classification helpers
|
97 |
+
@staticmethod
|
98 |
+
def positive_token_id(tokenizer_op: ModularTokenizerOp) -> int:
|
99 |
+
"""token for positive binding
|
100 |
+
|
101 |
+
Args:
|
102 |
+
model (MammalTrainedModel): model holding tokenizer
|
103 |
+
|
104 |
+
Returns:
|
105 |
+
int: id of positive binding token
|
106 |
+
"""
|
107 |
+
return tokenizer_op.get_token_id("<1>")
|
108 |
+
|
109 |
+
@staticmethod
|
110 |
+
def negative_token_id(tokenizer_op: ModularTokenizerOp) -> int:
|
111 |
+
"""token for negative binding
|
112 |
+
|
113 |
+
Args:
|
114 |
+
model (MammalTrainedModel): model holding tokenizer
|
115 |
+
|
116 |
+
Returns:
|
117 |
+
int: id of negative binding token
|
118 |
+
"""
|
119 |
+
return tokenizer_op.get_token_id("<0>")
|
120 |
+
|
121 |
+
@staticmethod
|
122 |
+
def get_label_from_token(tokenizer_op: ModularTokenizerOp, token_id):
|
123 |
|
124 |
+
label_mapping = {
|
125 |
+
MammalTask.negative_token_id(tokenizer_op): "negative",
|
126 |
+
MammalTask.positive_token_id(tokenizer_op): "positive",
|
127 |
+
}
|
128 |
+
return label_mapping.get(token_id, token_id)
|
129 |
|
130 |
|
131 |
class TaskRegistry(dict[str, MammalTask]):
|
|
|
139 |
class ModelRegistry(dict[str, MammalObjectBroker]):
|
140 |
"""just a dictionary with a register models"""
|
141 |
|
142 |
+
def register_model(
|
143 |
+
self, model_path, task_list=None, name=None, *, force_preload=False
|
144 |
+
):
|
145 |
"""register a model and return the name of the model
|
146 |
Args:
|
147 |
model_path (_type_): _description_
|
|
|
151 |
str: model name
|
152 |
"""
|
153 |
model_holder = MammalObjectBroker(
|
154 |
+
model_path=model_path,
|
155 |
+
task_list=task_list,
|
156 |
+
name=name,
|
157 |
+
force_preload=force_preload,
|
158 |
)
|
159 |
self[model_holder.name] = model_holder
|
160 |
return model_holder.name
|
mammal_demo/ppi_task.py
CHANGED
@@ -1,10 +1,12 @@
|
|
1 |
import gradio as gr
|
2 |
import torch
|
|
|
3 |
from mammal.keys import (
|
4 |
CLS_PRED,
|
5 |
ENCODER_INPUTS_ATTENTION_MASK,
|
6 |
ENCODER_INPUTS_STR,
|
7 |
ENCODER_INPUTS_TOKENS,
|
|
|
8 |
)
|
9 |
from mammal.model import Mammal
|
10 |
|
@@ -24,24 +26,12 @@ class PpiTask(MammalTask):
|
|
24 |
|
25 |
Given two protein sequences, estimate if the proteins interact or not."""
|
26 |
|
27 |
-
|
28 |
-
def positive_token_id(model_holder: MammalObjectBroker):
|
29 |
-
"""token for positive binding
|
30 |
-
|
31 |
-
Args:
|
32 |
-
model (MammalTrainedModel): model holding tokenizer
|
33 |
-
|
34 |
-
Returns:
|
35 |
-
int: id of positive binding token
|
36 |
-
"""
|
37 |
-
return model_holder.tokenizer_op.get_token_id("<1>")
|
38 |
-
|
39 |
-
def generate_prompt(self, prot1, prot2):
|
40 |
"""Formatting prompt to match pre-training syntax
|
41 |
|
42 |
Args:
|
43 |
-
|
44 |
-
|
45 |
|
46 |
Returns:
|
47 |
str: prompt
|
@@ -49,9 +39,9 @@ class PpiTask(MammalTask):
|
|
49 |
prompt = (
|
50 |
"<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>"
|
51 |
+ "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
|
52 |
-
+ f"<SEQUENCE_NATURAL_START>{
|
53 |
+ "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
|
54 |
-
+ f"<SEQUENCE_NATURAL_START>{
|
55 |
)
|
56 |
return prompt
|
57 |
|
@@ -74,6 +64,7 @@ class PpiTask(MammalTask):
|
|
74 |
sample_dict[ENCODER_INPUTS_ATTENTION_MASK] = torch.tensor(
|
75 |
sample_dict[ENCODER_INPUTS_ATTENTION_MASK]
|
76 |
)
|
|
|
77 |
return sample_dict
|
78 |
|
79 |
def run_model(self, sample_dict, model: Mammal):
|
@@ -86,27 +77,26 @@ class PpiTask(MammalTask):
|
|
86 |
)
|
87 |
return batch_dict
|
88 |
|
89 |
-
def decode_output(self, batch_dict,
|
90 |
|
91 |
# Get output
|
92 |
-
generated_output =
|
93 |
-
|
94 |
-
)
|
95 |
-
score = batch_dict["model.out.scores"][0][1][
|
96 |
-
self.positive_token_id(model_holder)
|
97 |
-
].item()
|
98 |
|
99 |
-
|
|
|
100 |
|
101 |
-
def create_and_run_prompt(self, model_name,
|
102 |
model_holder = self.model_dict[model_name]
|
103 |
-
sample_inputs = {"
|
104 |
sample_dict = self.crate_sample_dict(
|
105 |
sample_inputs=sample_inputs, model_holder=model_holder
|
106 |
)
|
107 |
prompt = sample_dict[ENCODER_INPUTS_STR]
|
108 |
batch_dict = self.run_model(sample_dict=sample_dict, model=model_holder.model)
|
109 |
-
res = prompt, *self.decode_output(
|
|
|
|
|
110 |
return res
|
111 |
|
112 |
def create_demo(self, model_name_widget: gr.component):
|
@@ -119,14 +109,14 @@ class PpiTask(MammalTask):
|
|
119 |
with gr.Group() as demo:
|
120 |
gr.Markdown(self.markup_text)
|
121 |
with gr.Row():
|
122 |
-
|
123 |
label="Protein 1 sequence",
|
124 |
# info="standard",
|
125 |
interactive=True,
|
126 |
lines=3,
|
127 |
value=self.examples["protein_calmodulin"],
|
128 |
)
|
129 |
-
|
130 |
label="Protein 2 sequence",
|
131 |
# info="standard",
|
132 |
interactive=True,
|
@@ -145,7 +135,7 @@ class PpiTask(MammalTask):
|
|
145 |
score_box = gr.Number(label="PPI score")
|
146 |
run_mammal.click(
|
147 |
fn=self.create_and_run_prompt,
|
148 |
-
inputs=[model_name_widget,
|
149 |
outputs=[prompt_box, decoded, score_box],
|
150 |
)
|
151 |
with gr.Row():
|
|
|
1 |
import gradio as gr
|
2 |
import torch
|
3 |
+
from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp
|
4 |
from mammal.keys import (
|
5 |
CLS_PRED,
|
6 |
ENCODER_INPUTS_ATTENTION_MASK,
|
7 |
ENCODER_INPUTS_STR,
|
8 |
ENCODER_INPUTS_TOKENS,
|
9 |
+
SCORES,
|
10 |
)
|
11 |
from mammal.model import Mammal
|
12 |
|
|
|
26 |
|
27 |
Given two protein sequences, estimate if the proteins interact or not."""
|
28 |
|
29 |
+
def generate_prompt(self, protein_seq_1, protein_seq_2):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
"""Formatting prompt to match pre-training syntax
|
31 |
|
32 |
Args:
|
33 |
+
protein_seq_1 (str): sequance of protein number 1
|
34 |
+
protein_seq_2 (str): sequance of protein number 2
|
35 |
|
36 |
Returns:
|
37 |
str: prompt
|
|
|
39 |
prompt = (
|
40 |
"<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>"
|
41 |
+ "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
|
42 |
+
+ f"<SEQUENCE_NATURAL_START>{protein_seq_1}<SEQUENCE_NATURAL_END>"
|
43 |
+ "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
|
44 |
+
+ f"<SEQUENCE_NATURAL_START>{protein_seq_2}<SEQUENCE_NATURAL_END><EOS>"
|
45 |
)
|
46 |
return prompt
|
47 |
|
|
|
64 |
sample_dict[ENCODER_INPUTS_ATTENTION_MASK] = torch.tensor(
|
65 |
sample_dict[ENCODER_INPUTS_ATTENTION_MASK]
|
66 |
)
|
67 |
+
|
68 |
return sample_dict
|
69 |
|
70 |
def run_model(self, sample_dict, model: Mammal):
|
|
|
77 |
)
|
78 |
return batch_dict
|
79 |
|
80 |
+
def decode_output(self, batch_dict, tokenizer_op: ModularTokenizerOp) -> list:
|
81 |
|
82 |
# Get output
|
83 |
+
generated_output = tokenizer_op._tokenizer.decode(batch_dict[CLS_PRED][0])
|
84 |
+
score = batch_dict[SCORES][0][1][self.positive_token_id(tokenizer_op)].item()
|
|
|
|
|
|
|
|
|
85 |
|
86 |
+
ans = [generated_output, score]
|
87 |
+
return ans
|
88 |
|
89 |
+
def create_and_run_prompt(self, model_name, protein_seq_1, protein_seq_2):
|
90 |
model_holder = self.model_dict[model_name]
|
91 |
+
sample_inputs = {"protein_seq_1": protein_seq_1, "protein_seq_2": protein_seq_2}
|
92 |
sample_dict = self.crate_sample_dict(
|
93 |
sample_inputs=sample_inputs, model_holder=model_holder
|
94 |
)
|
95 |
prompt = sample_dict[ENCODER_INPUTS_STR]
|
96 |
batch_dict = self.run_model(sample_dict=sample_dict, model=model_holder.model)
|
97 |
+
res = prompt, *self.decode_output(
|
98 |
+
batch_dict, tokenizer_op=model_holder.tokenizer_op
|
99 |
+
)
|
100 |
return res
|
101 |
|
102 |
def create_demo(self, model_name_widget: gr.component):
|
|
|
109 |
with gr.Group() as demo:
|
110 |
gr.Markdown(self.markup_text)
|
111 |
with gr.Row():
|
112 |
+
protein_seq_1 = gr.Textbox(
|
113 |
label="Protein 1 sequence",
|
114 |
# info="standard",
|
115 |
interactive=True,
|
116 |
lines=3,
|
117 |
value=self.examples["protein_calmodulin"],
|
118 |
)
|
119 |
+
protein_seq_2 = gr.Textbox(
|
120 |
label="Protein 2 sequence",
|
121 |
# info="standard",
|
122 |
interactive=True,
|
|
|
135 |
score_box = gr.Number(label="PPI score")
|
136 |
run_mammal.click(
|
137 |
fn=self.create_and_run_prompt,
|
138 |
+
inputs=[model_name_widget, protein_seq_1, protein_seq_2],
|
139 |
outputs=[prompt_box, decoded, score_box],
|
140 |
)
|
141 |
with gr.Row():
|
mammal_demo/ps_task.py
CHANGED
@@ -10,6 +10,9 @@ from mammal.model import Mammal
|
|
10 |
|
11 |
from mammal_demo.demo_framework import MammalObjectBroker, MammalTask
|
12 |
|
|
|
|
|
|
|
13 |
|
14 |
class PsTask(MammalTask):
|
15 |
def __init__(self, model_dict):
|
@@ -34,7 +37,7 @@ Given the protein sequence, estimate if it's water-soluble.
|
|
34 |
dict: sample_dict for feeding into model
|
35 |
"""
|
36 |
sample_dict = dict(sample_inputs) # shallow copy
|
37 |
-
sample_dict =
|
38 |
sample_dict=sample_dict,
|
39 |
protein_sequence_key="protein_seq",
|
40 |
tokenizer_op=model_holder.tokenizer_op,
|
@@ -57,7 +60,7 @@ Given the protein sequence, estimate if it's water-soluble.
|
|
57 |
"""
|
58 |
Extract predicted class and scores
|
59 |
"""
|
60 |
-
ans_dict =
|
61 |
tokenizer_op=tokenizer_op,
|
62 |
decoder_output=batch_dict[CLS_PRED][0],
|
63 |
decoder_output_scores=batch_dict[SCORES][0],
|
@@ -72,11 +75,11 @@ Given the protein sequence, estimate if it's water-soluble.
|
|
72 |
|
73 |
def create_and_run_prompt(self, model_name, protein_seq):
|
74 |
model_holder = self.model_dict[model_name]
|
75 |
-
|
76 |
"protein_seq": protein_seq,
|
77 |
}
|
78 |
sample_dict = self.crate_sample_dict(
|
79 |
-
sample_inputs=
|
80 |
)
|
81 |
prompt = sample_dict[ENCODER_INPUTS_STR]
|
82 |
batch_dict = self.run_model(sample_dict=sample_dict, model=model_holder.model)
|
|
|
10 |
|
11 |
from mammal_demo.demo_framework import MammalObjectBroker, MammalTask
|
12 |
|
13 |
+
data_preprocessing = ProteinSolubilityTask.data_preprocessing
|
14 |
+
process_model_output = ProteinSolubilityTask.process_model_output
|
15 |
+
|
16 |
|
17 |
class PsTask(MammalTask):
|
18 |
def __init__(self, model_dict):
|
|
|
37 |
dict: sample_dict for feeding into model
|
38 |
"""
|
39 |
sample_dict = dict(sample_inputs) # shallow copy
|
40 |
+
sample_dict = data_preprocessing(
|
41 |
sample_dict=sample_dict,
|
42 |
protein_sequence_key="protein_seq",
|
43 |
tokenizer_op=model_holder.tokenizer_op,
|
|
|
60 |
"""
|
61 |
Extract predicted class and scores
|
62 |
"""
|
63 |
+
ans_dict = process_model_output(
|
64 |
tokenizer_op=tokenizer_op,
|
65 |
decoder_output=batch_dict[CLS_PRED][0],
|
66 |
decoder_output_scores=batch_dict[SCORES][0],
|
|
|
75 |
|
76 |
def create_and_run_prompt(self, model_name, protein_seq):
|
77 |
model_holder = self.model_dict[model_name]
|
78 |
+
sample_inputs = {
|
79 |
"protein_seq": protein_seq,
|
80 |
}
|
81 |
sample_dict = self.crate_sample_dict(
|
82 |
+
sample_inputs=sample_inputs, model_holder=model_holder
|
83 |
)
|
84 |
prompt = sample_dict[ENCODER_INPUTS_STR]
|
85 |
batch_dict = self.run_model(sample_dict=sample_dict, model=model_holder.model)
|
mammal_demo/tcr_task.py
CHANGED
@@ -29,7 +29,7 @@ class TcrTask(MammalTask):
|
|
29 |
Given a TCR beta chain and epitope amino acid sequences, estimate the binding affinity score.
|
30 |
"""
|
31 |
|
32 |
-
def
|
33 |
prompt = (
|
34 |
"<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>"
|
35 |
+ f"<@TOKENIZER-TYPE=AA><MOLECULAR_ENTITY><MOLECULAR_ENTITY_TCR_BETA_VDJ><SEQUENCE_NATURAL_START>{tcr_beta_seq}<SEQUENCE_NATURAL_END>"
|
@@ -48,20 +48,21 @@ Given a TCR beta chain and epitope amino acid sequences, estimate the binding af
|
|
48 |
dict: sample_dict for feeding into model
|
49 |
"""
|
50 |
sample_dict = dict()
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
|
|
55 |
sample_dict=sample_dict,
|
56 |
key_in=ENCODER_INPUTS_STR,
|
57 |
key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
|
58 |
key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
|
59 |
)
|
60 |
sample_dict[ENCODER_INPUTS_TOKENS] = torch.tensor(
|
61 |
-
sample_dict[ENCODER_INPUTS_TOKENS]
|
62 |
)
|
63 |
sample_dict[ENCODER_INPUTS_ATTENTION_MASK] = torch.tensor(
|
64 |
-
sample_dict[ENCODER_INPUTS_ATTENTION_MASK]
|
65 |
)
|
66 |
|
67 |
return sample_dict
|
@@ -76,47 +77,26 @@ Given a TCR beta chain and epitope amino acid sequences, estimate the binding af
|
|
76 |
)
|
77 |
return batch_dict
|
78 |
|
79 |
-
@staticmethod
|
80 |
-
def positive_token_id(tokenizer_op: ModularTokenizerOp):
|
81 |
-
"""token for positive binding
|
82 |
-
|
83 |
-
Args:
|
84 |
-
model (MammalTrainedModel): model holding tokenizer
|
85 |
-
|
86 |
-
Returns:
|
87 |
-
int: id of positive binding token
|
88 |
-
"""
|
89 |
-
return tokenizer_op.get_token_id("<1>")
|
90 |
-
|
91 |
-
@staticmethod
|
92 |
-
def negative_token_id(tokenizer_op: ModularTokenizerOp):
|
93 |
-
"""token for negative binding
|
94 |
-
|
95 |
-
Args:
|
96 |
-
model (MammalTrainedModel): model holding tokenizer
|
97 |
-
|
98 |
-
Returns:
|
99 |
-
int: id of negative binding token
|
100 |
-
"""
|
101 |
-
return tokenizer_op.get_token_id("<0>")
|
102 |
-
|
103 |
def decode_output(self, batch_dict, tokenizer_op: ModularTokenizerOp) -> list:
|
104 |
"""
|
105 |
Extract predicted class and scores
|
106 |
"""
|
107 |
|
108 |
-
|
109 |
-
|
110 |
|
111 |
-
negative_token_id = tokenizer_op.get_token_id("<0>")
|
112 |
-
positive_token_id = tokenizer_op.get_token_id("<1>")
|
113 |
|
114 |
label_id_to_int = {
|
115 |
-
negative_token_id:
|
116 |
-
positive_token_id:
|
117 |
}
|
118 |
classification_position = 1
|
119 |
|
|
|
|
|
|
|
120 |
decoder_output = batch_dict[CLS_PRED][0]
|
121 |
decoder_output_scores = batch_dict[SCORES][0]
|
122 |
|
@@ -126,7 +106,7 @@ Given a TCR beta chain and epitope amino acid sequences, estimate the binding af
|
|
126 |
scores = [None]
|
127 |
|
128 |
ans = [
|
129 |
-
|
130 |
label_id_to_int.get(int(decoder_output[classification_position]), -1),
|
131 |
scores.item(),
|
132 |
]
|
|
|
29 |
Given a TCR beta chain and epitope amino acid sequences, estimate the binding affinity score.
|
30 |
"""
|
31 |
|
32 |
+
def generate_prompt(self, tcr_beta_seq, epitope_seq):
|
33 |
prompt = (
|
34 |
"<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>"
|
35 |
+ f"<@TOKENIZER-TYPE=AA><MOLECULAR_ENTITY><MOLECULAR_ENTITY_TCR_BETA_VDJ><SEQUENCE_NATURAL_START>{tcr_beta_seq}<SEQUENCE_NATURAL_END>"
|
|
|
48 |
dict: sample_dict for feeding into model
|
49 |
"""
|
50 |
sample_dict = dict()
|
51 |
+
prompt = self.generate_prompt(**sample_inputs)
|
52 |
+
sample_dict[ENCODER_INPUTS_STR] = prompt
|
53 |
+
|
54 |
+
# Tokenize
|
55 |
+
sample_dict = model_holder.tokenizer_op(
|
56 |
sample_dict=sample_dict,
|
57 |
key_in=ENCODER_INPUTS_STR,
|
58 |
key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
|
59 |
key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
|
60 |
)
|
61 |
sample_dict[ENCODER_INPUTS_TOKENS] = torch.tensor(
|
62 |
+
sample_dict[ENCODER_INPUTS_TOKENS]
|
63 |
)
|
64 |
sample_dict[ENCODER_INPUTS_ATTENTION_MASK] = torch.tensor(
|
65 |
+
sample_dict[ENCODER_INPUTS_ATTENTION_MASK]
|
66 |
)
|
67 |
|
68 |
return sample_dict
|
|
|
77 |
)
|
78 |
return batch_dict
|
79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
def decode_output(self, batch_dict, tokenizer_op: ModularTokenizerOp) -> list:
|
81 |
"""
|
82 |
Extract predicted class and scores
|
83 |
"""
|
84 |
|
85 |
+
positive_token_id = self.positive_token_id(tokenizer_op)
|
86 |
+
negative_token_id = self.negative_token_id(tokenizer_op)
|
87 |
|
88 |
+
# negative_token_id = tokenizer_op.get_token_id("<0>")
|
89 |
+
# positive_token_id = tokenizer_op.get_token_id("<1>")
|
90 |
|
91 |
label_id_to_int = {
|
92 |
+
negative_token_id: "negative",
|
93 |
+
positive_token_id: "positive",
|
94 |
}
|
95 |
classification_position = 1
|
96 |
|
97 |
+
# Get output
|
98 |
+
generated_output = tokenizer_op._tokenizer.decode(batch_dict[CLS_PRED][0])
|
99 |
+
|
100 |
decoder_output = batch_dict[CLS_PRED][0]
|
101 |
decoder_output_scores = batch_dict[SCORES][0]
|
102 |
|
|
|
106 |
scores = [None]
|
107 |
|
108 |
ans = [
|
109 |
+
generated_output,
|
110 |
label_id_to_int.get(int(decoder_output[classification_position]), -1),
|
111 |
scores.item(),
|
112 |
]
|