# Spaces:
# Sleeping
# Sleeping
import gradio as gr | |
import torch | |
from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp | |
from mammal.keys import ( | |
CLS_PRED, | |
ENCODER_INPUTS_ATTENTION_MASK, | |
ENCODER_INPUTS_STR, | |
ENCODER_INPUTS_TOKENS, | |
SCORES, | |
) | |
from mammal.model import Mammal | |
from mammal_demo.demo_framework import MammalObjectBroker, MammalTask | |
class PpiTask(MammalTask):
    """Gradio demo task: ask a Mammal model whether two proteins interact.

    The task builds a text prompt from two amino-acid sequences, tokenizes it,
    runs ``model.generate`` and reports the decoded output together with the
    score read for the positive-class token.
    """

    def __init__(self, model_dict):
        """Set up the task name, description, example inputs and intro text.

        Args:
            model_dict: mapping from a model name to the object holding that
                model and its tokenizer; it is forwarded to the base class and
                later indexed by model name in ``create_and_run_prompt``.
        """
        super().__init__(name="Protein-Protein Interaction", model_dict=model_dict)
        self.description = "Protein-Protein Interaction (PPI)"
        # Default values pre-filled into the two sequence text boxes.
        self.examples = {
            "protein_calmodulin": "MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMISELDQDGFIDKEDLHDGDGKISFEEFLNLVNKEMTADVDGDGQVNYEEFVTMMTSK",
            "protein_calcineurin": "MSSKLLLAGLDIERVLAEKNFYKEWDTWIIEAMNVGDEEVDRIKEFKEDEIFEEAKTLGTAEMQEYKKQKLEEAIEGAFDIFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIRQMWDQNGDWDRIKELKFGEIKKLSAKDTRGTIFIKVFENLGTGVDSEYEDVSKYMLKHQ",
        }
        # Markdown shown at the top of the demo panel.
        self.markup_text = (
            f"\n# Mammal based {self.description} demonstration\n"
            "Given two protein sequences, estimate if the proteins interact or not."
        )

    def generate_prompt(self, protein_seq_1, protein_seq_2) -> str:
        """Format the prompt to match the pre-training syntax.

        Args:
            protein_seq_1 (str): sequence of protein number 1
            protein_seq_2 (str): sequence of protein number 2

        Returns:
            str: prompt
        """
        # Each protein is wrapped in the same entity/sequence tags; the class
        # to be predicted is represented by <SENTINEL_ID_0>.
        return (
            "<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>"
            "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
            f"<SEQUENCE_NATURAL_START>{protein_seq_1}<SEQUENCE_NATURAL_END>"
            "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
            f"<SEQUENCE_NATURAL_START>{protein_seq_2}<SEQUENCE_NATURAL_END><EOS>"
        )

    def crate_sample_dict(self, sample_inputs: dict, model_holder: MammalObjectBroker):
        """Build the tokenized sample dictionary for one pair of proteins.

        Args:
            sample_inputs: keyword arguments for ``generate_prompt``
                (``protein_seq_1`` and ``protein_seq_2``).
            model_holder: object whose ``tokenizer_op`` is used to tokenize
                the prompt.

        Returns:
            The sample dict holding the prompt string, the token ids and the
            attention mask, the latter two converted to ``torch`` tensors.
        """
        sample_dict = {ENCODER_INPUTS_STR: self.generate_prompt(**sample_inputs)}

        # The tokenizer op reads the prompt from ``key_in`` and returns the
        # sample dict with token ids and attention mask stored under the
        # requested output keys.
        sample_dict = model_holder.tokenizer_op(
            sample_dict=sample_dict,
            key_in=ENCODER_INPUTS_STR,
            key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
            key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
        )

        # Convert both tokenizer outputs to tensors before handing them to the model.
        for key in (ENCODER_INPUTS_TOKENS, ENCODER_INPUTS_ATTENTION_MASK):
            sample_dict[key] = torch.tensor(sample_dict[key])
        return sample_dict

    def run_model(self, sample_dict, model: Mammal):
        """Run generation on a single sample and return the model's batch dict.

        Args:
            sample_dict: tokenized sample as produced by ``crate_sample_dict``.
            model: the Mammal model to query.

        Returns:
            Whatever ``model.generate`` returns for a one-sample batch;
            ``decode_output`` reads ``CLS_PRED`` and ``SCORES`` from it.
        """
        return model.generate(
            [sample_dict],
            output_scores=True,
            return_dict_in_generate=True,
            max_new_tokens=5,
        )

    def decode_output(self, batch_dict, tokenizer_op: ModularTokenizerOp) -> list:
        """Extract the decoded text and the positive-class score.

        Args:
            batch_dict: output of ``run_model``.
            tokenizer_op: tokenizer used to decode the predicted tokens and to
                look up the positive token id.

        Returns:
            list: ``[generated_output, score]``.
        """
        # Only the first (and only) sample of the batch is decoded.
        generated_output = tokenizer_op._tokenizer.decode(batch_dict[CLS_PRED][0])
        # NOTE(review): index 1 presumably selects the generated position that
        # carries the class token - confirm against the model's output layout.
        positive_id = self.positive_token_id(tokenizer_op)
        score = batch_dict[SCORES][0][1][positive_id].item()
        return [generated_output, score]

    def create_and_run_prompt(self, model_name, protein_seq_1, protein_seq_2):
        """Click handler: build the prompt, run the model and collect outputs.

        Args:
            model_name: key into ``self.model_dict`` selecting the model holder.
            protein_seq_1: first protein sequence entered by the user.
            protein_seq_2: second protein sequence entered by the user.

        Returns:
            tuple: ``(prompt, generated_output, score)``, in the order of the
            output widgets wired up in ``create_demo``.
        """
        model_holder = self.model_dict[model_name]
        sample_dict = self.crate_sample_dict(
            sample_inputs={
                "protein_seq_1": protein_seq_1,
                "protein_seq_2": protein_seq_2,
            },
            model_holder=model_holder,
        )
        prompt = sample_dict[ENCODER_INPUTS_STR]
        batch_dict = self.run_model(sample_dict=sample_dict, model=model_holder.model)
        generated_output, score = self.decode_output(
            batch_dict, tokenizer_op=model_holder.tokenizer_op
        )
        return prompt, generated_output, score

    # NOTE(review): the annotation is ``gr.component`` (lowercase), kept as
    # written; it was likely meant to be the ``gr.Component`` class - confirm.
    def create_demo(self, model_name_widget: gr.component):
        """Build the (initially hidden) Gradio panel for this task.

        Args:
            model_name_widget: widget whose value is passed as ``model_name``
                to ``create_and_run_prompt`` when the run button is clicked.

        Returns:
            The ``gr.Group`` containing the task's widgets, with ``visible``
            set to ``False``.
        """
        with gr.Group() as demo:
            gr.Markdown(self.markup_text)
            with gr.Row():
                protein_seq_1 = gr.Textbox(
                    label="Protein 1 sequence",
                    interactive=True,
                    lines=3,
                    value=self.examples["protein_calmodulin"],
                )
                protein_seq_2 = gr.Textbox(
                    label="Protein 2 sequence",
                    interactive=True,
                    lines=3,
                    value=self.examples["protein_calcineurin"],
                )
            with gr.Row():
                run_mammal: gr.Button = gr.Button(
                    "Run Mammal prompt for Protein-Protein Interaction",
                    variant="primary",
                )
            with gr.Row():
                prompt_box = gr.Textbox(label="Mammal prompt", lines=5)
            with gr.Row():
                decoded = gr.Textbox(label="Mammal output")
                score_box = gr.Number(label="PPI score")
            # Outputs are listed in the order returned by create_and_run_prompt.
            run_mammal.click(
                fn=self.create_and_run_prompt,
                inputs=[model_name_widget, protein_seq_1, protein_seq_2],
                outputs=[prompt_box, decoded, score_box],
            )
            with gr.Row():
                gr.Markdown(
                    "```<SENTINEL_ID_0>``` contains the binding affinity class, which is ```<1>``` for interacting and ```<0>``` for non-interacting"
                )
        # The panel starts hidden.
        demo.visible = False
        return demo