biomed-multi-alignment

Sleeping

App Files Files Community

matanninio committed 23 days ago

Commit

f8080fc

•

1 Parent(s): 93d0d1a

improved handling of global all_models

Browse files

Files changed (5) hide show

__init__.py → mammal_demo/__init__.py +0 -0
demo_framework.py → mammal_demo/demo_framework.py +2 -1
mammal_demo/dti_task.py +117 -0
mammal_demo/ppi_task.py +152 -0
new_app.py +7 -261

__init__.py → mammal_demo/__init__.py RENAMED Viewed

File without changes

demo_framework.py → mammal_demo/demo_framework.py RENAMED Viewed

@@ -41,10 +41,11 @@ class MammalObjectBroker():
 class MammalTask(ABC):
-    def __init__(self, name:str) -> None:
             self.name = name
             self.description = None
             self._demo = None
     # @abstractmethod
     # def _generate_prompt(self, **kwargs) -> str:

 class MammalTask(ABC):
+    def __init__(self, name:str, model_dict: dict[str,MammalObjectBroker]) -> None:
             self.name = name
             self.description = None
             self._demo = None
+            self.model_dict = model_dict
     # @abstractmethod
     # def _generate_prompt(self, **kwargs) -> str:

mammal_demo/dti_task.py ADDED Viewed

	@@ -0,0 +1,117 @@

+import gradio as gr
+from mammal.examples.dti_bindingdb_kd.task import DtiBindingdbKdTask
+from mammal.keys import *
+from mammal.model import Mammal
+from mammal_demo.demo_framework import MammalObjectBroker, MammalTask
+class DtiTask(MammalTask):
+    """Gradio demo task: drug-target binding affinity prediction with a Mammal model.
+    Prompt construction and output post-processing are delegated to
+    ``DtiBindingdbKdTask``; this class only wires them to the demo widgets.
+    """
+    # Key under which the processed scalar prediction is read from the batch dict.
+    OUTPUT_KEY = "model.out.dti_bindingdb_kd"
+    # Constants handed to DtiBindingdbKdTask.process_model_output.
+    # NOTE(review): presumably the label mean/std used during fine-tuning - confirm.
+    NORM_Y_MEAN = 5.79384684128215
+    NORM_Y_STD = 1.33808027428196
+    def __init__(self, model_dict):
+        """Set the task name, description, example inputs and intro markdown.
+        Args:
+            model_dict (dict): maps model names to their MammalObjectBroker holders
+        """
+        super().__init__(name="Drug-Target Binding Affinity", model_dict=model_dict)
+        self.description = "Drug-Target Binding Affinity (tdi)"
+        self.examples = {
+            "target_seq": "NLMKRCTRGFRKLGKCTTLEEEKCKTLYPRGQCTCSDSKMNTHSCDCKSC",
+            "drug_seq": "CC(=O)NCCC1=CNc2c1cc(OC)cc2",
+        }
+        self.markup_text = """
+# Mammal based Target-Drug binding affinity demonstration
+Given a protein sequence and a drug (in SMILES), estimate the binding affinity.
+"""
+    def crate_sample_dict(self, sample_inputs:dict, model_holder:MammalObjectBroker):
+        """convert sample_inputs to sample_dict including creating a proper prompt
+        Args:
+            sample_inputs (dict): dictionary containing the inputs to the model
+                (keys "target_seq" and "drug_seq")
+            model_holder (MammalObjectBroker): model holder
+        Returns:
+           dict: sample_dict for feeding into model
+        """
+        # Work on a shallow copy so the caller's dictionary is left untouched.
+        sample_dict = dict(sample_inputs)
+        sample_dict = DtiBindingdbKdTask.data_preprocessing(
+            sample_dict=sample_dict,
+            tokenizer_op=model_holder.tokenizer_op,
+            target_sequence_key="target_seq",
+            drug_sequence_key="drug_seq",
+            norm_y_mean=None,
+            norm_y_std=None,
+            device=model_holder.model.device,
+        )
+        return sample_dict
+    def run_model(self, sample_dict, model: Mammal):
+        """Run the encoder-only forward pass on a single-sample batch.
+        Args:
+            sample_dict (dict): preprocessed sample from crate_sample_dict
+            model (Mammal): model to run
+        Returns:
+            the batch dict returned by ``model.forward_encoder_only``
+        """
+        batch_dict = model.forward_encoder_only([sample_dict])
+        return batch_dict
+    def decode_output(self,batch_dict, model_holder):
+        """Post-process the model output into the values shown in the demo.
+        Args:
+            batch_dict: output of run_model
+            model_holder: unused here; kept so all tasks share the same signature
+        Returns:
+            tuple: (output key name, prediction of the first sample as float)
+        """
+        batch_dict = DtiBindingdbKdTask.process_model_output(
+            batch_dict,
+            scalars_preds_processed_key=self.OUTPUT_KEY,
+            norm_y_mean=self.NORM_Y_MEAN,
+            norm_y_std=self.NORM_Y_STD,
+        )
+        ans = (
+            self.OUTPUT_KEY,
+            float(batch_dict[self.OUTPUT_KEY][0]),
+        )
+        return ans
+    def create_and_run_prompt(self,model_name,target_seq, drug_seq):
+        """Callback of the run button: build the prompt, run the model, decode.
+        Args:
+            model_name: key of the selected model in self.model_dict
+            target_seq: target protein sequence
+            drug_seq: drug in SMILES notation
+        Returns:
+            tuple: (prompt, output key name, predicted value)
+        """
+        model_holder = self.model_dict[model_name]
+        inputs = {
+            "target_seq": target_seq,
+            "drug_seq": drug_seq,
+        }
+        sample_dict = self.crate_sample_dict(sample_inputs=inputs, model_holder=model_holder)
+        prompt = sample_dict[ENCODER_INPUTS_STR]
+        batch_dict = self.run_model(sample_dict=sample_dict, model=model_holder.model)
+        res = prompt, *self.decode_output(batch_dict,model_holder=model_holder)
+        return res
+    def create_demo(self,model_name_widget):
+        """Build the (initially hidden) widget group of this task.
+        Args:
+            model_name_widget: widget whose value is the selected model name
+        Returns:
+            gr.Group: the group holding all widgets of this task
+        """
+        with gr.Group() as demo:
+            gr.Markdown(self.markup_text)
+            with gr.Row():
+                target_textbox = gr.Textbox(
+                    label="target sequence",
+                    # info="standard",
+                    interactive=True,
+                    lines=3,
+                    value=self.examples["target_seq"],
+                )
+                drug_textbox = gr.Textbox(
+                    label="Drug sequence (in SMILES)",
+                    # info="standard",
+                    interactive=True,
+                    lines=3,
+                    value=self.examples["drug_seq"],
+                )
+            with gr.Row():
+                # The label used to be a copy of the PPI task's button text.
+                run_mammal = gr.Button(
+                    "Run Mammal prompt for Drug-Target Binding Affinity", variant="primary"
+                )
+            with gr.Row():
+                prompt_box = gr.Textbox(label="Mammal prompt", lines=5)
+            with gr.Row():
+                decoded = gr.Textbox(label="Mammal output key")
+                # The Number output is created inline so it is placed in this row.
+                run_mammal.click(
+                    fn=self.create_and_run_prompt,
+                    inputs=[model_name_widget, target_textbox, drug_textbox],
+                    outputs=[prompt_box, decoded, gr.Number(label="binding affinity")],
+                )
+            # Hidden on creation; visibility is left for the surrounding app to change.
+            demo.visible = False
+            return demo

mammal_demo/ppi_task.py ADDED Viewed

	@@ -0,0 +1,152 @@

+import gradio as gr
+import torch
+from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp
+from mammal.examples.dti_bindingdb_kd.task import DtiBindingdbKdTask
+from mammal.keys import *
+from mammal.model import Mammal
+from mammal_demo.demo_framework import MammalObjectBroker, MammalTask
+class PpiTask(MammalTask):
+    """Gradio demo task: protein-protein interaction prediction with a Mammal model.
+    The task builds a text prompt from two protein sequences, tokenizes it, lets
+    the model generate a few tokens and reports the decoded output together with
+    the score of the positive ("<1>") token.
+    """
+    def __init__(self, model_dict):
+        """Set the task name, description, example inputs and intro markdown.
+        Args:
+            model_dict (dict): maps model names to their MammalObjectBroker holders
+        """
+        super().__init__(name="Protein-Protein Interaction", model_dict=model_dict)
+        self.description = "Protein-Protein Interaction (PPI)"
+        self.examples = {
+            "protein_calmodulin": "MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMISELDQDGFIDKEDLHDGDGKISFEEFLNLVNKEMTADVDGDGQVNYEEFVTMMTSK",
+            "protein_calcineurin": "MSSKLLLAGLDIERVLAEKNFYKEWDTWIIEAMNVGDEEVDRIKEFKEDEIFEEAKTLGTAEMQEYKKQKLEEAIEGAFDIFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIRQMWDQNGDWDRIKELKFGEIKKLSAKDTRGTIFIKVFENLGTGVDSEYEDVSKYMLKHQ",
+        }
+        # Must be an f-string (and come after self.description is set), otherwise
+        # the literal text "{self.description}" is shown in the demo.
+        self.markup_text = f"""
+    # Mammal based {self.description} demonstration
+    Given two protein sequences, estimate if the proteins interact or not."""
+    @staticmethod
+    def positive_token_id(model_holder: MammalObjectBroker):
+        """token for positive binding
+        Args:
+            model_holder (MammalObjectBroker): model holder providing the tokenizer
+        Returns:
+            int: id of positive binding token
+        """
+        return model_holder.tokenizer_op.get_token_id("<1>")
+    def generate_prompt(self, prot1, prot2):
+        """Formatting prompt to match pre-training syntax
+        Args:
+            prot1 (str): sequence of protein number 1
+            prot2 (str): sequence of protein number 2
+        Returns:
+            str: prompt
+        """
+        # Adjacent literals are concatenated, but the f prefix applies to each
+        # literal separately - every part holding a placeholder needs its own.
+        prompt = (
+            "<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>"
+            "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
+            f"<SEQUENCE_NATURAL_START>{prot1}<SEQUENCE_NATURAL_END>"
+            "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
+            f"<SEQUENCE_NATURAL_START>{prot2}<SEQUENCE_NATURAL_END><EOS>"
+        )
+        return prompt
+    def crate_sample_dict(self,sample_inputs: dict, model_holder:MammalObjectBroker):
+        """Build the prompt and tokenize it into a sample dict for the model.
+        Args:
+            sample_inputs (dict): protein sequences under the keys "prot1" and "prot2"
+            model_holder (MammalObjectBroker): model holder providing the tokenizer
+        Returns:
+            dict: sample_dict with the prompt, token ids and attention mask
+        """
+        sample_dict = dict()
+        # Keyword-unpack: a single "*" would pass the dict KEYS ("prot1", "prot2")
+        # to generate_prompt instead of the sequences themselves.
+        prompt = self.generate_prompt(**sample_inputs)
+        sample_dict[ENCODER_INPUTS_STR] = prompt
+        # Tokenize
+        sample_dict = model_holder.tokenizer_op(
+            sample_dict=sample_dict,
+            key_in=ENCODER_INPUTS_STR,
+            key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
+            key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
+        )
+        # The tokenizer output is converted to tensors before being fed to the model.
+        sample_dict[ENCODER_INPUTS_TOKENS] = torch.tensor(
+            sample_dict[ENCODER_INPUTS_TOKENS]
+        )
+        sample_dict[ENCODER_INPUTS_ATTENTION_MASK] = torch.tensor(
+            sample_dict[ENCODER_INPUTS_ATTENTION_MASK]
+        )
+        return sample_dict
+    def run_model(self, sample_dict, model: Mammal):
+        """Generate up to 5 new tokens (with scores) for a single-sample batch.
+        Args:
+            sample_dict (dict): tokenized sample from crate_sample_dict
+            model (Mammal): model to run
+        Returns:
+            the batch dict returned by ``model.generate``
+        """
+        batch_dict = model.generate(
+            [sample_dict],
+            output_scores=True,
+            return_dict_in_generate=True,
+            max_new_tokens=5,
+        )
+        return batch_dict
+    def decode_output(self,batch_dict, model_holder:MammalObjectBroker):
+        """Decode the generated tokens and read the positive-token score.
+        Args:
+            batch_dict: output of run_model
+            model_holder (MammalObjectBroker): model holder providing the tokenizer
+        Returns:
+            tuple: (decoded generated text, score of the "<1>" token)
+        """
+        generated_output = model_holder.tokenizer_op._tokenizer.decode(batch_dict[CLS_PRED][0])
+        # NOTE(review): [0] presumably selects the only sample and [1] the position
+        # of the generated class token - confirm against Mammal.generate.
+        score = batch_dict["model.out.scores"][0][1][self.positive_token_id(model_holder)].item()
+        return generated_output, score
+    def create_and_run_prompt(self,model_name,protein1, protein2):
+        """Callback of the run button: build the prompt, run the model, decode.
+        Args:
+            model_name: key of the selected model in self.model_dict
+            protein1: sequence of the first protein
+            protein2: sequence of the second protein
+        Returns:
+            tuple: (prompt, decoded model output, positive-token score)
+        """
+        model_holder = self.model_dict[model_name]
+        # Keys must match the parameter names of generate_prompt.
+        sample_inputs = {
+            "prot1": protein1,
+            "prot2": protein2,
+        }
+        sample_dict = self.crate_sample_dict(sample_inputs=sample_inputs, model_holder=model_holder)
+        prompt = sample_dict[ENCODER_INPUTS_STR]
+        batch_dict = self.run_model(sample_dict=sample_dict, model=model_holder.model)
+        res = prompt, *self.decode_output(batch_dict,model_holder=model_holder)
+        return res
+    def create_demo(self,model_name_widget:gr.component):
+        """Build the (initially hidden) widget group of this task.
+        Args:
+            model_name_widget: widget whose value is the selected model name
+        Returns:
+            gr.Group: the group holding all widgets of this task
+        """
+        with gr.Group() as demo:
+            gr.Markdown(self.markup_text)
+            with gr.Row():
+                prot1 = gr.Textbox(
+                    label="Protein 1 sequence",
+                    # info="standard",
+                    interactive=True,
+                    lines=3,
+                    value=self.examples["protein_calmodulin"],
+                )
+                prot2 = gr.Textbox(
+                    label="Protein 2 sequence",
+                    # info="standard",
+                    interactive=True,
+                    lines=3,
+                    value=self.examples["protein_calcineurin"],
+                )
+            with gr.Row():
+                run_mammal: gr.Button = gr.Button(
+                    "Run Mammal prompt for Protein-Protein Interaction", variant="primary"
+                )
+            with gr.Row():
+                prompt_box = gr.Textbox(label="Mammal prompt", lines=5)
+            with gr.Row():
+                decoded = gr.Textbox(label="Mammal output")
+                # The Number output is created inline so it is placed in this row.
+                run_mammal.click(
+                    fn=self.create_and_run_prompt,
+                    inputs=[model_name_widget, prot1, prot2],
+                    outputs=[prompt_box, decoded, gr.Number(label="PPI score")],
+                )
+            with gr.Row():
+                gr.Markdown(
+                    "```<SENTINEL_ID_0>``` contains the binding affinity class, which is ```<1>``` for interacting and ```<0>``` for non-interacting"
+                )
+            # Hidden on creation; visibility is left for the surrounding app to change.
+            demo.visible = False
+            return demo

new_app.py CHANGED Viewed

@@ -1,273 +1,19 @@
 import gradio as gr
-import torch
-from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp
-from mammal.examples.dti_bindingdb_kd.task import DtiBindingdbKdTask
 from mammal.keys import *
-from mammal.model import Mammal
-from demo_framework import MammalObjectBroker, MammalTask
-all_tasks = dict()
-all_models= dict()
-class PpiTask(MammalTask):
-    def __init__(self):
-        super().__init__(name="Protein-Protein Interaction")
-        self.description = "Protein-Protein Interaction (PPI)"
-        self.examples = {
-            "protein_calmodulin": "MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMISELDQDGFIDKEDLHDGDGKISFEEFLNLVNKEMTADVDGDGQVNYEEFVTMMTSK",
-            "protein_calcineurin": "MSSKLLLAGLDIERVLAEKNFYKEWDTWIIEAMNVGDEEVDRIKEFKEDEIFEEAKTLGTAEMQEYKKQKLEEAIEGAFDIFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIRQMWDQNGDWDRIKELKFGEIKKLSAKDTRGTIFIKVFENLGTGVDSEYEDVSKYMLKHQ",
-        }
-        self.markup_text = """
-    # Mammal based {self.description} demonstration
-    Given two protein sequences, estimate if the proteins interact or not."""
-    @staticmethod
-    def positive_token_id(model_holder: MammalObjectBroker):
-        """token for positive binding
-        Args:
-            model (MammalTrainedModel): model holding tokenizer
-        Returns:
-            int: id of positive binding token
-        """
-        return model_holder.tokenizer_op.get_token_id("<1>")
-    def generate_prompt(self, prot1, prot2):
-        """Formatting prompt to match pre-training syntax
-        Args:
-            prot1 (str): sequance of protein number 1
-            prot2 (str): sequance of protein number 2
-        Returns:
-            str: prompt
-        """
-        prompt =  f"<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>"\
-            "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"\
-            "<SEQUENCE_NATURAL_START>{prot1}<SEQUENCE_NATURAL_END>"\
-            "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"\
-            "<SEQUENCE_NATURAL_START>{prot2}<SEQUENCE_NATURAL_END><EOS>"
-        return prompt
-    def crate_sample_dict(self,sample_inputs: dict, model_holder:MammalObjectBroker):
-        # Create and load sample
-        sample_dict = dict()
-        prompt = self.generate_prompt(*sample_inputs)
-        sample_dict[ENCODER_INPUTS_STR] = prompt
-        # Tokenize
-        sample_dict = model_holder.tokenizer_op(
-            sample_dict=sample_dict,
-            key_in=ENCODER_INPUTS_STR,
-            key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
-            key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
-        )
-        sample_dict[ENCODER_INPUTS_TOKENS] = torch.tensor(
-            sample_dict[ENCODER_INPUTS_TOKENS]
-        )
-        sample_dict[ENCODER_INPUTS_ATTENTION_MASK] = torch.tensor(
-            sample_dict[ENCODER_INPUTS_ATTENTION_MASK]
-        )
-        return sample_dict
-    def run_model(self, sample_dict, model: Mammal):
-        # Generate Prediction
-        batch_dict = model.generate(
-            [sample_dict],
-            output_scores=True,
-            return_dict_in_generate=True,
-            max_new_tokens=5,
-        )
-        return batch_dict
-    def decode_output(self,batch_dict, model_holder:MammalObjectBroker):
-        # Get output
-        generated_output = model_holder.tokenizer_op._tokenizer.decode(batch_dict[CLS_PRED][0])
-        score = batch_dict["model.out.scores"][0][1][self.positive_token_id(model_holder)].item()
-        return generated_output, score
-    def create_and_run_prompt(self,model_name,protein1, protein2):
-        model_holder = all_models[model_name]
-        sample_inputs = {"prot1":protein1,
-                  "prot2":protein2
-                  }
-        sample_dict = self.crate_sample_dict(sample_inputs=sample_inputs, model_holder=model_holder)
-        prompt = sample_dict[ENCODER_INPUTS_STR]
-        batch_dict = self.run_model(sample_dict=sample_dict, model=model_holder.model)
-        res = prompt, *self.decode_output(batch_dict,model_holder=model_holder)
-        return res
-    def create_demo(self,model_name_widget:gr.component):
-    # """
-    # ### Using the model from
-    # ```{model} ```
-    # """
-        with gr.Group() as demo:
-            gr.Markdown(self.markup_text)
-            with gr.Row():
-                prot1 = gr.Textbox(
-                    label="Protein 1 sequence",
-                    # info="standard",
-                    interactive=True,
-                    lines=3,
-                    value=self.examples["protein_calmodulin"],
-                )
-                prot2 = gr.Textbox(
-                    label="Protein 2 sequence",
-                    # info="standard",
-                    interactive=True,
-                    lines=3,
-                    value=self.examples["protein_calcineurin"],
-                )
-            with gr.Row():
-                run_mammal: gr.Button = gr.Button(
-                    "Run Mammal prompt for Protein-Protein Interaction", variant="primary"
-                )
-            with gr.Row():
-                prompt_box = gr.Textbox(label="Mammal prompt", lines=5)
-            with gr.Row():
-                decoded = gr.Textbox(label="Mammal output")
-                run_mammal.click(
-                    fn=self.create_and_run_prompt,
-                    inputs=[model_name_widget, prot1, prot2],
-                    outputs=[prompt_box, decoded, gr.Number(label="PPI score")],
-                )
-            with gr.Row():
-                gr.Markdown(
-                    "```<SENTINEL_ID_0>``` contains the binding affinity class, which is ```<1>``` for interacting and ```<0>``` for non-interacting"
-                )
-            demo.visible = False
-            return demo
-ppi_task = PpiTask()
 all_tasks[ppi_task.name]=ppi_task
-class DtiTask(MammalTask):
-    def __init__(self):
-        super().__init__(name="Drug-Target Binding Affinity")
-        self.description = "Drug-Target Binding Affinity (tdi)"
-        self.examples = {
-            "target_seq": "NLMKRCTRGFRKLGKCTTLEEEKCKTLYPRGQCTCSDSKMNTHSCDCKSC",
-            "drug_seq":"CC(=O)NCCC1=CNc2c1cc(OC)cc2"
-            }
-        self.markup_text = """
-# Mammal based Target-Drug binding affinity demonstration
-Given a protein sequence and a drug (in SMILES), estimate the binding affinity.
-"""
-    def crate_sample_dict(self, sample_inputs:dict, model_holder:MammalObjectBroker):
-        """convert sample_inputs to sample_dict including creating a proper prompt
-        Args:
-            sample_inputs (dict): dictionary containing the inputs to the model
-            model_holder (MammalObjectBroker): model holder
-        Returns:
-           dict: sample_dict for feeding into model
-        """
-        sample_dict = dict(sample_inputs)
-        sample_dict = DtiBindingdbKdTask.data_preprocessing(
-            sample_dict=sample_dict,
-            tokenizer_op=model_holder.tokenizer_op,
-            target_sequence_key="target_seq",
-            drug_sequence_key="drug_seq",
-            norm_y_mean=None,
-            norm_y_std=None,
-            device=model_holder.model.device,
-        )
-        return sample_dict
-    def run_model(self, sample_dict, model: Mammal):
-        # Generate Prediction
-        batch_dict = model.forward_encoder_only([sample_dict])
-        return batch_dict
-    def decode_output(self,batch_dict, model_holder):
-        # Get output
-        batch_dict = DtiBindingdbKdTask.process_model_output(
-            batch_dict,
-            scalars_preds_processed_key="model.out.dti_bindingdb_kd",
-            norm_y_mean=5.79384684128215,
-            norm_y_std=1.33808027428196,
-            )
-        ans = (
-        "model.out.dti_bindingdb_kd",
-        float(batch_dict["model.out.dti_bindingdb_kd"][0]),
-        )
-        return ans
-    def create_and_run_prompt(self,model_name,target_seq, drug_seq):
-        model_holder = all_models[model_name]
-        inputs = {
-            "target_seq": target_seq,
-            "drug_seq": drug_seq,
-        }
-        sample_dict = self.crate_sample_dict(sample_inputs=inputs, model_holder=model_holder)
-        prompt=sample_dict[ENCODER_INPUTS_STR]
-        batch_dict = self.run_model(sample_dict=sample_dict, model=model_holder.model)
-        res = prompt, *self.decode_output(batch_dict,model_holder=model_holder)
-        return res
-    def create_demo(self,model_name_widget):
-    # """
-    # ### Using the model from
-    # ```{model} ```
-    # """
-        with gr.Group() as demo:
-            gr.Markdown(self.markup_text)
-            with gr.Row():
-                target_textbox = gr.Textbox(
-                    label="target sequence",
-                    # info="standard",
-                    interactive=True,
-                    lines=3,
-                    value=self.examples["target_seq"],
-                )
-                drug_textbox = gr.Textbox(
-                    label="Drug sequance (in SMILES)",
-                    # info="standard",
-                    interactive=True,
-                    lines=3,
-                    value=self.examples["drug_seq"],
-                )
-            with gr.Row():
-                run_mammal = gr.Button(
-                    "Run Mammal prompt for Protein-Protein Interaction", variant="primary"
-                )
-            with gr.Row():
-                prompt_box = gr.Textbox(label="Mammal prompt", lines=5)
-            with gr.Row():
-                decoded = gr.Textbox(label="Mammal output key")
-                run_mammal.click(
-                    fn=self.create_and_run_prompt,
-                    inputs=[model_name_widget, target_textbox, drug_textbox],
-                    outputs=[prompt_box, decoded, gr.Number(label="binding affinity")],
-                )
-            demo.visible = False
-            return demo
-tdi_task = DtiTask()
 all_tasks[tdi_task.name]=tdi_task
 ppi_model = MammalObjectBroker(model_path="ibm/biomed.omics.bl.sm.ma-ted-458m", task_list=[ppi_task.name])

 import gradio as gr
 from mammal.keys import *
+from mammal_demo.demo_framework import MammalObjectBroker
+from mammal_demo.ppi_task import PpiTask
+from mammal_demo.dti_task import DtiTask
+all_tasks = dict()
+all_models= dict()
+ppi_task = PpiTask(model_dict = all_models)
 all_tasks[ppi_task.name]=ppi_task
+tdi_task = DtiTask(model_dict = all_models)
 all_tasks[tdi_task.name]=tdi_task
 ppi_model = MammalObjectBroker(model_path="ibm/biomed.omics.bl.sm.ma-ted-458m", task_list=[ppi_task.name])