LTEnjoy committed
Commit 619ec19
1 Parent(s): d578510

Add application file
Dockerfile CHANGED
@@ -5,8 +5,6 @@ FROM continuumio/anaconda3:main
 
 WORKDIR /code
 
-WORKDIR /code
-
 COPY ./requirements.txt /code/requirements.txt
 
 RUN apt-get update
bin/README.md ADDED
@@ -0,0 +1 @@
+# Place the Foldseek binary file here
demo/__init__.py ADDED
File without changes
demo/modules/__init__.py ADDED
@@ -0,0 +1,19 @@
+import sys
+
+sys.path += []
+
+import argparse
+
+
+def main():
+    pass
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    return parser.parse_args()
+
+
+if __name__ == '__main__':
+    args = get_args()
+    main()
demo/modules/compute_score.py ADDED
@@ -0,0 +1,113 @@
+import gradio as gr
+import torch
+
+from .init_model import model
+from utils.foldseek_util import get_struc_seq
+
+
+def compute_seq_text_score(input_1: str, input_2: str):
+    with torch.no_grad():
+        protein_embedding = model.get_protein_repr([input_1])
+        text_embedding = model.get_text_repr([input_2])
+        score = text_embedding @ protein_embedding.T / model.temperature
+
+    return f"{score.item():.4f}"
+
+
+def compute_struc_text_score(input_1: str, input_2: str):
+    with torch.no_grad():
+        protein_embedding = model.get_structure_repr([input_1])
+        text_embedding = model.get_text_repr([input_2])
+        score = text_embedding @ protein_embedding.T / model.temperature
+
+    return f"{score.item():.4f}"
+
+
+def compute_seq_struc_score(input_1: str, input_2: str):
+    with torch.no_grad():
+        protein_embedding_1 = model.get_protein_repr([input_1])
+        protein_embedding_2 = model.get_structure_repr([input_2])
+        score = protein_embedding_1 @ protein_embedding_2.T / model.temperature
+
+    return f"{score.item():.4f}"
+
+
+# Parse the uploaded structure file and return the sequence
+def pdb2seq(file):
+    parsed_seqs = get_struc_seq("/sujin/bin/foldseek", file)
+
+    for seqs in parsed_seqs.values():
+        return seqs[0]
+
+
+# Parse the uploaded structure file and return the foldseek sequence
+def pdb2foldseek(file):
+    parsed_seqs = get_struc_seq("/sujin/bin/foldseek", file)
+
+    for seqs in parsed_seqs.values():
+        return seqs[1].lower()
+
+
+# Build the block for computing protein-text similarity
+def build_score_computation():
+    gr.Markdown("# Compute similarity score between two modalities")
+    with gr.Row(equal_height=True):
+        with gr.Column():
+            # Compute similarity score between sequence and text
+            with gr.Tab("sequence - text"):
+                with gr.Row():
+                    seq_text_input_1 = gr.Textbox(label="sequence")
+
+                    # Provide an upload button to upload a pdb file
+                    upload_btn = gr.UploadButton(label="Upload .pdb/.cif file", scale=0)
+                    upload_btn.upload(pdb2seq, inputs=[upload_btn], outputs=[seq_text_input_1])
+
+                seq_text_input_2 = gr.Textbox(label="text")
+                seq_text_examples = gr.Examples(examples=[["MSATAEQNARNPKGKGGFARTVSQRKRKRLFLIGGALAVLAVAVGLMLTAFNQDIRFFRTPADLTEQDMTSGARFRLGGLVEEGSVSRTGSELRFTVTDTIKTVKVVFEGIPPDLFREGQGVVAEGRFGSDGLFRADNVLAKHDENYVPKDLADSLKKKGVWEGK", "Proteins with zinc bindings."],
+                                                          ["MITLDWEKANGLITTVVQDATTKQVLMVAYMNQESLAKTMATGETWFWSRSRKTLWHKGATSGNIQTVKTIAVDCDADTLLVTVDPAGPACHTGHISCFYRHYPEGKDLT", "Proteins locating at cell membrane."],
+                                                          ["MDLKQYVSEVQDWPKPGVSFKDITTIMDNGEAYGYATDKIVEYAKDRDVDIVVGPEARGFIIGCPVAYSMGIGFAPVRKEGKLPREVIRYEYDLEYGTNVLTMHKDAIKPGQRVLITDDLLATGGTIEAAIKLVEKLGGIVVGIAFIIELKYLNGIEKIKDYDVMSLISYDE", "Human represents the name assigned to the organism responsible for the protein sequence."]],
+                                                inputs=[seq_text_input_1, seq_text_input_2])
+                seq_text_btn = gr.Button(value="Compute")
+
+            # Compute similarity score between structure and text
+            with gr.Tab("structure - text"):
+                with gr.Row():
+                    struc_text_input_1 = gr.Textbox(label="structure")
+
+                    # Provide an upload button to upload a pdb file
+                    upload_btn = gr.UploadButton(label="Upload .pdb/.cif file", scale=0)
+                    upload_btn.upload(pdb2foldseek, inputs=[upload_btn], outputs=[struc_text_input_1])
+
+                struc_text_input_2 = gr.Textbox(label="text")
+                struc_text_examples = gr.Examples(examples=[["dddddddddddddddpdpppvcppvnvvvvvvvvvvvvvvvvvvvvvvvvvvqdpqdedeqvrddpcqqpvqhkhkykafwappqwdddpqkiwtwghnppgiaieieghdappqddhrfikifiaghdpvrhtygdhidtdddpddddvvnvvvcvvvvndpdd", "Proteins with zinc bindings."],
+                                                            ["dddadcpvpvqkakefeaeppprdtadiaiagpvqvvvcvvpqwhwgqdpvvrdidgqcpvpvqiwrwddwdaddnrryiytythtpahsdpvrhvhpppadvvgpddpd", "Proteins locating at cell membrane."],
+                                                            ["dplvvqwdwdaqpphhpdtdthcvscvvppvslvvqlvvvlvvcvvqvaqeeeeepdqrcsnrvsscvvvvhyywykyfpppddaawdwdwdddppgitiiithlpseaaageyeyegaeqalqprvlrvvvrcvvnnyddaeyeyqeyevcrvncvsvvvhhydyvyydpd", "Human represents the name assigned to the organism responsible for the protein sequence."]],
+                                                  inputs=[struc_text_input_1, struc_text_input_2])
+                struc_text_btn = gr.Button(value="Compute")
+
+            # Compute similarity score between sequence and structure
+            with gr.Tab("sequence - structure"):
+                with gr.Row():
+                    seq_struc_input_1 = gr.Textbox(label="sequence")
+
+                    # Provide an upload button to upload a pdb file
+                    upload_btn = gr.UploadButton(label="Upload .pdb/.cif file", scale=0)
+                    upload_btn.upload(pdb2seq, inputs=[upload_btn], outputs=[seq_struc_input_1])
+
+                with gr.Row():
+                    seq_struc_input_2 = gr.Textbox(label="structure")
+
+                    # Provide an upload button to upload a pdb file
+                    upload_btn = gr.UploadButton(label="Upload .pdb/.cif file", scale=0)
+                    upload_btn.upload(pdb2foldseek, inputs=[upload_btn], outputs=[seq_struc_input_2])
+
+                seq_struc_examples = gr.Examples(examples=[["MSATAEQNARNPKGKGGFARTVSQRKRKRLFLIGGALAVLAVAVGLMLTAFNQDIRFFRTPADLTEQDMTSGARFRLGGLVEEGSVSRTGSELRFTVTDTIKTVKVVFEGIPPDLFREGQGVVAEGRFGSDGLFRADNVLAKHDENYVPKDLADSLKKKGVWEGK", "dddadcpvpvqkakefeaeppprdtadiaiagpvqvvvcvvpqwhwgqdpvvrdidgqcpvpvqiwrwddwdaddnrryiytythtpahsdpvrhvhpppadvvgpddpd"],
+                                                           ["MITLDWEKANGLITTVVQDATTKQVLMVAYMNQESLAKTMATGETWFWSRSRKTLWHKGATSGNIQTVKTIAVDCDADTLLVTVDPAGPACHTGHISCFYRHYPEGKDLT", "dddddddddddddddpdpppvcppvnvvvvvvvvvvvvvvvvvvvvvvvvvvqdpqdedeqvrddpcqqpvqhkhkykafwappqwdddpqkiwtwghnppgiaieieghdappqddhrfikifiaghdpvrhtygdhidtdddpddddvvnvvvcvvvvndpdd"],
+                                                           ["MDLKQYVSEVQDWPKPGVSFKDITTIMDNGEAYGYATDKIVEYAKDRDVDIVVGPEARGFIIGCPVAYSMGIGFAPVRKEGKLPREVIRYEYDLEYGTNVLTMHKDAIKPGQRVLITDDLLATGGTIEAAIKLVEKLGGIVVGIAFIIELKYLNGIEKIKDYDVMSLISYDE", "dplvvqwdwdaqpphhpdtdthcvscvvppvslvvqlvvvlvvcvvqvaqeeeeepdqrcsnrvsscvvvvhyywykyfpppddaawdwdwdddppgitiiithlpseaaageyeyegaeqalqprvlrvvvrcvvnnyddaeyeyqeyevcrvncvsvvvhhydyvyydpd"]],
+                                                 inputs=[seq_struc_input_1, seq_struc_input_2])
+                seq_struc_btn = gr.Button(value="Compute")
+
+    similarity_score = gr.Label(label="similarity score")
+    seq_text_btn.click(fn=compute_seq_text_score, inputs=[seq_text_input_1, seq_text_input_2], outputs=[similarity_score])
+    struc_text_btn.click(fn=compute_struc_text_score, inputs=[struc_text_input_1, struc_text_input_2], outputs=[similarity_score])
+    seq_struc_btn.click(fn=compute_seq_struc_score, inputs=[seq_struc_input_1, seq_struc_input_2], outputs=[similarity_score])
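Each `Compute` button above reports the same quantity: the dot product of two L2-normalized embeddings divided by the learned temperature. A minimal standalone sketch of that scoring rule (the `toy_encode` helper is hypothetical, standing in for `model.get_*_repr`):

```python
import torch

def toy_encode(x: str, dim: int = 8) -> torch.Tensor:
    # Hypothetical stand-in for model.get_protein_repr / get_text_repr:
    # returns one L2-normalized embedding for the input string.
    torch.manual_seed(len(x))
    return torch.nn.functional.normalize(torch.randn(1, dim), dim=-1)

temperature = 0.07                                 # plays the role of model.temperature
a = toy_encode("MSATAEQNARNPK")                    # truncated sequence, illustrative only
b = toy_encode("Proteins with zinc bindings.")
score = a @ b.T / temperature                      # cosine similarity / temperature
print(f"{score.item():.4f}")
```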
demo/modules/init_model.py ADDED
@@ -0,0 +1,77 @@
+import faiss
+import pandas as pd
+import os
+
+from utils.constants import sequence_level
+from model.ProtTrek.protrek_trimodal_model import ProTrekTrimodalModel
+
+
+def load_model():
+    config = {
+        "protein_config": "weights/ProTrek_35M_UniRef50/esm2_t12_35M_UR50D",
+        "text_config": "weights/ProTrek_35M_UniRef50/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
+        "structure_config": "weights/ProTrek_35M_UniRef50/foldseek_t12_35M",
+        "load_protein_pretrained": False,
+        "load_text_pretrained": False,
+        "from_checkpoint": "weights/ProTrek_35M_UniRef50/ProTrek_35M_UniRef50.pt"
+    }
+
+    model = ProTrekTrimodalModel(**config)
+    model.eval()
+    return model
+
+
+def load_index():
+    index_dir = "weights/faiss_index/faiss_index_ProTrek_35M_UniRef50"
+    all_index = {}
+
+    # Load protein sequence index
+    index_path = f"{index_dir}/sequence.index"
+    sequence_index = faiss.read_index(index_path)
+
+    id_path = f"{index_dir}/sequence_ids.tsv"
+    uniprot_ids = pd.read_csv(id_path, sep="\t", header=None).values.flatten()
+
+    all_index["sequence"] = {"index": sequence_index, "ids": uniprot_ids}
+
+    # Load protein structure index
+    index_path = f"{index_dir}/structure.index"
+    structure_index = faiss.read_index(index_path)
+
+    id_path = f"{index_dir}/structure_ids.tsv"
+    uniprot_ids = pd.read_csv(id_path, sep="\t", header=None).values.flatten()
+
+    all_index["structure"] = {"index": structure_index, "ids": uniprot_ids}
+
+    # Load text index
+    all_index["text"] = {}
+    text_dir = f"{index_dir}/text"
+
+    # Remove "Taxonomic lineage" from sequence_level. This is a special case which we don't need to index.
+    valid_subsections = set()
+    sequence_level.add("Global")
+    for subsection in sequence_level:
+        index_path = f"{text_dir}/{subsection.replace(' ', '_')}.index"
+        if not os.path.exists(index_path):
+            continue
+
+        text_index = faiss.read_index(index_path)
+
+        id_path = f"{text_dir}/{subsection.replace(' ', '_')}_ids.tsv"
+        text_ids = pd.read_csv(id_path, sep="\t", header=None).values.flatten()
+
+        all_index["text"][subsection] = {"index": text_index, "ids": text_ids}
+        valid_subsections.add(subsection)
+
+    return all_index, valid_subsections
+
+
+device = "cuda"
+
+print("Loading model...")
+model = load_model()
+model.to(device)
+
+print("Loading index...")
+all_index, valid_subsections = load_index()
+print("Done...")
demo/modules/search.py ADDED
@@ -0,0 +1,163 @@
+import gradio as gr
+import torch
+import pandas as pd
+
+from utils.foldseek_util import get_struc_seq
+from .init_model import model, all_index
+
+
+# Samples for input
+samples = [
+    ["Proteins with zinc bindings."],
+    ["Proteins locating at cell membrane."],
+    ["Protein that serves as an enzyme."]
+]
+
+# Choices for subsection type
+# valid_subsections = {"Function", "Subcellular location", "Protein names", "Sequence similarities", "GO annotation", "Global"}
+valid_subsections = all_index["text"].keys()
+# Sort the subsections
+valid_subsections = sorted(valid_subsections)
+
+
+def clear_results():
+    return ""
+
+
+# Search from database
+def search(input: str, topk: int, input_type: str, query_type: str, subsection_type: str):
+    input_modality = input_type.split(" ")[-1].replace("sequence", "protein")
+    with torch.no_grad():
+        input_embedding = getattr(model, f"get_{input_modality}_repr")([input]).cpu().numpy()
+
+    output_modality = query_type.split(" ")[-1]
+    if output_modality == "text":
+        index = all_index["text"][subsection_type]["index"]
+        ids = all_index["text"][subsection_type]["ids"]
+
+    else:
+        index = all_index[output_modality]["index"]
+        ids = all_index[output_modality]["ids"]
+
+    scores, ranks = index.search(input_embedding, topk)
+    scores = scores / model.temperature.item()
+
+    # Get topk ids
+    topk_ids = []
+    for rank in ranks[0]:
+        now_id = ids[rank]
+        if query_type == "text":
+            topk_ids.append(now_id)
+        else:
+            # Provide link to uniprot website
+            topk_ids.append(f"[{now_id}](https://www.uniprot.org/uniprotkb/{now_id})")
+
+    df = pd.DataFrame({"Id": topk_ids, "Matching score": scores[0]})
+    output = df.to_markdown()
+
+    return output
+
+
+def change_input_type(choice: str):
+    # Change examples if input type is changed
+    global samples
+    if choice == "text":
+        samples = [
+            ["Proteins with zinc bindings."],
+            ["Proteins locating at cell membrane."],
+            ["Protein that serves as an enzyme."]
+        ]
+
+    elif choice == "protein sequence":
+        samples = [
+            ["MSATAEQNARNPKGKGGFARTVSQRKRKRLFLIGGALAVLAVAVGLMLTAFNQDIRFFRTPADLTEQDMTSGARFRLGGLVEEGSVSRTGSELRFTVTDTIKTVKVVFEGIPPDLFREGQGVVAEGRFGSDGLFRADNVLAKHDENYVPKDLADSLKKKGVWEGK"],
+            ["MITLDWEKANGLITTVVQDATTKQVLMVAYMNQESLAKTMATGETWFWSRSRKTLWHKGATSGNIQTVKTIAVDCDADTLLVTVDPAGPACHTGHISCFYRHYPEGKDLT"],
+            ["MDLKQYVSEVQDWPKPGVSFKDITTIMDNGEAYGYATDKIVEYAKDRDVDIVVGPEARGFIIGCPVAYSMGIGFAPVRKEGKLPREVIRYEYDLEYGTNVLTMHKDAIKPGQRVLITDDLLATGGTIEAAIKLVEKLGGIVVGIAFIIELKYLNGIEKIKDYDVMSLISYDE"]
+        ]
+
+    elif choice == "protein structure":
+        samples = [
+            ["dddddddddddddddpdpppvcppvnvvvvvvvvvvvvvvvvvvvvvvvvvvqdpqdedeqvrddpcqqpvqhkhkykafwappqwdddpqkiwtwghnppgiaieieghdappqddhrfikifiaghdpvrhtygdhidtdddpddddvvnvvvcvvvvndpdd"],
+            ["dddadcpvpvqkakefeaeppprdtadiaiagpvqvvvcvvpqwhwgqdpvvrdidgqcpvpvqiwrwddwdaddnrryiytythtpahsdpvrhvhpppadvvgpddpd"],
+            ["dplvvqwdwdaqpphhpdtdthcvscvvppvslvvqlvvvlvvcvvqvaqeeeeepdqrcsnrvsscvvvvhyywykyfpppddaawdwdwdddppgitiiithlpseaaageyeyegaeqalqprvlrvvvrcvvnnyddaeyeyqeyevcrvncvsvvvhhydyvyydpd"]
+        ]
+
+    # Set visibility of upload button
+    if choice == "text":
+        visible = False
+    else:
+        visible = True
+
+    return samples, "", gr.update(visible=visible)
+
+
+# Load example from dataset
+def load_example(example_id):
+    return samples[example_id][0]
+
+
+# Change the visibility of subsection type
+def subsection_visibility(query_type: str):
+    if query_type == "text":
+        return gr.update(visible=True)
+    else:
+        return gr.update(visible=False)
+
+
+# Parse the uploaded structure file
+def parse_pdb_file(input_type, file):
+    parsed_seqs = get_struc_seq("bin/foldseek", file)
+
+    for seqs in parsed_seqs.values():
+        if input_type == "protein sequence":
+            return seqs[0]
+        else:
+            return seqs[1].lower()
+
+
+# Build the block for text to protein
+def build_search_module():
+    gr.Markdown("# Search from Swiss-Prot database (the whole UniProt database will be supported soon)")
+    with gr.Row(equal_height=True):
+        with gr.Column():
+            # Set input type
+            input_type = gr.Radio(["protein sequence", "protein structure", "text"], label="Input type (e.g. 'text' means searching based on text descriptions)", value="text")
+
+            with gr.Row():
+                # Set query type
+                query_type = gr.Radio(["protein sequence", "protein structure", "text"], label="Query type (e.g. 'protein sequence' means returning qualified protein sequences)", value="protein sequence")
+
+                # If the query type is "text", provide an option to choose the subsection of text
+                subsection_type = gr.Dropdown(list(valid_subsections), label="Subsection of text", value="Function",
+                                              scale=0, interactive=True, visible=False)
+
+                # Add event listener to query type
+                query_type.change(fn=subsection_visibility, inputs=[query_type], outputs=[subsection_type])
+
+            with gr.Row():
+                # Input box
+                input = gr.Text(label="Input")
+
+                # Provide an upload button to upload a pdb file
+                upload_btn = gr.UploadButton(label="Upload .pdb/.cif file", scale=0, visible=False)
+                upload_btn.upload(parse_pdb_file, inputs=[input_type, upload_btn], outputs=[input])
+
+            # Choose topk results
+            topk = gr.Slider(1, 100, 5, step=1, label="Retrieve top k results")
+
+            # Provide examples
+            examples = gr.Dataset(samples=samples, components=[input], type="index", label="Input examples")
+
+            # Add click event to examples
+            examples.click(fn=load_example, inputs=[examples], outputs=input)
+
+            # Change examples based on input type
+            input_type.change(fn=change_input_type, inputs=[input_type], outputs=[examples, input, upload_btn])
+
+            with gr.Row():
+                t2p_btn = gr.Button(value="Search")
+                clear_btn = gr.Button(value="Clear")
+
+    results = gr.Markdown(label="results")
+    t2p_btn.click(fn=search, inputs=[input, topk, input_type, query_type, subsection_type], outputs=results)
+    clear_btn.click(fn=clear_results, outputs=results)
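`search` dispatches on strings: the radio choice is reduced to a modality name and resolved to an encoder method via `getattr`, so the three options map onto `get_protein_repr`, `get_structure_repr`, and `get_text_repr` without branching. The mapping rule in isolation:

```python
def modality_of(input_type: str) -> str:
    # "protein sequence" -> "protein", "protein structure" -> "structure", "text" -> "text"
    return input_type.split(" ")[-1].replace("sequence", "protein")

for choice in ("protein sequence", "protein structure", "text"):
    print(f"{choice!r} -> model.get_{modality_of(choice)}_repr")
```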
demo/run.py ADDED
@@ -0,0 +1,22 @@
+import sys
+root_dir = __file__.rsplit("/", 2)[0]
+if root_dir not in sys.path:
+    sys.path.append(root_dir)
+
+import gradio as gr
+
+from modules.search import build_search_module
+from modules.compute_score import build_score_computation
+
+
+# Build demo
+with gr.Blocks() as demo:
+    build_search_module()
+    build_score_computation()
+
+
+if __name__ == '__main__':
+    # args = get_args()
+
+    # Run demo
+    demo.launch()
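Prepending the repository root to `sys.path` lets the bare `modules.*` and `model.*` imports resolve when the script is launched directly (e.g. `python demo/run.py`) rather than as a package. `demo.launch()` then serves the Gradio app on Gradio's default local address.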
model/ProtTrek/protein_encoder.py ADDED
@@ -0,0 +1,95 @@
+import torch
+
+from tqdm import tqdm
+from torch.nn.functional import normalize
+from transformers import EsmConfig, EsmForMaskedLM, EsmTokenizer
+
+
+class ProteinEncoder(torch.nn.Module):
+    def __init__(self,
+                 config_path: str,
+                 out_dim: int,
+                 load_pretrained: bool = True,
+                 gradient_checkpointing: bool = False):
+        """
+        Args:
+            config_path: Path to the config file
+
+            out_dim: Output dimension of the protein representation
+
+            load_pretrained: Whether to load pretrained weights
+
+            gradient_checkpointing: Whether to use gradient checkpointing
+        """
+        super().__init__()
+        config = EsmConfig.from_pretrained(config_path)
+        if load_pretrained:
+            self.model = EsmForMaskedLM.from_pretrained(config_path)
+        else:
+            self.model = EsmForMaskedLM(config)
+        self.out = torch.nn.Linear(config.hidden_size, out_dim)
+
+        # Set gradient checkpointing
+        self.model.esm.encoder.gradient_checkpointing = gradient_checkpointing
+
+        # Remove contact head
+        self.model.esm.contact_head = None
+
+        # Remove position embedding if the embedding type is ``rotary``
+        if config.position_embedding_type == "rotary":
+            self.model.esm.embeddings.position_embeddings = None
+
+        self.tokenizer = EsmTokenizer.from_pretrained(config_path)
+
+    def get_repr(self, proteins: list, batch_size: int = 64, verbose: bool = False) -> torch.Tensor:
+        """
+        Compute protein representation for the given proteins
+        Args:
+            proteins: A list of protein sequences
+            batch_size: Batch size for inference
+            verbose: Whether to print progress
+        """
+        device = next(self.parameters()).device
+
+        protein_repr = []
+        if verbose:
+            iterator = tqdm(range(0, len(proteins), batch_size), desc="Computing protein embeddings")
+        else:
+            iterator = range(0, len(proteins), batch_size)
+
+        for i in iterator:
+            protein_inputs = self.tokenizer.batch_encode_plus(proteins[i:i + batch_size],
+                                                              return_tensors="pt",
+                                                              padding=True)
+            protein_inputs = {k: v.to(device) for k, v in protein_inputs.items()}
+            output, _ = self.forward(protein_inputs)
+
+            protein_repr.append(output)
+
+        protein_repr = torch.cat(protein_repr, dim=0)
+        return normalize(protein_repr, dim=-1)
+
+    def forward(self, inputs: dict, get_mask_logits: bool = False):
+        """
+        Encode protein sequence into protein representation
+        Args:
+            inputs: A dictionary containing the following keys:
+                - input_ids: [batch, seq_len]
+                - attention_mask: [batch, seq_len]
+            get_mask_logits: Whether to return the logits for masked tokens
+
+        Returns:
+            protein_repr: [batch, protein_repr_dim]
+            mask_logits : [batch, seq_len, vocab_size]
+        """
+        last_hidden_state = self.model.esm(**inputs).last_hidden_state
+        reprs = last_hidden_state[:, 0, :]
+        reprs = self.out(reprs)
+
+        # Get logits for masked tokens
+        if get_mask_logits:
+            mask_logits = self.model.lm_head(last_hidden_state)
+        else:
+            mask_logits = None
+
+        return reprs, mask_logits
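`get_repr` batches the inputs, takes the first-token hidden state through the linear projection `self.out`, and L2-normalizes the result. A hedged usage sketch; `facebook/esm2_t12_35M_UR50D` is the public ESM-2 checkpoint name of this size and is an assumption here, since the demo loads its own copy from `weights/`:

```python
import torch

# Assumes protein_encoder.py is importable and the checkpoint can be fetched.
encoder = ProteinEncoder("facebook/esm2_t12_35M_UR50D", out_dim=1024)
encoder.eval()

with torch.no_grad():
    reprs = encoder.get_repr(["MSATAEQNARNPK", "MITLDWEKANGL"], batch_size=2)

print(reprs.shape)           # torch.Size([2, 1024])
print(reprs.norm(dim=-1))    # ~1.0 per row: representations are unit-norm
```

Note that `self.out` is freshly initialized in this sketch; meaningful scores require the trained ProTrek weights.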
model/ProtTrek/protrek_trimodal_model.py ADDED
@@ -0,0 +1,853 @@
+import torch
+import torch.distributed as dist
+import torchmetrics
+import json
+import math
+import numpy as np
+import os
+import copy
+import faiss
+import time
+import pandas as pd
+import random
+
+from tqdm import tqdm
+from .protein_encoder import ProteinEncoder
+from .structure_encoder import StructureEncoder
+from .text_encoder import TextEncoder
+from ..abstract_model import AbstractModel
+from ..model_interface import register_model
+from utils.mpr import MultipleProcessRunnerSimplifier
+from torch.nn.functional import normalize, cross_entropy
+from utils.constants import residue_level, sequence_level
+from sklearn.metrics import roc_auc_score
+
+
+def multilabel_cross_entropy(logits, labels):
+    """
+    Compute cross entropy loss for multilabel classification. See "https://arxiv.org/pdf/2208.02955.pdf"
+    Args:
+        logits: [num_samples, num_classes]
+        labels: [num_samples, num_classes]
+    """
+
+    loss = 0
+    for pred, label in zip(logits, labels):
+        pos_logits = pred[label == 1]
+        neg_logits = pred[label == 0]
+
+        diff = neg_logits.unsqueeze(-1) - pos_logits
+        loss += torch.log(1 + torch.exp(diff).sum())
+
+    return loss / len(logits)
+
+    # pred = (1 - 2 * labels) * logits
+    # pred_neg = pred - labels * 1e12
+    # pred_pos = pred - (1 - labels) * 1e12
+    #
+    # zeros = torch.zeros_like(logits[..., :1], dtype=logits.dtype)
+    # pred_neg = torch.cat([pred_neg, zeros], dim=-1)
+    # pred_pos = torch.cat([pred_pos, zeros], dim=-1)
+    #
+    # neg_loss = torch.logsumexp(pred_neg, dim=-1)
+    # pos_loss = torch.logsumexp(pred_pos, dim=-1)
+    #
+    # return (neg_loss + pos_loss).mean()
+
+
+@register_model
+class ProTrekTrimodalModel(AbstractModel):
+    def __init__(self,
+                 protein_config: str,
+                 text_config: str,
+                 structure_config: str = None,
+                 repr_dim: int = 1024,
+                 temperature: float = 0.07,
+                 load_protein_pretrained: bool = True,
+                 load_text_pretrained: bool = True,
+                 use_mlm_loss: bool = False,
+                 use_zlpr_loss: bool = False,
+                 use_saprot: bool = False,
+                 gradient_checkpointing: bool = False,
+                 **kwargs):
+        """
+        Args:
+            protein_config: Path to the config file for protein sequence encoder
+
+            text_config: Path to the config file for text encoder
+
+            structure_config: Path to the config file for structure encoder
+
+            repr_dim: Output dimension of the protein and text representation
+
+            temperature: Temperature for softmax
+
+            load_protein_pretrained: Whether to load pretrained weights for protein encoder
+
+            load_text_pretrained: Whether to load pretrained weights for text encoder
+
+            use_mlm_loss: Whether to use masked language modeling loss
+
+            use_zlpr_loss: Whether to use zlpr loss. See "https://arxiv.org/pdf/2208.02955.pdf"
+
+            use_saprot: Whether to use SaProt as protein encoder
+
+            gradient_checkpointing: Whether to use gradient checkpointing for protein encoder
+        """
+        self.protein_config = protein_config
+        self.structure_config = structure_config
+        self.text_config = text_config
+        self.repr_dim = repr_dim
+        self.temperature = temperature
+        self.load_protein_pretrained = load_protein_pretrained
+        self.load_text_pretrained = load_text_pretrained
+        self.use_mlm_loss = use_mlm_loss
+        self.use_zlpr_loss = use_zlpr_loss
+        self.use_saprot = use_saprot
+        self.gradient_checkpointing = gradient_checkpointing
+        super().__init__(**kwargs)
+
+    def initialize_metrics(self, stage: str) -> dict:
+        return_dict = {
+            f"{stage}_protein_text_acc": torchmetrics.Accuracy(),
+            f"{stage}_text_protein_acc": torchmetrics.Accuracy(),
+        }
+
+        if self.use_mlm_loss:
+            return_dict[f"{stage}_protein_mask_acc"] = torchmetrics.Accuracy(ignore_index=-1)
+            if self.structure_config is not None:
+                return_dict[f"{stage}_structure_mask_acc"] = torchmetrics.Accuracy(ignore_index=-1)
+
+        if self.structure_config is not None:
+            return_dict[f"{stage}_structure_protein_acc"] = torchmetrics.Accuracy()
+            return_dict[f"{stage}_structure_text_acc"] = torchmetrics.Accuracy()
+            return_dict[f"{stage}_text_structure_acc"] = torchmetrics.Accuracy()
+            return_dict[f"{stage}_protein_structure_acc"] = torchmetrics.Accuracy()
+
+        return return_dict
+
+    def initialize_model(self):
+        # Initialize encoders
+        self.protein_encoder = ProteinEncoder(self.protein_config,
+                                              self.repr_dim,
+                                              self.load_protein_pretrained,
+                                              self.gradient_checkpointing)
+
+        self.text_encoder = TextEncoder(self.text_config,
+                                        self.repr_dim,
+                                        self.load_text_pretrained,
+                                        self.gradient_checkpointing)
+
+        # Learnable temperature
+        self.temperature = torch.nn.Parameter(torch.tensor(self.temperature))
+
+        # self.model is used for saving and loading
+        self.model = torch.nn.ParameterList([self.temperature,
+                                             self.protein_encoder,
+                                             self.text_encoder])
+
+        # If the structure encoder is specified
+        if self.structure_config is not None:
+            self.structure_encoder = StructureEncoder(self.structure_config, self.repr_dim)
+            self.model.append(self.structure_encoder)
+
+    def get_text_repr(self, texts: list, batch_size: int = 64, verbose: bool = False) -> torch.Tensor:
+        return self.text_encoder.get_repr(texts, batch_size, verbose)
+
+    def get_structure_repr(self, proteins: list, batch_size: int = 64, verbose: bool = False) -> torch.Tensor:
+        return self.structure_encoder.get_repr(proteins, batch_size, verbose)
+
+    def get_protein_repr(self, proteins: list, batch_size: int = 64, verbose: bool = False) -> torch.Tensor:
+        return self.protein_encoder.get_repr(proteins, batch_size, verbose)
+
+    def forward(self, protein_inputs: dict, text_inputs: dict, structure_inputs: dict = None):
+        """
+        Args:
+            protein_inputs: A dictionary for protein encoder
+            structure_inputs: A dictionary for structure encoder
+            text_inputs: A dictionary for text encoder
+        """
+        protein_repr, protein_mask_logits = self.protein_encoder(protein_inputs, self.use_mlm_loss)
+        text_repr = self.text_encoder(text_inputs)
+
+        outputs = [text_repr, protein_repr, protein_mask_logits]
+
+        if self.structure_config is not None:
+            structure_repr, structure_mask_logits = self.structure_encoder(structure_inputs, self.use_mlm_loss)
+            outputs += [structure_repr, structure_mask_logits]
+
+        return outputs
+
+    def loss_func(self, stage: str, outputs, labels):
+        if self.structure_config is not None:
+            text_repr, protein_repr, protein_mask_logits, structure_repr, structure_mask_logits = outputs
+        else:
+            text_repr, protein_repr, protein_mask_logits = outputs
+
+        device = text_repr.device
+
+        text_repr = normalize(text_repr, dim=-1)
+        protein_repr = normalize(protein_repr, dim=-1)
+
+        # Gather representations from all GPUs
+        all_protein_repr = self.all_gather(protein_repr).view(-1, protein_repr.shape[-1]).detach()
+        all_text_repr = self.all_gather(text_repr).view(-1, text_repr.shape[-1]).detach()
+
+        if self.structure_config is not None:
+            structure_repr = normalize(structure_repr, dim=-1)
+            all_structure_repr = self.all_gather(structure_repr).view(-1, structure_repr.shape[-1]).detach()
+
+        # text_idx = labels["text_idx"]
+        # text_candidates = labels["text_candidates"]
+        #
+        # # Gather all text ids
+        # text_inds = self.all_gather(text_idx).flatten()
+        # # Create text classification labels
+        # text_labels = torch.zeros(len(text_candidates), len(text_inds), dtype=int).to(device)
+        # for i, candidate in enumerate(text_candidates):
+        #     for j, idx in enumerate(text_inds):
+        #         if idx.item() in candidate:
+        #             text_labels[i, j] = 1
+        #
+        # # Gather text labels from all GPUs
+        # text_labels = self.all_gather(text_labels).view(-1, text_labels.shape[-1])
+        #
+        # # Protein classification labels are the transpose of text labels
+        # protein_labels = text_labels.T
+
+        # Batch size
+        rank = dist.get_rank()
+        bs = text_repr.shape[0]
+
+        # Get current labels
+        # protein_labels = protein_labels[rank * bs: rank * bs + bs]
+        # text_labels = text_labels[rank * bs: rank * bs + bs]
+
+        # Create classification labels between structure and sequence
+        bs_labels = torch.linspace(rank * bs, rank * bs + bs - 1, bs, dtype=int).to(device)
+
+        if self.structure_config is not None:
+            pairs = {
+                "protein": ["structure", "text"],
+                "structure": ["protein", "text"],
+                "text": ["protein", "structure"]
+            }
+        else:
+            pairs = {
+                "protein": ["text"],
+                "text": ["protein"]
+            }
+
+        loss_list = []
+        for k, values in pairs.items():
+            for v in values:
+                # Only calculate the similarity for the current batch
+                sim = torch.matmul(eval(f"{k}_repr"), eval(f"all_{v}_repr").T).div(self.temperature)
+
+                # if k == "text":
+                #     if self.use_zlpr_loss:
+                #         loss = multilabel_cross_entropy(sim, protein_labels)
+                #     else:
+                #         loss = cross_entropy(sim, bs_labels)
+                #
+                #     pred = []
+                #     for s, l in zip(sim, protein_labels):
+                #         n_label = l.sum()
+                #         topk = torch.topk(s, k=n_label).indices
+                #         if l[topk].sum() == n_label:
+                #             pred.append(1)
+                #         else:
+                #             pred.append(0)
+                #
+                #     pred = torch.tensor(pred).to(device)
+                #     label = torch.ones_like(pred)
+                #     self.metrics[stage][f"{stage}_{k}_{v}_acc"].update(pred.detach(), label)
+                #     # if v == "protein":
+                #     #     acc = self.metrics[stage][f"{stage}_{k}_{v}_acc"].compute()
+                #     #     print(f"{stage}_{k}_{v}_acc: {acc:.4f}")
+                #
+                # elif v == "text":
+                #     if self.use_zlpr_loss:
+                #         loss = multilabel_cross_entropy(sim, text_labels)
+                #     else:
+                #         loss = cross_entropy(sim, bs_labels)
+                #
+                #     pred = []
+                #     for s, l in zip(sim, text_labels):
+                #         n_label = l.sum()
+                #         topk = torch.topk(s, k=n_label).indices
+                #         if l[topk].sum() == n_label:
+                #             pred.append(1)
+                #         else:
+                #             pred.append(0)
+                #
+                #     pred = torch.tensor(pred).to(device)
+                #     label = torch.ones_like(pred)
+                #     # if k == "protein":
+                #     #     acc = pred.sum() / len(pred)
+                #     #     print(f"{stage}_{k}_{v}_acc: {acc:.4f}")
+                #     self.metrics[stage][f"{stage}_{k}_{v}_acc"].update(pred.detach(), label)
+                #
+                # else:
+                #     loss = cross_entropy(sim, bs_labels)
+                #     self.metrics[stage][f"{stage}_{k}_{v}_acc"].update(sim.detach(), bs_labels)
+
+                loss = cross_entropy(sim, bs_labels)
+                self.metrics[stage][f"{stage}_{k}_{v}_acc"].update(sim.detach(), bs_labels)
+                loss_list.append(loss)
+
+        # Masked language modeling loss
+        if self.use_mlm_loss:
+            k_label = [("protein", labels["seq_labels"])]
+            if self.structure_config is not None:
+                k_label.append(("structure", labels["struc_labels"]))
+
+            for k, label in k_label:
+                logits = eval(f"{k}_mask_logits")
+                # merge the first and second dimension of logits
+                logits = logits.view(-1, logits.shape[-1])
+                label = label.flatten().to(device)
+                mlm_loss = cross_entropy(logits, label, ignore_index=-1)
+                loss_list.append(mlm_loss)
+                self.metrics[stage][f"{stage}_{k}_mask_acc"].update(logits.detach(), label)
+
+        loss = sum(loss_list) / len(loss_list)
+
+        if stage == "train":
+            log_dict = self.get_log_dict("train")
+            log_dict["train_loss"] = loss
+            self.log_info(log_dict)
+
+            # Reset train metrics
+            self.reset_metrics("train")
+
+        return loss
+
+    def _get_protein_indices(self):
+        world_size = dist.get_world_size()
+        rank = dist.get_rank()
+
+        if self.use_saprot:
+            proteins = []
+            for sub_dict in self.uniprot2label.values():
+                aa_seq = sub_dict["seq"]
+                foldseek_seq = sub_dict["foldseek"]
+                assert len(aa_seq) == len(foldseek_seq)
+                seq = "".join([a + b for a, b in zip(aa_seq, foldseek_seq)])
+                proteins.append(seq)
+
+        else:
+            proteins = [sub_dict["seq"] for sub_dict in self.uniprot2label.values()]
+
+        span = math.ceil(len(proteins) / world_size)
+        sub_proteins = proteins[rank * span: (rank + 1) * span]
+
+        # Display the progress bar on the rank 0 process
+        verbose = self.trainer.local_rank == 0
+        # Get protein representations
+        sub_protein_repr = self.protein_encoder.get_repr(sub_proteins, batch_size=1, verbose=verbose)
+        protein_repr = self.padded_gather(sub_protein_repr)
+
+        # Construct faiss index
+        d = protein_repr.shape[-1]
+        protein_indices = faiss.IndexFlatIP(d)
+        protein_indices.add(protein_repr.cpu().numpy())
+        return protein_indices
+
+    def _get_structure_indices(self):
+        world_size = dist.get_world_size()
+        rank = dist.get_rank()
+
+        proteins = [sub_dict["foldseek"] for sub_dict in self.uniprot2label.values()]
+        span = math.ceil(len(proteins) / world_size)
+        sub_proteins = proteins[rank * span: (rank + 1) * span]
+
+        # Display the progress bar on the rank 0 process
+        verbose = self.trainer.local_rank == 0
+        # Get protein representations
+        sub_protein_repr = self.structure_encoder.get_repr(sub_proteins, batch_size=1, verbose=verbose)
+        protein_repr = self.padded_gather(sub_protein_repr)
+
+        # Construct faiss index
+        d = protein_repr.shape[-1]
+        structure_indices = faiss.IndexFlatIP(d)
+        structure_indices.add(protein_repr.cpu().numpy())
+        return structure_indices
+
+    def _get_text_indices(self):
+        world_size = dist.get_world_size()
+        rank = dist.get_rank()
+
+        # Display the progress bar on the rank 0 process
+        verbose = self.trainer.local_rank == 0
+        if verbose:
+            iterator = tqdm(self.label2text.keys(), desc="Get text representations")
+        else:
+            iterator = self.label2text.keys()
+
+        text_embeddings = {}
+        for subsection in iterator:
+            if subsection == "Total":
+                continue
+
+            texts = []
+            for text_list in self.label2text[subsection].values():
+                # Only use the first text for efficiency
+                texts.append(text_list[0:1])
+
+            span = math.ceil(len(texts) / world_size)
+            texts = texts[rank * span: (rank + 1) * span]
+            embeddings = []
+            for text_list in texts:
+                text_repr = self.text_encoder.get_repr(text_list)
+                mean_repr = text_repr.mean(dim=0, keepdim=True)
+                norm_repr = torch.nn.functional.normalize(mean_repr, dim=-1)
+                embeddings.append(norm_repr)
+
+            if len(embeddings) > 0:
+                embeddings = torch.cat(embeddings, dim=0)
+            else:
+                embeddings = torch.zeros(0, self.repr_dim, dtype=self.dtype, device=self.device)
+
+            text_repr = self.padded_gather(embeddings)
+            text_embeddings[subsection] = text_repr
+
+        # Aggregate text embeddings for global retrieval
+        total_embeddings = []
+        for idx in self.label2text["Total"].values():
+            subsection, i = idx.split("|")
+            total_embeddings.append(text_embeddings[subsection][int(i)])
+
+        text_embeddings["Total"] = torch.stack(total_embeddings)
+
+        # Construct faiss index
+        text_indices = {}
+        for subsection, text_repr in text_embeddings.items():
+            d = text_repr.shape[-1]
+            text_indices[subsection] = faiss.IndexFlatIP(d)
+            text_indices[subsection].add(text_repr.cpu().numpy())
+
+        return text_indices
+
+    def _protein2text(self, modality: str, protein_indices, text_indices: dict):
+        def do(process_id, idx, row, writer):
+            subsection, uniprot_id, prob_idx, label = row
+
+            # Retrieve ranking results
+            p_embedding = protein_indices.reconstruct(prob_idx).reshape(1, -1)
+            text_inds = text_indices[subsection]
+            sim_scores, rank_inds = text_inds.search(p_embedding, text_inds.ntotal)
+            sim_scores, rank_inds = sim_scores[0], rank_inds[0]
+
+            # Calculate Average Precision (AP)
+            ranks = []
+            label = set(label)
+            for i, rk in enumerate(rank_inds):
+                # Find the rank of this label in all labels
+                if rk in label:
+                    ranks.append(i + 1)
+
+            ranks = np.array(ranks)
+            ap = np.mean([(i + 1) / rank for i, rank in enumerate(ranks)])
+
+            # Calculate Mean Reciprocal Rank (MRR)
+            best_rank = ranks[0]
+            mrr = 1 / best_rank
+
+            # Calculate the AUC
+            true_labels = np.zeros_like(sim_scores)
+            true_labels[ranks - 1] = 1
+            if true_labels.sum() == 0 or true_labels.sum() == true_labels.shape[0]:
+                auc = 0
+            else:
+                auc = roc_auc_score(true_labels, sim_scores)
+
+            output = json.dumps([ap, mrr, auc])
+            writer.write(output + "\n")
+
+        inputs = []
+        swissprot_subsections = set()
+        for subsection in text_indices.keys():
+            for i, (uniprot_id, labels) in enumerate(self.uniprot2label.items()):
+                if uniprot_id in self.swissprot_ids:
+                    if subsection in labels:
+                        swissprot_subsections.add(subsection)
+                        label = labels[subsection]
+                        inputs.append((subsection, uniprot_id, i, label))
+
+        # Randomly shuffle the inputs
+        random.seed(20000812)
+        random.shuffle(inputs)
+
+        # Split inputs into chunks for parallel processing
+        world_size = dist.get_world_size()
+        rank = dist.get_rank()
+
+        span = math.ceil(len(inputs) / world_size)
+        sub_inputs = inputs[rank * span: (rank + 1) * span]
+
+        # Display the progress bar on the rank 0 process
+        verbose = self.trainer.local_rank == 0
+        if verbose:
+            print("Evaluating on each subsection...")
+        tmp_path = f"/sujin/PycharmProjects/Pretraining/{time.time()}_{rank}.tsv"
+        mpr = MultipleProcessRunnerSimplifier(sub_inputs, do, save_path=tmp_path, n_process=8, verbose=verbose,
+                                              return_results=True)
+        outputs = mpr.run()
+        os.remove(tmp_path)
+
+        # Aggregate results
+        tensor_outputs = []
+        for output in outputs:
+            ap, mrr, auc = json.loads(output)
+            tensor_outputs.append([float(ap), float(mrr), float(auc)])
+
+        tensor_outputs = torch.tensor(tensor_outputs, dtype=torch.float32, device=self.device)
+        tensor_outputs = self.padded_gather(tensor_outputs)
+
+        # Record results
+        avg_results = {}
+        for subsection in swissprot_subsections:
+            avg_results[subsection] = {"map": [],
+                                       "mrr": [],
+                                       "auc": []}
+
+        for input, output in zip(inputs, tensor_outputs):
+            ap, mrr, auc = output
+            subsection, _, _, _ = input
+
+            avg_results[subsection]["map"].append(ap.cpu().item())
+            avg_results[subsection]["mrr"].append(mrr.cpu().item())
+            avg_results[subsection]["auc"].append(auc.cpu().item())
+
+        results = {
+            f"{modality}2Text_Total_mrr": np.mean(avg_results["Total"]["mrr"]),
+            f"{modality}2Text_Total_map": np.mean(avg_results["Total"]["map"]),
+            f"{modality}2Text_Total_auc": np.mean(avg_results["Total"]["auc"]),
+        }
+
+        # Average the precision and recall for each level
+        for level, labels in [("residue-level", residue_level),
+                              ("sequence-level", sequence_level),
+                              ("all", residue_level | sequence_level)]:
+
+            mrrs = []
+            maps = []
+            aucs = []
+            for subsection in labels:
+                if subsection in avg_results:
+                    mrrs.append(np.mean(avg_results[subsection]["mrr"]))
+                    maps.append(np.mean(avg_results[subsection]["map"]))
+                    aucs.append(np.mean(avg_results[subsection]["auc"]))
+
+            results[f"{modality}2Text_{level}_mrr"] = np.mean(mrrs)
+            results[f"{modality}2Text_{level}_map"] = np.mean(maps)
+            results[f"{modality}2Text_{level}_auc"] = np.mean(aucs)
+
+        return results
+
+    def _text2protein(self, modality: str, protein_indices, text_indices: dict):
+        def do(process_id, idx, row, writer):
+            subsection, text_id, label = row
+
+            # Retrieve ranking results
+            t_embedding = text_indices[subsection].reconstruct(text_id).reshape(1, -1)
+            sim_scores, rank_inds = protein_indices.search(t_embedding, protein_indices.ntotal)
+            sim_scores, rank_inds = sim_scores[0], rank_inds[0]
+
+            # Calculate Average Precision (AP)
+            ranks = []
+            label = set(label)
+            for i, rk in enumerate(rank_inds):
+                # Find the rank of this label in all labels
+                if rk in label:
+                    ranks.append(i + 1)
+
+            ranks = np.array(ranks)
+            ap = np.mean([(i + 1) / rank for i, rank in enumerate(ranks)])
+
+            # Calculate Mean Reciprocal Rank (MRR)
+            best_rank = ranks[0]
+            mrr = 1 / best_rank
+
+            # Calculate the AUC
+            true_labels = np.zeros_like(sim_scores)
+            true_labels[ranks - 1] = 1
+            if true_labels.sum() == 0 or true_labels.sum() == true_labels.shape[0]:
+                auc = 0
+            else:
+                auc = roc_auc_score(true_labels, sim_scores)
+
+            output = json.dumps([ap, mrr, auc])
+            writer.write(output + "\n")
+
+        text2label = {}
+        swissprot_subsections = set()
+        for i, (uniprot_id, subsections) in enumerate(self.uniprot2label.items()):
+            # Only evaluate the texts in Swiss-Prot
+            if uniprot_id not in self.swissprot_ids:
+                continue
+
+            for subsection, text_ids in subsections.items():
+                if subsection == "seq" or subsection == "foldseek":
+                    continue
+
+                swissprot_subsections.add(subsection)
+                if subsection not in text2label:
+                    text2label[subsection] = {}
+
+                for text_id in text_ids:
+                    text2label[subsection][text_id] = text2label[subsection].get(text_id, []) + [i]
+
+        inputs = []
+        for subsection in swissprot_subsections:
+            for i, (text_id, label) in enumerate(text2label[subsection].items()):
+                inputs.append((subsection, text_id, label))
+
+        # Randomly shuffle the inputs
+        random.seed(20000812)
+        random.shuffle(inputs)
+
+        # Split inputs into chunks for parallel processing
+        world_size = dist.get_world_size()
+        rank = dist.get_rank()
+
+        span = math.ceil(len(inputs) / world_size)
+        sub_inputs = inputs[rank * span: (rank + 1) * span]
+
+        # Display the progress bar on the rank 0 process
+        verbose = self.trainer.local_rank == 0
+        if verbose:
+            print("Evaluating on each text...")
+
+        # Add time stamp to the temporary file name to avoid conflicts
+        tmp_path = f"/sujin/PycharmProjects/Pretraining/{time.time()}_{rank}.tsv"
+        mpr = MultipleProcessRunnerSimplifier(sub_inputs, do, save_path=tmp_path, n_process=8, verbose=verbose,
+                                              return_results=True)
+        outputs = mpr.run()
+        os.remove(tmp_path)
+
+        # Aggregate results
+        tensor_outputs = []
+        for output in outputs:
+            ap, mrr, auc = json.loads(output)
+            tensor_outputs.append([float(ap), float(mrr), float(auc)])
+
+        tensor_outputs = torch.tensor(tensor_outputs, dtype=torch.float32, device=self.device)
+        tensor_outputs = self.padded_gather(tensor_outputs)
+
+        # Record results
+        avg_results = {}
+        for subsection in swissprot_subsections:
+            avg_results[subsection] = {"map": [],
+                                       "mrr": [],
+                                       "auc": []}
+
+        for input, output in zip(inputs, tensor_outputs):
+            ap, mrr, auc = output
+            subsection, _, _ = input
+
+            avg_results[subsection]["map"].append(ap.cpu().item())
+            avg_results[subsection]["mrr"].append(mrr.cpu().item())
+            avg_results[subsection]["auc"].append(auc.cpu().item())
+
+        results = {
+            f"Text2{modality}_Total_mrr": np.mean(avg_results["Total"]["mrr"]),
+            f"Text2{modality}_Total_map": np.mean(avg_results["Total"]["map"]),
+            f"Text2{modality}_Total_auc": np.mean(avg_results["Total"]["auc"]),
+        }
+
+        # Average the precision and recall for each level
+        for level, labels in [("residue-level", residue_level),
+                              ("sequence-level", sequence_level),
+                              ("all", residue_level | sequence_level)]:
+
+            mrrs = []
+            maps = []
+            aucs = []
+            for subsection in labels:
+                if subsection in avg_results:
+                    mrrs.append(np.mean(avg_results[subsection]["mrr"]))
+                    maps.append(np.mean(avg_results[subsection]["map"]))
+                    aucs.append(np.mean(avg_results[subsection]["auc"]))
+
+            results[f"Text2{modality}_{level}_mrr"] = np.mean(mrrs)
+            results[f"Text2{modality}_{level}_map"] = np.mean(maps)
+            results[f"Text2{modality}_{level}_auc"] = np.mean(aucs)
+
+        return results
+
+    def retrieval_eval(self) -> dict:
+        # Get protein representations
+        protein_indices = self._get_protein_indices()
+
+        # Get structure representations
+        # if self.structure_config is not None:
+        #     structure_embeddings = self._get_structure_embeddings()
+
+        # Get text representations
+        text_indices = self._get_text_indices()
+
+        # Retrieve texts for each protein
+        results = {}
+        results.update(self._protein2text("Sequence", protein_indices, text_indices))
+        # if self.structure_config is not None:
+        #     results.update(self._protein2text("Structure", structure_embeddings, text_embeddings))
+        #     results.update(self._text2protein("Structure", structure_embeddings, text_embeddings))
+
+        # Retrieve proteins for each text
+        results.update(self._text2protein("Sequence", protein_indices, text_indices))
+
+        return results
+
+    def _apply_bert_mask(self, tokens, tokenizer, mask_ratio):
+        while True:
+            masked_tokens = copy.copy(tokens)
+            labels = torch.full((len(tokens) + 2,), -1, dtype=torch.long)
+            vocab = [k for k in tokenizer.get_vocab().keys()]
+
+            for i in range(len(tokens)):
+                token = tokens[i]
+
+                prob = random.random()
+                if prob < mask_ratio:
+                    prob /= mask_ratio
+                    labels[i + 1] = tokenizer.convert_tokens_to_ids(token)
+
+                    if prob < 0.8:
+                        # 80% chance to change to mask token
+                        if self.use_saprot:
+                            token = "#" + token[-1]
+                        else:
+                            token = tokenizer.mask_token
+                    elif prob < 0.9:
+                        # 10% chance to change to random token
+                        token = random.choice(vocab)
+                    else:
+                        # 10% chance to keep current token
+                        pass
+
+                    masked_tokens[i] = token
+
+            # Check if there is at least one masked token
+            if (labels != -1).any():
+                return masked_tokens, labels
+
+    def mlm_eval(self) -> float:
+        world_size = dist.get_world_size()
+        rank = dist.get_rank()
+
+        if self.use_saprot:
+            proteins = []
+            for sub_dict in self.uniprot2label.values():
+                aa_seq = sub_dict["seq"]
+                foldseek_seq = sub_dict["foldseek"]
+                assert len(aa_seq) == len(foldseek_seq)
+                seq = "".join([a + b for a, b in zip(aa_seq, foldseek_seq)])
+                proteins.append(seq)
+
+        else:
+            proteins = [sub_dict["seq"] for sub_dict in self.uniprot2label.values()]
+
+        span = math.ceil(len(proteins) / world_size)
+        sub_proteins = proteins[rank * span: (rank + 1) * span]
+
+        # Display the progress bar on the rank 0 process
+        if self.trainer.local_rank == 0:
+            iterator = tqdm(sub_proteins, desc="Computing mlm...")
+        else:
+            iterator = sub_proteins
+
+        total = torch.tensor([0], dtype=torch.long, device=self.device)
+        correct = torch.tensor([0], dtype=torch.long, device=self.device)
+        for seq in iterator:
+            tokens = self.protein_encoder.tokenizer.tokenize(seq)
+            masked_tokens, labels = self._apply_bert_mask(tokens, self.protein_encoder.tokenizer, 0.15)
+            seq = " ".join(masked_tokens)
+
+            inputs = self.protein_encoder.tokenizer(seq, return_tensors="pt")
+            inputs = {k: v.to(self.device) for k, v in inputs.items()}
+            _, logits = self.protein_encoder(inputs, get_mask_logits=True)
+
+            logits = logits.squeeze(0)
+            labels = labels.to(self.device)
+
+            selector = labels != -1
+            preds = logits.argmax(dim=-1)[selector]
+            labels = labels[selector]
+
+            total += len(preds)
+            correct += (preds == labels).sum()
+
+        # Gather all results
+        total = self.padded_gather(total).sum()
+        correct = self.padded_gather(correct).sum()
+
+        acc = correct / total
+        return acc.cpu().item()
+
+    def _load_eval_data(self, stage):
+        # Load the data
+        lmdb_dir = eval(f"self.trainer.datamodule.{stage}_lmdb")
+        uniprot2label_path = os.path.join(lmdb_dir, "uniprot2label.json")
+        label2text_path = os.path.join(lmdb_dir, "label2text.json")
+        swissprot_id_path = os.path.join(lmdb_dir, "swissprot_ids.tsv")
+
+        self.uniprot2label = json.load(open(uniprot2label_path, "r"))
+        self.label2text = json.load(open(label2text_path, "r"))
+        self.swissprot_ids = set(pd.read_csv(swissprot_id_path, sep="\t", header=None).values.flatten().tolist())
+        self.k = 3
+
+    def on_test_start(self):
+        self._load_eval_data("test")
+
+        log_dict = self.retrieval_eval()
+        log_dict = {"test_" + k: v for k, v in log_dict.items()}
+        if self.use_mlm_loss:
+            log_dict["test_mask_acc"] = self.mlm_eval()
+        self.log_info(log_dict)
+        print(log_dict)
+
+    def on_validation_start(self):
+        # Clear the cache
+        torch.cuda.empty_cache()
+
+        self._load_eval_data("valid")
+
+        log_dict = self.retrieval_eval()
+        log_dict = {"valid_" + k: v for k, v in log_dict.items()}
+        if self.use_mlm_loss:
+            log_dict["valid_mask_acc"] = self.mlm_eval()
+        self.log_info(log_dict)
+
+        self.check_save_condition(self.step, mode="max")
+
+    def test_step(self, batch, batch_idx):
+        return
+
+    def validation_step(self, batch, batch_idx):
+        return
+
+    def on_train_epoch_end(self):
+        super().on_train_epoch_end()
+        # Re-sample the subset of the training data
+        if self.trainer.datamodule.train_dataset.fixed_dataset_num is not None:
+            self.trainer.datamodule.train_dataset.sample_subset()
+
+    # def test_epoch_end(self, outputs):
+    #     log_dict = self.get_log_dict("test")
+    #     log_dict["test_loss"] = torch.cat(self.all_gather(outputs), dim=-1).mean()
+    #
+    #     print(log_dict)
+    #     self.log_info(log_dict)
+    #
+    #     self.reset_metrics("test")
+    #
+    # def validation_epoch_end(self, outputs):
+    #     log_dict = self.get_log_dict("valid")
+    #     log_dict["valid_loss"] = torch.cat(self.all_gather(outputs), dim=-1).mean()
+    #
+    #     self.log_info(log_dict)
+    #     self.reset_metrics("valid")
+    #     self.check_save_condition(log_dict["valid_loss"], mode="min")
model/ProtTrek/structure_encoder.py ADDED
@@ -0,0 +1,86 @@
+ import torch
+
+ from tqdm import tqdm
+ from transformers import EsmConfig, EsmForMaskedLM, EsmTokenizer
+ from torch.nn.functional import normalize
+
+
+ class StructureEncoder(torch.nn.Module):
+     def __init__(self, config_path: str, out_dim: int, gradient_checkpointing: bool = False):
+         """
+         Args:
+             config_path: Path to the config file
+
+             out_dim: Output dimension of the structure representation
+
+             gradient_checkpointing: Whether to use gradient checkpointing
+         """
+         super().__init__()
+         config = EsmConfig.from_pretrained(config_path)
+         self.model = EsmForMaskedLM(config)
+         self.out = torch.nn.Linear(config.hidden_size, out_dim)
+
+         # Set gradient checkpointing
+         self.model.esm.encoder.gradient_checkpointing = gradient_checkpointing
+
+         # Remove contact head
+         self.model.esm.contact_head = None
+
+         # Remove position embedding if the embedding type is ``rotary``
+         if config.position_embedding_type == "rotary":
+             self.model.esm.embeddings.position_embeddings = None
+
+         self.tokenizer = EsmTokenizer.from_pretrained(config_path)
+
+     def get_repr(self, proteins: list, batch_size: int = 64, verbose: bool = False) -> torch.Tensor:
+         """
+         Compute protein structure representations for the given proteins
+         Args:
+             proteins: A list of protein structural sequences
+             batch_size: Batch size for inference
+             verbose: Whether to print progress
+         """
+         device = next(self.parameters()).device
+
+         protein_repr = []
+         if verbose:
+             iterator = tqdm(range(0, len(proteins), batch_size), desc="Computing protein embeddings")
+         else:
+             iterator = range(0, len(proteins), batch_size)
+
+         for i in iterator:
+             protein_inputs = self.tokenizer.batch_encode_plus(proteins[i:i + batch_size],
+                                                               return_tensors="pt",
+                                                               padding=True)
+             protein_inputs = {k: v.to(device) for k, v in protein_inputs.items()}
+             output, _ = self.forward(protein_inputs)
+
+             protein_repr.append(output)
+
+         protein_repr = torch.cat(protein_repr, dim=0)
+         return normalize(protein_repr, dim=-1)
+
+     def forward(self, inputs: dict, get_mask_logits: bool = False):
+         """
+         Encode protein structure into protein representation
+         Args:
+             inputs: A dictionary containing the following keys:
+                 - input_ids: [batch, seq_len]
+                 - attention_mask: [batch, seq_len]
+             get_mask_logits: Whether to return the logits for masked tokens
+
+         Returns:
+             protein_repr: [batch, protein_repr_dim]
+             mask_logits : [batch, seq_len, vocab_size]
+         """
+         last_hidden_state = self.model.esm(**inputs).last_hidden_state
+         reprs = last_hidden_state[:, 0, :]
+         reprs = self.out(reprs)
+
+         # Get logits for masked tokens
+         if get_mask_logits:
+             mask_logits = self.model.lm_head(last_hidden_state)
+         else:
+             mask_logits = None
+
+         return reprs, mask_logits
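A minimal usage sketch for StructureEncoder. The config directory name and the 3Di strings below are illustrative assumptions, not part of this commit; the inputs are foldseek structural sequences such as those produced by utils.foldseek_util.get_struc_seq:

    import torch
    from model.ProtTrek.structure_encoder import StructureEncoder

    # config_path is a hypothetical local directory holding an ESM-style config + tokenizer
    encoder = StructureEncoder(config_path="weights/structure_encoder", out_dim=1024)
    encoder.eval()
    with torch.no_grad():
        # foldseek 3Di structural sequences (lower-case vocabulary)
        reprs = encoder.get_repr(["dpvnvvvd", "pynwrqhg"], batch_size=2)
    print(reprs.shape)  # torch.Size([2, 1024]); rows are L2-normalized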
model/ProtTrek/text_encoder.py ADDED
@@ -0,0 +1,81 @@
+ import torch
+
+ from tqdm import tqdm
+ from torch.nn.functional import normalize
+ from transformers import BertConfig, BertModel, BertTokenizer
+
+
+ class TextEncoder(torch.nn.Module):
+     def __init__(self,
+                  config_path: str,
+                  out_dim: int,
+                  load_pretrained: bool = True,
+                  gradient_checkpointing: bool = False):
+         """
+         Args:
+             config_path: Path to the config file
+
+             out_dim: Output dimension of the text representation
+
+             load_pretrained: Whether to load pretrained weights
+
+             gradient_checkpointing: Whether to enable gradient checkpointing
+         """
+         super().__init__()
+         config = BertConfig.from_pretrained(config_path)
+         if load_pretrained:
+             self.model = BertModel.from_pretrained(config_path, add_pooling_layer=False)
+         else:
+             self.model = BertModel(config, add_pooling_layer=False)
+         self.out = torch.nn.Linear(config.hidden_size, out_dim)
+
+         # Set gradient checkpointing
+         self.model.encoder.gradient_checkpointing = gradient_checkpointing
+
+         self.tokenizer = BertTokenizer.from_pretrained(config_path)
+
+     def get_repr(self, texts: list, batch_size: int = 64, verbose: bool = False) -> torch.Tensor:
+         """
+         Compute text representations for the given texts
+         Args:
+             texts: A list of strings
+             batch_size: Batch size for inference
+             verbose: Whether to print progress
+         """
+         device = next(self.parameters()).device
+
+         text_repr = []
+         if verbose:
+             iterator = tqdm(range(0, len(texts), batch_size), desc="Computing text embeddings")
+         else:
+             iterator = range(0, len(texts), batch_size)
+
+         for i in iterator:
+             text_inputs = self.tokenizer.batch_encode_plus(texts[i: i + batch_size],
+                                                            return_tensors="pt",
+                                                            truncation=True,
+                                                            max_length=512,
+                                                            padding=True)
+             text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
+             output = self(text_inputs)
+
+             text_repr.append(output)
+
+         text_repr = torch.cat(text_repr, dim=0)
+         return normalize(text_repr, dim=-1)
+
+     def forward(self, inputs: dict):
+         """
+         Encode text into text representation
+         Args:
+             inputs: A dictionary containing the following keys:
+                 - input_ids: [batch, seq_len]
+                 - attention_mask: [batch, seq_len]
+                 - token_type_ids: [batch, seq_len]
+
+         Returns:
+             text_repr: [batch, text_repr_dim]
+         """
+         reprs = self.model(**inputs).last_hidden_state[:, 0, :]
+         reprs = self.out(reprs)
+         return reprs
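A matching sketch for TextEncoder. Since BertConfig/BertModel/BertTokenizer all accept hub names as well as local paths, any BERT-style checkpoint works here; the name below is an assumption for illustration, not the checkpoint this repo ships:

    import torch
    from model.ProtTrek.text_encoder import TextEncoder

    encoder = TextEncoder(config_path="bert-base-uncased", out_dim=1024, load_pretrained=True)
    encoder.eval()
    with torch.no_grad():
        reprs = encoder.get_repr(["Proteins with zinc bindings."])
    print(reprs.shape)  # torch.Size([1, 1024]); normalized, so dot products are cosine similarities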
model/abstract_model.py ADDED
@@ -0,0 +1,401 @@
+ import torch
+ import abc
+ import os
+ import copy
+
+ import pytorch_lightning as pl
+ from utils.lr_scheduler import *
+ from torch import distributed as dist
+
+
+ class AbstractModel(pl.LightningModule):
+     def __init__(self,
+                  lr_scheduler_kwargs: dict = None,
+                  optimizer_kwargs: dict = None,
+                  save_path: str = None,
+                  from_checkpoint: str = None,
+                  load_prev_scheduler: bool = False,
+                  save_weights_only: bool = True):
+         """
+         Args:
+             lr_scheduler_kwargs: Kwargs for the learning rate scheduler
+
+             optimizer_kwargs: Kwargs for the optimizer
+
+             save_path: Path to save the trained model
+
+             from_checkpoint: Path to a checkpoint to load the model from
+
+             load_prev_scheduler: Whether to load the previous scheduler state from the checkpoint
+
+             save_weights_only: Whether to save only the weights or also the optimizer and lr_scheduler
+         """
+         super().__init__()
+         self.initialize_model()
+
+         self.metrics = {}
+         for stage in ["train", "valid", "test"]:
+             stage_metrics = self.initialize_metrics(stage)
+             # Register metrics as attributes
+             for metric_name, metric in stage_metrics.items():
+                 setattr(self, metric_name, metric)
+
+             self.metrics[stage] = stage_metrics
+
+         if lr_scheduler_kwargs is None:
+             # Default lr_scheduler
+             self.lr_scheduler_kwargs = {
+                 "class": "ConstantLRScheduler",
+                 "init_lr": 0,
+             }
+             print("No lr_scheduler_kwargs provided. The default learning rate is 0.")
+
+         else:
+             self.lr_scheduler_kwargs = lr_scheduler_kwargs
+
+         if optimizer_kwargs is None:
+             # Default optimizer
+             self.optimizer_kwargs = {
+                 "class": "AdamW",
+                 "betas": (0.9, 0.98),
+                 "weight_decay": 0.01,
+             }
+             print("No optimizer_kwargs provided. The default optimizer is AdamW.")
+         else:
+             self.optimizer_kwargs = optimizer_kwargs
+         self.init_optimizers()
+
+         self.save_path = save_path
+         self.save_weights_only = save_weights_only
+
+         # temp_step is used for accumulating gradients
+         self.temp_step = 0
+         self.step = 0
+         self.epoch = 0
+
+         self.load_prev_scheduler = load_prev_scheduler
+         self.from_checkpoint = from_checkpoint
+         if from_checkpoint:
+             self.load_checkpoint(from_checkpoint)
+
+     @abc.abstractmethod
+     def initialize_model(self) -> None:
+         """
+         All model initialization should be done here
+         Note that the whole model must be named as "self.model" for model saving and loading
+         """
+         raise NotImplementedError
+
+     @abc.abstractmethod
+     def forward(self, *args, **kwargs):
+         """
+         Forward propagation
+         """
+         raise NotImplementedError
+
+     @abc.abstractmethod
+     def initialize_metrics(self, stage: str) -> dict:
+         """
+         Initialize metrics for each stage
+         Args:
+             stage: "train", "valid" or "test"
+
+         Returns:
+             A dictionary of metrics for the stage. Keys are metric names and values are metric objects
+         """
+         raise NotImplementedError
+
+     @abc.abstractmethod
+     def loss_func(self, stage: str, outputs, labels) -> torch.Tensor:
+         """
+         Args:
+             stage: "train", "valid" or "test"
+             outputs: model outputs for calculating loss
+             labels: labels for calculating loss
+
+         Returns:
+             loss
+         """
+         raise NotImplementedError
+
+     @staticmethod
+     def load_weights(model, weights):
+         model_dict = model.state_dict()
+
+         unused_params = []
+         missed_params = list(model_dict.keys())
+
+         for k, v in weights.items():
+             if k in model_dict.keys():
+                 model_dict[k] = v
+                 missed_params.remove(k)
+
+             else:
+                 unused_params.append(k)
+
+         if len(missed_params) > 0:
+             print(f"\033[31mSome weights of {type(model).__name__} were not "
+                   f"initialized from the model checkpoint: {missed_params}\033[0m")
+
+         if len(unused_params) > 0:
+             print(f"\033[31mSome weights of the model checkpoint were not used: {unused_params}\033[0m")
+
+         model.load_state_dict(model_dict)
+
+     def optimizer_step(
+             self,
+             epoch: int,
+             batch_idx: int,
+             optimizer,
+             optimizer_closure=None,
+     ) -> None:
+         super().optimizer_step(epoch, batch_idx, optimizer, optimizer_closure)
+
+         self.temp_step += 1
+         if self.temp_step == self.trainer.accumulate_grad_batches:
+             self.step += 1
+             self.temp_step = 0
+
+     # For pytorch-lightning 1.9.5
+     # def optimizer_step(
+     #         self,
+     #         epoch: int,
+     #         batch_idx: int,
+     #         optimizer,
+     #         optimizer_idx: int = 0,
+     #         optimizer_closure=None,
+     #         on_tpu: bool = False,
+     #         using_native_amp: bool = False,
+     #         using_lbfgs: bool = False,
+     # ) -> None:
+     #     super().optimizer_step(
+     #         epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs
+     #     )
+     #     self.temp_step += 1
+     #     if self.temp_step == self.trainer.accumulate_grad_batches:
+     #         self.step += 1
+     #         self.temp_step = 0
+
+     def on_train_epoch_end(self):
+         self.epoch += 1
+
+     def training_step(self, batch, batch_idx):
+         inputs, labels = batch
+
+         # optimizer = torch.optim.AdamW(self.model.parameters(), lr=5e-4, weight_decay=0.01, betas=(0.9, 0.98))
+         # for _ in range(1000):
+         #     outputs = self(**inputs)
+         #     loss = self.loss_func('train', outputs, labels)
+         #     loss.backward()
+         #     optimizer.step()
+         #     optimizer.zero_grad()
+         #
+         # raise
+
+         outputs = self(**inputs)
+         loss = self.loss_func('train', outputs, labels)
+
+         self.log("loss", loss, prog_bar=True)
+         return loss
+
+     def validation_step(self, batch, batch_idx):
+         inputs, labels = batch
+         outputs = self(**inputs)
+         loss = self.loss_func('valid', outputs, labels)
+         self.valid_outputs.append(loss)
+         return loss
+
+     def test_step(self, batch, batch_idx):
+         inputs, labels = batch
+         outputs = self(**inputs)
+
+         loss = self.loss_func('test', outputs, labels)
+         self.test_outputs.append(loss)
+         return loss
+
+     def on_train_start(self) -> None:
+         # Load previous scheduler
+         if getattr(self, "prev_schechuler", None) is not None:
+             try:
+                 self.step = self.prev_schechuler["global_step"]
+                 self.epoch = self.prev_schechuler["epoch"]
+                 self.best_value = self.prev_schechuler["best_value"]
+                 self.lr_scheduler.load_state_dict(self.prev_schechuler["lr_scheduler"])
+                 print(f"Previous training global step: {self.step}")
+                 print(f"Previous training epoch: {self.epoch}")
+                 print(f"Previous best value: {self.best_value}")
+                 print(f"Previous lr_scheduler: {self.prev_schechuler['lr_scheduler']}")
+
+                 # Load optimizer state
+                 if hasattr(self.trainer.strategy, "deepspeed_engine"):
+                     # For DeepSpeed strategy
+                     try:
+                         self.trainer.strategy.deepspeed_engine.load_checkpoint(self.from_checkpoint)
+                     except Exception as e:
+                         print(e)
+
+                 else:
+                     # For DDP strategy
+                     self.optimizer.load_state_dict(self.prev_schechuler["optimizer"])
+
+             except Exception as e:
+                 print(e)
+                 raise Exception("Error in loading previous scheduler. Please set load_prev_scheduler=False")
+
+     def on_validation_epoch_start(self) -> None:
+         setattr(self, "valid_outputs", [])
+
+     def on_test_epoch_start(self) -> None:
+         setattr(self, "test_outputs", [])
+
+     def load_checkpoint(self, from_checkpoint: str) -> None:
+         """
+         Args:
+             from_checkpoint: Path to checkpoint.
+         """
+
+         # If ``from_checkpoint`` is a directory, load the checkpoint in it
+         if os.path.isdir(from_checkpoint):
+             basename = os.path.basename(from_checkpoint)
+             from_checkpoint = os.path.join(from_checkpoint, f"{basename}.pt")
+
+         state_dict = torch.load(from_checkpoint, map_location=self.device)
+         self.load_weights(self.model, state_dict["model"])
+
+         if self.load_prev_scheduler:
+             state_dict.pop("model")
+             self.prev_schechuler = state_dict
+
+     def save_checkpoint(self, save_path: str, save_info: dict = None, save_weights_only: bool = True) -> None:
+         """
+         Save model to save_path
+         Args:
+             save_path: Path to save model
+             save_info: Other info to save
+             save_weights_only: Whether to only save model weights
+         """
+         dir = os.path.dirname(save_path)
+         os.makedirs(dir, exist_ok=True)
+
+         state_dict = {} if save_info is None else save_info
+         state_dict["model"] = self.model.state_dict()
+
+         # Convert model weights to fp32
+         for k, v in state_dict["model"].items():
+             state_dict["model"][k] = v.float()
+
+         if not save_weights_only:
+             state_dict["global_step"] = self.step
+             state_dict["epoch"] = self.epoch
+             state_dict["best_value"] = getattr(self, "best_value", None)
+             state_dict["lr_scheduler"] = self.lr_schedulers().state_dict()
+
+             # If not using DeepSpeed, save optimizer state
+             if not hasattr(self.trainer.strategy, "deepspeed_engine"):
+                 state_dict["optimizer"] = self.optimizers().optimizer.state_dict()
+
+         torch.save(state_dict, save_path)
+
+     def check_save_condition(self, now_value: float, mode: str, save_info: dict = None) -> None:
+         """
+         Check whether to save the model. If save_path is not None and now_value is the best, save the model.
+         Args:
+             now_value: Current metric value
+             mode: "min" or "max", meaning whether the lower the better or the higher the better
+             save_info: Other info to save
+         """
+
+         assert mode in ["min", "max"], "mode should be 'min' or 'max'"
+
+         if self.save_path is not None:
+             # In case there are variables to be included in the save path
+             save_path = eval(f"f'{self.save_path}'")
+
+             dir = os.path.dirname(save_path)
+             os.makedirs(dir, exist_ok=True)
+
+             # Check whether to save the model
+             best_value = getattr(self, "best_value", None)
+             if best_value is not None:
+                 if (mode == "min" and now_value >= best_value) or (mode == "max" and now_value <= best_value):
+                     return
+
+             setattr(self, "best_value", now_value)
+
+             # For DeepSpeed strategy
+             if hasattr(self.trainer.strategy, "deepspeed_engine"):
+                 if not self.save_weights_only:
+                     self.trainer.strategy.deepspeed_engine.save_checkpoint(save_path, tag="deepspeed_ckpt")
+
+                 # Save a complete checkpoint
+                 if dist.get_rank() == 0:
+                     basename = os.path.basename(save_path)
+                     ckpt_path = os.path.join(save_path, f"{basename}.pt")
+                     self.save_checkpoint(ckpt_path, save_info, self.save_weights_only)
+
+             # For normal situation
+             else:
+                 if dist.get_rank() == 0:
+                     self.save_checkpoint(save_path, save_info, self.save_weights_only)
+
+     def reset_metrics(self, stage) -> None:
+         """
+         Reset metrics for the given stage
+         Args:
+             stage: "train", "valid" or "test"
+         """
+         for metric in self.metrics[stage].values():
+             metric.reset()
+
+     def get_log_dict(self, stage: str) -> dict:
+         """
+         Get the log dict for the stage
+         Args:
+             stage: "train", "valid" or "test"
+
+         Returns:
+             A dictionary of metrics for the stage. Keys are metric names and values are metric values
+         """
+         return {name: metric.compute() for name, metric in self.metrics[stage].items()}
+
+     def log_info(self, info: dict) -> None:
+         """
+         Record metrics during training and testing
+         Args:
+             info: dict of metrics
+         """
+         if getattr(self, "logger", None) is not None and dist.get_rank() == 0:
+             info["learning_rate"] = self.lr_scheduler.get_last_lr()[0]
+             info["epoch"] = self.epoch
+             self.logger.log_metrics(info, step=self.step)
+
+     def init_optimizers(self):
+         copy_optimizer_kwargs = copy.deepcopy(self.optimizer_kwargs)
+
+         # No decay for layer norm and bias
+         no_decay = ['LayerNorm.weight', 'bias']
+         weight_decay = copy_optimizer_kwargs.pop("weight_decay")
+
+         optimizer_grouped_parameters = [
+             {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
+              'weight_decay': weight_decay},
+             {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
+              'weight_decay': 0.0}
+         ]
+
+         optimizer_cls = eval(f"torch.optim.{copy_optimizer_kwargs.pop('class')}")
+         self.optimizer = optimizer_cls(optimizer_grouped_parameters,
+                                        lr=self.lr_scheduler_kwargs['init_lr'],
+                                        **copy_optimizer_kwargs)
+
+         tmp_kwargs = copy.deepcopy(self.lr_scheduler_kwargs)
+         lr_scheduler = tmp_kwargs.pop("class")
+         self.lr_scheduler = eval(lr_scheduler)(self.optimizer, **tmp_kwargs)
+
+     def configure_optimizers(self):
+         return {"optimizer": self.optimizer,
+                 "lr_scheduler": {"scheduler": self.lr_scheduler,
+                                  "interval": "step",
+                                  "frequency": 1}
+                 }
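A minimal sketch of the contract a subclass has to fulfil. ToyModel, its metric, and the kwargs-style batch are illustrative assumptions; note that training_step unpacks each batch as (inputs, labels) and calls self(**inputs), so inputs must be a dict of keyword arguments:

    import torch
    import torchmetrics
    from model.abstract_model import AbstractModel

    class ToyModel(AbstractModel):
        def initialize_model(self) -> None:
            # The whole network must be registered as ``self.model`` for saving/loading
            self.model = torch.nn.Linear(16, 2)

        def initialize_metrics(self, stage: str) -> dict:
            return {f"{stage}_acc": torchmetrics.Accuracy(task="multiclass", num_classes=2)}

        def forward(self, x):
            return self.model(x)

        def loss_func(self, stage, outputs, labels):
            getattr(self, f"{stage}_acc").update(outputs, labels)
            return torch.nn.functional.cross_entropy(outputs, labels)

    model = ToyModel()  # default AdamW optimizer and ConstantLRScheduler kick in

A dataloader for this sketch would then yield tuples like ({"x": features}, labels).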
model/model_interface.py ADDED
@@ -0,0 +1,104 @@
+ import os
+ import yaml
+ import glob
+
+
+ # register all available models through *_model.py files
+ # def construct_model():
+ #     model_dir = os.path.dirname(__file__)
+ #
+ #     # lists all model files
+ #     model_list = []
+ #     for root, _, names in os.walk(model_dir):
+ #         for name in names:
+ #             if name.endswith('_model.py'):
+ #                 sub_dirs = root.replace(model_dir, '').split(os.sep)
+ #                 model_list.append((sub_dirs, name[:-3]))
+ #
+ #     # load model_config.yaml, controlling which models are to be loaded
+ #     model_config = yaml.safe_load(open(f"{model_dir}/model_config.yaml", "r"))
+ #
+ #     if model_config["verbose"]:
+ #         print("*" * 30 + " Loading model " + "*" * 30)
+ #
+ #     # register models
+ #     for sub_dirs, name in model_list:
+ #         if name in model_config["models"]:
+ #             if len(sub_dirs) > 1:
+ #                 cmd = f"from {'.'.join(sub_dirs)} import {name}"
+ #             else:
+ #                 cmd = f"from . import {name}"
+ #
+ #             exec(cmd)
+ #
+ #             if model_config["verbose"]:
+ #                 info = f"Loaded model: {name}"
+ #                 print(f"\033[32m{info}\033[0m")
+ #         else:
+ #             if model_config["verbose"]:
+ #                 info = f"Skipped model: {name}"
+ #                 print(f"\033[31m{info}\033[0m")
+ #
+ #     if model_config["verbose"]:
+ #         print("*" * 75)
+ #
+ #
+ # # register function as a wrapper for all models
+ # def register_model(cls):
+ #     model_dict[cls.__name__] = cls
+ #     return cls
+ #
+ #
+ # model_dict = {}
+ # construct_model()
+ #
+ #
+ # class ModelInterface:
+ #     @classmethod
+ #     def get_available_models(cls):
+ #         return model_dict.keys()
+ #
+ #     @classmethod
+ #     def init_model(cls, model: str, **kwargs):
+ #         """
+ #         Args:
+ #             model   : Class name of the model you want to use. Must be in model_dict.keys()
+ #             **kwargs: Kwargs for model initialization
+ #
+ #         Returns: Corresponding model
+ #         """
+ #         assert model in model_dict.keys(), f"class {model} doesn't exist!"
+ #         return model_dict[model](**kwargs)
+
+
+ ########################################################################
+ #                              Version 2                               #
+ ########################################################################
+ # register function as a wrapper for all models
+ def register_model(cls):
+     global now_cls
+     now_cls = cls
+     return cls
+
+
+ now_cls = None
+
+
+ class ModelInterface:
+     @classmethod
+     def init_model(cls, model_py_path: str, **kwargs):
+         """
+         Args:
+             model_py_path: Path of the .py file of the model you want to use.
+             **kwargs: Kwargs for model initialization
+
+         Returns: Corresponding model
+         """
+         sub_dirs = model_py_path.split(os.sep)
+         cmd = f"from {'.' + '.'.join(sub_dirs[:-1])} import {sub_dirs[-1]}"
+         exec(cmd)
+
+         return now_cls(**kwargs)
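A sketch of how the version-2 registry appears intended to be used (the file and class names are hypothetical): decorating a class in a model file sets the module-level now_cls when that file is imported, and init_model performs exactly that import, relative to the model package, before instantiating:

    # model/ProtTrek/protrek_model.py (hypothetical model file)
    from ..model_interface import register_model
    from ..abstract_model import AbstractModel

    @register_model
    class ProtTrekModel(AbstractModel):
        ...

    # caller side: path is relative to the model package, without the .py suffix
    from model.model_interface import ModelInterface
    model = ModelInterface.init_model("ProtTrek/protrek_model", save_path=None)

Because now_cls is a single global, the most recently imported model file wins; each init_model call should therefore be matched to the model it instantiates.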
utils/constants.py ADDED
@@ -0,0 +1,54 @@
+ import itertools
+
+
+ aa_set = {"A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"}
+ aa_list = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"]
+
+ foldseek_seq_vocab = "ACDEFGHIKLMNPQRSTVWY#"
+ foldseek_struc_vocab = "pynwrqhgdlvtmfsaeikc#"
+
+ struc_unit = "abcdefghijklmnopqrstuvwxyz"
+
+
+ def create_vocab(size: int) -> dict:
+     """
+     Args:
+         size: Size of the vocabulary
+
+     Returns:
+         vocab: Vocabulary
+     """
+     token_len = 1
+     while size > len(struc_unit) ** token_len:
+         token_len += 1
+
+     vocab = {}
+     for i, token in enumerate(itertools.product(struc_unit, repeat=token_len)):
+         vocab[i] = "".join(token)
+         if len(vocab) == size:
+             vocab[i + 1] = "#"
+             return vocab
+
+
+ # ProTrek
+ residue_level = {"Active site", "Binding site", "Site", "DNA binding", "Natural variant", "Mutagenesis",
+                  "Transmembrane", "Topological domain", "Intramembrane", "Signal peptide", "Propeptide",
+                  "Transit peptide",
+                  "Chain", "Peptide", "Modified residue", "Lipidation", "Glycosylation", "Disulfide bond",
+                  "Cross-link",
+                  "Domain", "Repeat", "Compositional bias", "Region", "Coiled coil", "Motif"}
+
+ sequence_level = {"Function", "Miscellaneous", "Caution", "Catalytic activity", "Cofactor", "Activity regulation",
+                   "Biophysicochemical properties", "Pathway", "Involvement in disease", "Allergenic properties",
+                   "Toxic dose", "Pharmaceutical use", "Disruption phenotype", "Subcellular location",
+                   "Post-translational modification", "Subunit", "Domain (non-positional annotation)",
+                   "Sequence similarities", "RNA Editing", "Tissue specificity", "Developmental stage", "Induction",
+                   "Biotechnology", "Polymorphism", "GO annotation", "Proteomes", "Protein names", "Gene names",
+                   "Organism", "Taxonomic lineage", "Virus host"}
+
+ raw_text_level = {"Function", "Subunit", "Tissue specificity", "Disruption phenotype", "Post-translational modification",
+                   "Induction", "Miscellaneous", "Sequence similarities", "Developmental stage",
+                   "Domain (non-positional annotation)", "Activity regulation", "Caution", "Polymorphism", "Toxic dose",
+                   "Allergenic properties", "Pharmaceutical use", "Cofactor", "Biophysicochemical properties",
+                   "Subcellular location", "RNA Editing"}
utils/foldseek_util.py ADDED
@@ -0,0 +1,121 @@
+ import os
+ import time
+ import json
+ import numpy as np
+ import re
+ import sys
+ sys.path.append(".")
+
+
+ # Get structural seqs from pdb file
+ def get_struc_seq(foldseek,
+                   path,
+                   chains: list = None,
+                   process_id: int = 0,
+                   plddt_mask: bool = False,
+                   plddt_threshold: float = 70.,
+                   foldseek_verbose: bool = False) -> dict:
+     """
+     Args:
+         foldseek: Binary executable file of foldseek
+
+         path: Path to pdb file
+
+         chains: Chains to be extracted from pdb file. If None, all chains will be extracted.
+
+         process_id: Process ID for temporary files. This is used for parallel processing.
+
+         plddt_mask: If True, mask regions with plddt < plddt_threshold. plddt scores are from the pdb file.
+
+         plddt_threshold: Threshold for plddt. If plddt is lower than this value, the structure will be masked.
+
+         foldseek_verbose: If True, foldseek will print verbose messages.
+
+     Returns:
+         seq_dict: A dict of structural seqs. The keys are chain IDs. The values are tuples of
+         (seq, struc_seq, combined_seq).
+     """
+     assert os.path.exists(foldseek), f"Foldseek not found: {foldseek}"
+     assert os.path.exists(path), f"PDB file not found: {path}"
+
+     tmp_save_path = f"get_struc_seq_{process_id}_{time.time()}.tsv"
+     if foldseek_verbose:
+         cmd = f"{foldseek} structureto3didescriptor --threads 1 --chain-name-mode 1 {path} {tmp_save_path}"
+     else:
+         cmd = f"{foldseek} structureto3didescriptor -v 0 --threads 1 --chain-name-mode 1 {path} {tmp_save_path}"
+     os.system(cmd)
+
+     seq_dict = {}
+     name = os.path.basename(path)
+     with open(tmp_save_path, "r") as r:
+         for i, line in enumerate(r):
+             desc, seq, struc_seq = line.split("\t")[:3]
+
+             # Mask low plddt
+             if plddt_mask:
+                 plddts = extract_plddt(path)
+                 assert len(plddts) == len(struc_seq), f"Length mismatch: {len(plddts)} != {len(struc_seq)}"
+
+                 # Mask regions with plddt < threshold
+                 indices = np.where(plddts < plddt_threshold)[0]
+                 np_seq = np.array(list(struc_seq))
+                 np_seq[indices] = "#"
+                 struc_seq = "".join(np_seq)
+
+             name_chain = desc.split(" ")[0]
+             chain = name_chain.replace(name, "").split("_")[-1]
+
+             if chains is None or chain in chains:
+                 if chain not in seq_dict:
+                     combined_seq = "".join([a + b.lower() for a, b in zip(seq, struc_seq)])
+                     seq_dict[chain] = (seq, struc_seq, combined_seq)
+
+     os.remove(tmp_save_path)
+     os.remove(tmp_save_path + ".dbtype")
+     return seq_dict
+
+
+ def extract_plddt(pdb_path: str) -> np.ndarray:
+     """
+     Extract plddt scores from pdb file.
+     Args:
+         pdb_path: Path to pdb file.
+
+     Returns:
+         plddts: plddt scores.
+     """
+     with open(pdb_path, "r") as r:
+         plddt_dict = {}
+         for line in r:
+             line = re.sub(' +', ' ', line).strip()
+             splits = line.split(" ")
+
+             if splits[0] == "ATOM":
+                 # If position < 1000
+                 if len(splits[4]) == 1:
+                     pos = int(splits[5])
+
+                 # If position >= 1000, the blank will be removed, e.g. "A 999" -> "A1000"
+                 # So the length of splits[4] is not 1
+                 else:
+                     pos = int(splits[4][1:])
+
+                 plddt = float(splits[-2])
+
+                 if pos not in plddt_dict:
+                     plddt_dict[pos] = [plddt]
+                 else:
+                     plddt_dict[pos].append(plddt)
+
+     plddts = np.array([np.mean(v) for v in plddt_dict.values()])
+     return plddts
+
+
+ if __name__ == '__main__':
+     foldseek = "/sujin/bin/foldseek"
+     # test_path = "/sujin/Datasets/PDB/all/6xtd.cif"
+     test_path = "/sujin/Datasets/FLIP/meltome/af2_structures/A0A061ACX4.pdb"
+     # plddt scores are parsed directly from the pdb file, so no separate json file is needed
+     res = get_struc_seq(foldseek, test_path, plddt_mask=True, plddt_threshold=70.)
+     print(res["A"][1].lower())
utils/lr_scheduler.py ADDED
@@ -0,0 +1,187 @@
+ import math
+
+ from torch.optim.lr_scheduler import _LRScheduler, CosineAnnealingLR
+
+
+ class ConstantLRScheduler(_LRScheduler):
+     def __init__(self,
+                  optimizer,
+                  last_epoch: int = -1,
+                  verbose: bool = False,
+                  init_lr: float = 0.,
+                  ):
+         """
+         This is an implementation of a constant learning rate scheduler.
+         Args:
+             optimizer: Optimizer
+
+             last_epoch: The index of last epoch. Default: -1
+
+             verbose: If ``True``, prints a message to stdout for each update. Default: ``False``
+
+             init_lr: Initial learning rate
+         """
+
+         self.init_lr = init_lr
+         super().__init__(optimizer, last_epoch, verbose)
+
+     def state_dict(self):
+         state_dict = {k: v for k, v in self.__dict__.items() if k not in ["optimizer"]}
+         return state_dict
+
+     def load_state_dict(self, state_dict):
+         self.__dict__.update(state_dict)
+
+     def get_lr(self):
+         if not self._get_lr_called_within_step:
+             raise RuntimeError(
+                 "To get the last learning rate computed by the scheduler, use "
+                 "get_last_lr()"
+             )
+
+         return [self.init_lr for group in self.optimizer.param_groups]
+
+
+ class CosineAnnealingLRScheduler(_LRScheduler):
+     def __init__(self,
+                  optimizer,
+                  last_epoch: int = -1,
+                  verbose: bool = False,
+                  init_lr: float = 0.,
+                  max_lr: float = 4e-4,
+                  final_lr: float = 4e-5,
+                  warmup_steps: int = 2000,
+                  cosine_steps: int = 10000,
+                  ):
+         """
+         This is an implementation of a cosine annealing learning rate scheduler.
+         Args:
+             optimizer: Optimizer
+
+             last_epoch: The index of last epoch. Default: -1
+
+             verbose: If ``True``, prints a message to stdout for each update. Default: ``False``
+
+             init_lr: Initial learning rate
+
+             max_lr: Maximum learning rate after warmup
+
+             final_lr: Final learning rate after decay
+
+             warmup_steps: Number of steps for warmup
+
+             cosine_steps: Number of steps for cosine annealing
+         """
+
+         self.init_lr = init_lr
+         self.max_lr = max_lr
+         self.final_lr = final_lr
+         self.warmup_steps = warmup_steps
+         self.cosine_steps = cosine_steps
+         super(CosineAnnealingLRScheduler, self).__init__(optimizer, last_epoch, verbose)
+
+     def state_dict(self):
+         state_dict = {k: v for k, v in self.__dict__.items() if k not in ["optimizer"]}
+         return state_dict
+
+     def load_state_dict(self, state_dict):
+         self.__dict__.update(state_dict)
+
+     def get_lr(self):
+         if not self._get_lr_called_within_step:
+             raise RuntimeError(
+                 "To get the last learning rate computed by the scheduler, use "
+                 "get_last_lr()"
+             )
+
+         step_no = self.last_epoch
+
+         if step_no <= self.warmup_steps:
+             lr = self.init_lr + step_no / self.warmup_steps * (self.max_lr - self.init_lr)
+
+         else:
+             lr = self.final_lr + 0.5 * (self.max_lr - self.final_lr) \
+                  * (1 + math.cos(math.pi * (step_no - self.warmup_steps) / self.cosine_steps))
+
+         return [lr for group in self.optimizer.param_groups]
+
+
+ class Esm2LRScheduler(_LRScheduler):
+     def __init__(self,
+                  optimizer,
+                  last_epoch: int = -1,
+                  verbose: bool = False,
+                  init_lr: float = 0.,
+                  max_lr: float = 4e-4,
+                  final_lr: float = 4e-5,
+                  warmup_steps: int = 2000,
+                  start_decay_after_n_steps: int = 500000,
+                  end_decay_after_n_steps: int = 5000000,
+                  on_use: bool = True,
+                  ):
+         """
+         This is an implementation of ESM2's learning rate scheduler.
+         Args:
+             optimizer: Optimizer
+
+             last_epoch: The index of last epoch. Default: -1
+
+             verbose: If ``True``, prints a message to stdout for each update. Default: ``False``
+
+             init_lr: Initial learning rate
+
+             max_lr: Maximum learning rate after warmup
+
+             final_lr: Final learning rate after decay
+
+             warmup_steps: Number of steps for warmup
+
+             start_decay_after_n_steps: Start decay after this number of steps
+
+             end_decay_after_n_steps: End decay after this number of steps
+
+             on_use: Whether to use this scheduler. If ``False``, the scheduler will not change the learning rate
+                     and will only use the ``init_lr``. Default: ``True``
+         """
+
+         self.init_lr = init_lr
+         self.max_lr = max_lr
+         self.final_lr = final_lr
+         self.warmup_steps = warmup_steps
+         self.start_decay_after_n_steps = start_decay_after_n_steps
+         self.end_decay_after_n_steps = end_decay_after_n_steps
+         self.on_use = on_use
+         super(Esm2LRScheduler, self).__init__(optimizer, last_epoch, verbose)
+
+     def state_dict(self):
+         state_dict = {k: v for k, v in self.__dict__.items() if k not in ["optimizer"]}
+         return state_dict
+
+     def load_state_dict(self, state_dict):
+         self.__dict__.update(state_dict)
+
+     def get_lr(self):
+         if not self._get_lr_called_within_step:
+             raise RuntimeError(
+                 "To get the last learning rate computed by the scheduler, use "
+                 "get_last_lr()"
+             )
+
+         step_no = self.last_epoch
+         if not self.on_use:
+             return [base_lr for base_lr in self.base_lrs]
+
+         if step_no <= self.warmup_steps:
+             lr = self.init_lr + step_no / self.warmup_steps * (self.max_lr - self.init_lr)
+
+         elif step_no <= self.start_decay_after_n_steps:
+             lr = self.max_lr
+
+         elif step_no <= self.end_decay_after_n_steps:
+             portion = (step_no - self.start_decay_after_n_steps) / (self.end_decay_after_n_steps - self.start_decay_after_n_steps)
+             lr = self.max_lr - portion * (self.max_lr - self.final_lr)
+
+         else:
+             lr = self.final_lr
+
+         return [lr for group in self.optimizer.param_groups]
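A small sketch of driving one of these schedulers by hand (the single dummy parameter is an assumption for illustration): the first warmup_steps steps move the learning rate linearly from init_lr to max_lr, then the chosen decay takes over:

    import torch
    from utils.lr_scheduler import Esm2LRScheduler

    param = torch.nn.Parameter(torch.zeros(1))
    optimizer = torch.optim.AdamW([param], lr=0.)
    scheduler = Esm2LRScheduler(optimizer, init_lr=0., max_lr=4e-4, final_lr=4e-5,
                                warmup_steps=2000, start_decay_after_n_steps=500000,
                                end_decay_after_n_steps=5000000)

    for _ in range(2000):
        optimizer.step()
        scheduler.step()
    print(scheduler.get_last_lr())  # ~[4e-4] at the end of warmup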
utils/mpr.py ADDED
@@ -0,0 +1,397 @@
+ import abc
+ import os
+ import time
+ import sys
+
+
+ from tqdm import tqdm
+ from math import ceil
+
+
+ class MultipleProcessRunner:
+     """
+     Abstract class for running tasks with multiple processes
+     There are three abstract methods that should be implemented:
+         1. __len__()     : return the length of data
+         2. _target()     : target function for each process
+         3. _aggregate()  : aggregate results from each process
+     """
+
+     def __init__(self,
+                  data,
+                  save_path=None,
+                  n_process=1,
+                  verbose=True,
+                  total_only=True,
+                  log_step=1,
+                  start_method='fork'):
+         """
+         Args:
+             data      : data to be processed that can be sliced
+
+             save_path : final output path
+
+             n_process : number of processes
+
+             verbose   : if True, display progress bar
+
+             total_only: if True, only the total progress bar is displayed
+
+             log_step  : for the total progress bar, the next log will be printed when
+                         ``current iteration`` - ``last log iteration`` >= log_step
+
+             start_method: start method for multiprocessing
+         """
+         self.data = data
+         self.save_path = save_path
+         self.n_process = n_process
+         self.verbose = verbose
+         self.total_only = total_only
+         self.log_step = log_step
+         self.start_method = start_method
+
+         # get terminal width to format output
+         try:
+             self.terminal_y = os.get_terminal_size()[0]
+
+         except Exception as e:
+             print(e)
+             print("Can't get terminal size, set terminal_y = None")
+             self.terminal_y = None
+
+     def _s2hms(self, seconds: float):
+         """
+         Convert the second format of time into the hour:minute:second format
+         """
+         m, s = divmod(seconds, 60)
+         h, m = divmod(m, 60)
+
+         return "%02d:%02d:%02d" % (h, m, s)
+
+     def _display_time(self, st_time, now, total):
+         ed_time = time.time()
+         running_time = ed_time - st_time
+         rest_time = running_time * (total - now) / now
+         iter_sec = f"{now / running_time:.2f}it/s" if now > running_time else f"{running_time / now:.2f}s/it"
+
+         return f' [{self._s2hms(running_time)} < {self._s2hms(rest_time)}, {iter_sec}]'
+
+     def _display_bar(self, now, total, length):
+         now = now if now <= total else total
+         num = now * length // total
+         progress_bar = '[' + '#' * num + '_' * (length - num) + ']'
+         return progress_bar
+
+     def _display_all(self, now, total, desc, st_time):
+         # make a progress bar
+         length = 50
+         progress_bar = self._display_bar(now, total, length)
+         time_display = self._display_time(st_time, now, total)
+
+         display = f'{desc}{progress_bar} {int(now / total * 100):02d}% {now}/{total}{time_display}'
+
+         # Clean a line
+         width = self.terminal_y if self.terminal_y is not None else 100
+         num_space = width - len(display)
+         if num_space > 0:
+             display += ' ' * num_space
+         else:
+             length += num_space
+             progress_bar = self._display_bar(now, total, length)
+             display = f'{desc}{progress_bar} {int(now / total * 100):02d}% {now}/{total}{time_display}'
+
+         # Set color
+         display = f"\033[31m{display}\033[0m"
+
+         return display
+
+     # Print progress bar at a specific position in the terminal
+     def terminal_progress_bar(self,
+                               process_id: int,
+                               now: int,
+                               total: int,
+                               desc: str = ''):
+         """
+         Args:
+             process_id: process id
+             now: current iteration number
+             total: total iteration number
+             desc: description
+         """
+         st_time = self.process_st_time[process_id]
+
+         # Aggregate total information
+         self.counts[process_id] = now
+         self._total_display(self.process_st_time["total"])
+
+         if not self.total_only:
+             process_display = self._display_all(now, total, desc, st_time)
+             if self.terminal_y is not None:
+                 sys.stdout.write(f"\x1b7\x1b[{process_id + 1};{0}f{process_display}\x1b8")
+                 sys.stdout.flush()
+             else:
+                 print(f"\x1b7\x1b[{process_id + 1};{0}f{process_display}\x1b8", flush=True)
+
+     # Print global information
+     def _total_display(self, st_time):
+         if self.total_display_callable.value == 1:
+             self.total_display_callable.value = 0
+
+             cnt = sum([self.counts[i] for i in range(self.n_process)])
+             if cnt - self.last_cnt.value >= self.log_step:
+                 total_display = self._display_all(cnt, self.__len__(), "Total: ", st_time)
+                 self.last_cnt.value = cnt
+
+                 x = self.n_process + 1 if not self.total_only else 0
+                 # if self.terminal_y is not None:
+                 #     sys.stdout.write(f"\x1b7\x1b[{x};{0}f{total_display}\x1b8")
+                 #     sys.stdout.flush()
+                 # else:
+                 #     print(f"\x1b7\x1b[{x};{0}f{total_display}\x1b8", flush=True)
+                 print(f"\r\x1b7\x1b[{x};{0}f{total_display}\x1b8", flush=True, end="")
+
+             self.total_display_callable.value = 1
+
+     def run(self):
+         """
+         This function is used to run a multi-process task
+         Returns: the result of the function '_aggregate()'
+         """
+
+         import multiprocess as mp
+         mp.set_start_method(self.start_method, force=True)
+
+         # total number of data that is already processed
+         self.counts = mp.Manager().dict({i: 0 for i in range(self.n_process)})
+
+         # record start time for each process
+         self.process_st_time = {"total": time.time()}
+
+         # set a lock to call total number display
+         self.total_display_callable = mp.Value('d', 1)
+
+         # Save last log iteration number
+         self.last_cnt = mp.Value('d', 0)
+
+         num_per_process = ceil(self.__len__() / self.n_process)
+
+         if self.save_path is not None:
+             file_name, suffix = os.path.splitext(self.save_path)
+
+         process_list = []
+         sub_paths = []
+         for i in range(self.n_process):
+             st = i * num_per_process
+             ed = st + num_per_process
+
+             # construct slice and sub path for sub process
+             data_slice = self.data[st: ed]
+
+             sub_path = None
+             # Create a directory to save sub-results
+             if self.save_path is not None:
+                 save_dir = f"{file_name}{suffix}_temp"
+                 os.makedirs(save_dir, exist_ok=True)
+                 sub_path = f"{save_dir}/temp_{i}{suffix}"
+
+             # construct sub process
+             input_args = (i, data_slice, sub_path)
+             self.process_st_time[i] = time.time()
+             p = mp.Process(target=self._target, args=input_args)
+             p.start()
+
+             process_list.append(p)
+             sub_paths.append(sub_path)
+
+         for p in process_list:
+             p.join()
+
+         # aggregate results and remove temporary directory
+         results = self._aggregate(self.save_path, sub_paths)
+         if self.save_path is not None:
+             save_dir = f"{file_name}{suffix}_temp"
+             os.rmdir(save_dir)
+
+         return results
+
+     def parallel_run(self):
+         import multiprocess as mp
+         from joblib import Parallel, delayed
+
+         # total number of data that is already processed
+         self.counts = mp.Manager().dict({i: 0 for i in range(self.n_process)})
+
+         # record start time for each process
+         self.process_st_time = {"total": time.time()}
+
+         # set a lock to call total number display
+         self.total_display_callable = mp.Value('d', 1)
+
+         # Save last log iteration number
+         self.last_cnt = mp.Value('d', 0)
+
+         num_per_process = ceil(self.__len__() / self.n_process)
+
+         if self.save_path is not None:
+             file_name, suffix = os.path.splitext(self.save_path)
+
+         sub_paths = []
+         input_arg_list = []
+         for i in range(self.n_process):
+             st = i * num_per_process
+             ed = st + num_per_process
+
+             # construct slice and sub path for sub process
+             data_slice = self.data[st: ed]
+
+             sub_path = None
+             # Create a directory to save sub-results
+             if self.save_path is not None:
+                 save_dir = f"{file_name}{suffix}_temp"
+                 os.makedirs(save_dir, exist_ok=True)
+                 sub_path = f"{save_dir}/temp_{i}{suffix}"
+
+             # construct sub process
+             input_args = (i, data_slice, sub_path)
+             self.process_st_time[i] = time.time()
+
+             sub_paths.append(sub_path)
+             input_arg_list.append(input_args)
+
+         # Start parallel processing; each arg tuple is unpacked into (process_id, data, sub_path)
+         Parallel(n_jobs=self.n_process)(delayed(self._target)(*input_args) for input_args in input_arg_list)
+
+         # aggregate results and remove temporary directory
+         results = self._aggregate(self.save_path, sub_paths)
+         if self.save_path is not None:
+             save_dir = f"{file_name}{suffix}_temp"
+             os.rmdir(save_dir)
+
+         return results
+
+     @abc.abstractmethod
+     def _aggregate(self, final_path: str, sub_paths):
+         """
+         This function is used to aggregate results from sub processes into one file
+
+         Args:
+             final_path: path to save final results
+             sub_paths : list of sub paths
+
+         Returns: None or desirable results specified by the user
+         """
+         raise NotImplementedError
+
+     @abc.abstractmethod
+     def _target(self, process_id, data, sub_path):
+         """
+         The main body to operate on data in one process
+
+         Args:
+             process_id: process id
+             data      : data slice
+             sub_path  : sub path to save results
+         """
+         raise NotImplementedError
+
+     @abc.abstractmethod
+     def __len__(self):
+         raise NotImplementedError
+
+
+ class MultipleProcessRunnerSimplifier(MultipleProcessRunner):
+     """
+     A simplified version of MultipleProcessRunner.
+     Users only need to implement the function 'do', which is then automatically executed
+     in every iteration after calling the function 'run'.
+     If 'save_path' is specified, a file will be opened at 'sub_path' into which
+     the user can write results, and the results will be aggregated into 'save_path'.
+
+     The procedure would be like:
+         ...
+         with open(sub_path, 'w') as w:
+             for i, d in enumerate(data):
+                 self.do(process_id, i, d, w)  # You can write results into the file.
+         ...
+
+     The 'do' function should be like:
+         def do(process_id, idx, data, writer):
+             ...
+
+     If 'save_path' is None, the argument 'writer' will be set to None.
+     """
+
+     def __init__(self,
+                  data,
+                  do,
+                  save_path=None,
+                  n_process=1,
+                  verbose=True,
+                  total_only=True,
+                  log_step=1,
+                  return_results=False,
+                  start_method='fork'):
+
+         super().__init__(data=data,
+                          save_path=save_path,
+                          n_process=n_process,
+                          verbose=verbose,
+                          total_only=total_only,
+                          log_step=log_step,
+                          start_method=start_method)
+         self.do = do
+         self.return_results = return_results
+
+     def run(self):
+         self.start_time = time.time()
+         return super().run()
+
+     def _aggregate(self, final_path: str, sub_paths):
+         results = []
+
+         w = open(final_path, 'w') if final_path is not None else None
+
+         if self.verbose:
+             iterator = tqdm(enumerate(sub_paths), "Aggregating results...")
+         else:
+             iterator = enumerate(sub_paths)
+
+         for i, sub_path in iterator:
+             if sub_path is None and self.return_results:
+                 sub_path = f"MultipleProcessRunnerSimplifier_{self.start_time}_{i}.tmp"
+
+             if sub_path is not None:
+                 with open(sub_path, 'r') as r:
+                     for line in r:
+                         if w is not None:
+                             w.write(line)
+
+                         if self.return_results:
+                             results.append(line[:-1])
+
+                 os.remove(sub_path)
+
+         return results
+
+     def _target(self, process_id, data, sub_path):
+         if sub_path is None and self.return_results:
+             sub_path = f"MultipleProcessRunnerSimplifier_{self.start_time}_{process_id}.tmp"
+
+         w = open(sub_path, 'w') if sub_path is not None else None
+         for i, d in enumerate(data):
+             self.do(process_id, i, d, w)
+             if self.verbose:
+                 self.terminal_progress_bar(process_id, i + 1, len(data), f"Process{process_id} running...")
+
+         if w is not None:
+             w.close()
+
+     def __len__(self):
+         return len(self.data)