Spaces:

wilbin
/

DSCRIPT

Runtime error

App Files Files Community

wilbin committed on Feb 20

Commit

285bca4

•

1 Parent(s): 8896a5f

Upload 4 files

Browse files

Files changed (4) hide show

app.py +189 -0
dscript_architecture1.png +0 -0
predict_3di.py +354 -0
requirements.txt +7 -0

app.py ADDED Viewed

	@@ -0,0 +1,189 @@

+import time
+import gradio as gr
+import pandas as pd
+import torch
+from pathlib import Path
+from Bio import SeqIO
+from dscript.pretrained import get_pretrained
+from dscript.language_model import lm_embed
+from tqdm.auto import tqdm
+from uuid import uuid4
+from predict_3di import get_3di_sequences, predictions_to_dict, one_hot_3di_sequence
# Display name shown in the model dropdown -> name of the pretrained D-SCRIPT
# checkpoint passed to `get_pretrained`.
model_map = dict(
    [
        ("D-SCRIPT", "human_v1"),
        ("Topsy-Turvy", "human_v2"),
        ("TT3D", "human_tt3d"),
    ]
)

# Gradio theme name and page title for the interface.
theme = "Default"
title = "D-SCRIPT: Predicting Protein-Protein Interactions"

# Usage note for the page header (starts and ends with a newline).
description = (
    "\n"
    "If you use this interface to make predictions, please let us know (by emailing samsl@mit.edu)!\n"
    "We want to keep this web version free to use with GPU support, and to do that we need to demonstrate to\n"
    "our funders that it is being used. Thank you!\n"
)
+# article = """
+# <hr>
+# <img style="margin-left:auto; margin-right:auto" src="https://raw.githubusercontent.com/samsledje/D-SCRIPT/main/docs/source/img/dscript_architecture.png" alt="D-SCRIPT architecture" width="70%"/>
+# <hr>
+# D-SCRIPT is a deep learning method for predicting a physical interaction between two proteins given just their sequences.
+# It generalizes well to new species and is robust to limitations in training data size. Its design reflects the intuition that for two proteins to physically interact,
+# a subset of amino acids from each protein should be in contact with the other. The intermediate stages of D-SCRIPT directly implement this intuition, with the penultimate stage
+# in D-SCRIPT being a rough estimate of the inter-protein contact map of the protein dimer. This structurally-motivated design enhances the interpretability of the results and,
+# since structure is more conserved evolutionarily than sequence, improves generalizability across species.
+# <hr>
+# Computational methods to predict protein-protein interaction (PPI) typically segregate into sequence-based "bottom-up" methods that infer properties from the characteristics of the
+# individual protein sequences, or global "top-down" methods that infer properties from the pattern of already known PPIs in the species of interest. However, a way to incorporate
+# top-down insights into sequence-based bottom-up PPI prediction methods has been elusive. Topsy-Turvy builds upon D-SCRIPT by synthesizing both views in a sequence-based,
+# multi-scale, deep-learning model for PPI prediction. While Topsy-Turvy makes predictions using only sequence data, during the training phase it takes a transfer-learning approach by
+# incorporating patterns from both global and molecular-level views of protein interaction. In a cross-species context, we show it achieves state-of-the-art performance, offering the
+# ability to perform genome-scale, interpretable PPI prediction for non-model organisms with no existing experimental PPI data.
+# """
# Markdown note rendered below the interface (starts and ends with a newline).
article = (
    "\n"
    'Note that running here with the "TT3D" model does not run structure prediction on the sequences, but rather uses the [ProstT5](https://github.com/mheinzinger/ProstT5) language model to\n'
    "translate amino acid to 3di sequences. This is much faster than running structure prediction, but the results may not be as accurate.\n"
)

# Column index of every (upper-cased) 3Di letter in the one-hot encoding built
# by `one_hot_3di_sequence`; the position in this string is the index.
fold_vocab = {letter: index for index, letter in enumerate("DPVQAWKEITLFGSMHCRYNX")}
def predict(model_name, pairs_file, sequence_file, progress = gr.Progress()):
    """Score every protein pair of an uploaded file with a pretrained model.

    Args:
        model_name: Key of ``model_map`` selecting the pretrained model. For
            ``"TT3D"`` the sequences are additionally translated to 3Di
            sequences and passed to the model as one-hot encodings.
        pairs_file: Uploaded file object whose ``.name`` is the path of a
            two-column ``.csv`` or ``.tsv`` file of protein identifiers. The
            first row is consumed as the header by ``pandas.read_csv``.
        sequence_file: Uploaded file object whose ``.name`` is the path of a
            FASTA file containing the sequence of every listed protein.
        progress: Gradio progress tracker used to report loop progress.

    Returns:
        ``(results, file_path)``: a DataFrame with the columns ``Protein 1``,
        ``Protein 2`` and ``Interaction``, and the path of a TSV copy of it
        written to ``/tmp``.

    Raises:
        gr.Error: On invalid input or on any failure during prediction, so
            that the problem is reported to the caller instead of being
            silently dropped.
    """
    try:
        run_id = uuid4()
        device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

        # One throw-away embedding call before the real work starts.
        _ = lm_embed("M", use_cuda = (device.type == "cuda"))
        model = get_pretrained(model_map[model_name]).to(device)

        try:
            seqs = SeqIO.to_dict(SeqIO.parse(sequence_file.name, "fasta"))
        except ValueError as err:
            raise gr.Error("Invalid FASTA file - duplicate entry") from err

        suffix = Path(pairs_file.name).suffix
        if suffix == ".csv":
            pairs = pd.read_csv(pairs_file.name)
        elif suffix == ".tsv":
            pairs = pd.read_csv(pairs_file.name, sep="\t")
        else:
            # Previously `pairs` stayed unbound here and failed with a NameError.
            raise gr.Error("Invalid pairs file - must be a .csv or .tsv file")

        try:
            pairs.columns = ["protein1", "protein2"]
        except ValueError as err:
            raise gr.Error("Invalid pairs file - must have two columns 'protein1' and 'protein2'") from err

        # Fail early, with a readable message, when a pair names an unknown protein.
        needed = set(pairs["protein1"]).union(pairs["protein2"])
        missing = [str(p) for p in needed if p not in seqs]
        if missing:
            shown = ", ".join(missing[:5])
            raise gr.Error(f"{len(missing)} protein(s) in the pairs file are missing from the FASTA file, e.g.: {shown}")

        do_foldseek = model_name == "TT3D"
        foldseek_embeddings = {}
        if do_foldseek and needed:
            seqs_to_translate = {k: str(seqs[k].seq) for k in needed}
            gr.Info(f"Loading Foldseek embeddings -- this may take some time ({len(seqs_to_translate)} embeddings)...")
            predictions = get_3di_sequences(
                seqs_to_translate,
                model_dir = "Rostlab/ProstT5",
                report_fn = gr.Info,
                # gr.Error is an exception class: calling it without `raise`
                # shows nothing, so skipped batches are reported as warnings.
                error_fn = gr.Warning,
                device=device,
                )
            foldseek_sequences = predictions_to_dict(predictions)
            foldseek_embeddings = {k: one_hot_3di_sequence(s.upper(), fold_vocab) for k, s in foldseek_sequences.items()}

            # Batches that failed during embedding are skipped upstream.
            untranslated = [str(k) for k in seqs_to_translate if k not in foldseek_embeddings]
            if untranslated:
                shown = ", ".join(untranslated[:5])
                raise gr.Error(f"Could not compute 3Di sequences for {len(untranslated)} protein(s), e.g.: {shown}")

        progress(0, desc="Starting...")
        results = []
        pair_ids = list(zip(pairs["protein1"], pairs["protein2"]))
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            for prot1, prot2 in progress.tqdm(pair_ids):
                seq1 = str(seqs[prot1].seq)
                seq2 = str(seqs[prot2].seq)
                fold1 = foldseek_embeddings[prot1].to(device) if do_foldseek else None
                fold2 = foldseek_embeddings[prot2].to(device) if do_foldseek else None
                lm1 = lm_embed(seq1).to(device)
                lm2 = lm_embed(seq2).to(device)
                interaction = model.predict(lm1, lm2, embed_foldseek = do_foldseek, f0 = fold1, f1 = fold2).item()
                results.append([prot1, prot2, interaction])

        results = pd.DataFrame(results, columns = ["Protein 1", "Protein 2", "Interaction"])
        file_path = f"/tmp/{run_id}.tsv"
        with open(file_path, "w") as f:
            results.to_csv(f, sep="\t", index=False, header = True)
        return results, file_path
    except gr.Error:
        # Already a user-facing error: let it propagate unchanged.
        raise
    except Exception as e:
        # The exception must be raised; merely constructing it reports nothing.
        raise gr.Error(str(e)) from e
# Input widgets, listed in the positional order `predict` receives them.
_demo_inputs = [
    gr.Dropdown(label="Model", choices = ["D-SCRIPT", "Topsy-Turvy", "TT3D"], value = "Topsy-Turvy"),
    gr.File(label="Pairs (.csv/.tsv)", file_types = [".csv", ".tsv"]),
    gr.File(label="Sequences (.fasta)", file_types = [".fasta"]),
]

# Output widgets, matching the two values `predict` returns.
_demo_outputs = [
    gr.DataFrame(label='Results', headers=['Protein 1', 'Protein 2', 'Interaction']),
    gr.File(label="Download results", type="file"),
]

# `title` and `description` are deliberately not passed to the interface.
demo = gr.Interface(
    fn=predict,
    inputs=_demo_inputs,
    outputs=_demo_outputs,
    article=article,
    theme=theme,
)

if __name__ == "__main__":
    # Enable the request queue (at most 20 waiting requests), then serve.
    queued_demo = demo.queue(max_size=20)
    queued_demo.launch()

dscript_architecture1.png ADDED Viewed

predict_3di.py ADDED Viewed

	@@ -0,0 +1,354 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jun 16 14:27:44 2023
+@author: mheinzinger
+"""
+import argparse
+import time
+from pathlib import Path
+from urllib import request
+import shutil
+import numpy as np
+import torch
+from torch import nn
+from transformers import T5EncoderModel, T5Tokenizer
# Module-wide default device: the first CUDA GPU when available, else the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")
class CNN(nn.Module):
    """Two-layer convolutional classifier applied along the sequence axis.

    Both convolutions use a (7, 1) kernel with (3, 0) padding, so the
    sequence length is preserved: 1024 input features -> 32 channels ->
    20 output classes.
    """

    def __init__( self ):
        super().__init__()
        # The layer order fixes the parameter names of the state dict
        # (classifier.0.* and classifier.3.*).
        layers = [
            nn.Conv2d(1024, 32, kernel_size=(7, 1), padding=(3, 0)),
            nn.ReLU(),
            nn.Dropout(0.0),
            nn.Conv2d(32, 20, kernel_size=(7, 1), padding=(3, 0)),
        ]
        self.classifier = nn.Sequential(*layers)

    def forward(self, x):
        """Map per-residue features to per-residue class scores.

        With B = batch size, L = sequence length, F = number of input
        features (1024) and N = number of classes (20):

            x:       (B, L, F)
            returns: (B, N, L)
        """
        channels_first = x.permute(0, 2, 1)            # (B, F, L)
        as_image = channels_first.unsqueeze(dim=-1)    # (B, F, L, 1)
        scores = self.classifier(as_image)             # (B, N, L, 1)
        return scores.squeeze(dim=-1)                  # (B, N, L)
def one_hot_3di_sequence(sequence, vocab):
    """One-hot encode `sequence` using the column indices given by `vocab`.

    Every symbol of `sequence` must be a key of `vocab` (checked with an
    assertion). Returns a float32 tensor of shape
    (1, len(sequence), len(vocab)) with a single 1 per row.
    """
    columns = []
    for symbol in sequence:
        assert symbol in vocab
        columns.append(vocab[symbol])

    encoded = torch.zeros(len(sequence), len(vocab), dtype=torch.float32)
    if columns:
        rows = torch.arange(len(columns))
        encoded[rows, torch.tensor(columns)] = 1
    # Add the leading batch dimension.
    return encoded[None, ...]
def get_T5_model(model_dir):
    """Load the T5 encoder and its tokenizer from `model_dir`.

    The encoder is moved to the module-level `device` and switched to
    evaluation mode. Returns the tuple (encoder, tokenizer).
    """
    print(f"Loading T5 from: {model_dir}")
    encoder = T5EncoderModel.from_pretrained(model_dir)
    encoder = encoder.to(device).eval()
    tokenizer = T5Tokenizer.from_pretrained(model_dir, do_lower_case=False)
    return encoder, tokenizer
def read_fasta( fasta_path, split_char, id_field ):
    '''
        Reads in fasta file containing one or multiple sequences.

        The identifier of an entry is field `id_field` of its header line
        (without '>') after splitting on `split_char`; '/' and '.' in the
        identifier are replaced by '_'. Whitespace and gap characters ('-')
        are removed from sequence lines, and blank lines are ignored.

        Returns a dictionary mapping identifier -> sequence, or None when a
        sequence line is entirely lower-case (taken as 3Di input).

        Raises ValueError when sequence data appears before the first header.
    '''
    sequences = dict()
    uniprot_id = None  # identifier of the entry currently being read
    with open( fasta_path, 'r' ) as fasta_f:
        for line in fasta_f:
            # get uniprot ID from header and create new entry
            if line.startswith('>'):
                uniprot_id = line.replace('>', '').strip().split(split_char)[id_field]
                # replace tokens that are mis-interpreted when loading h5
                uniprot_id = uniprot_id.replace("/","_").replace(".","_")
                sequences[ uniprot_id ] = ''
                continue

            s = ''.join( line.split() ).replace("-","")
            if not s:
                # Blank line: nothing to add. Skipping it here also keeps a
                # blank line before the first header from crashing below.
                continue
            if uniprot_id is None:
                raise ValueError(
                    "Invalid FASTA file {}: sequence data found before the first header line".format(fasta_path)
                )
            if s.islower(): # sanity check to avoid mix-up of 3Di and AA input
                print("The input file was in lower-case which indicates 3Di-input." +
                      "This predictor only operates on amino-acid-input (upper-case)." +
                      "Exiting now ..."
                      )
                return None
            sequences[ uniprot_id ] += s
    return sequences
def write_predictions(predictions, out_path):
    """Write predicted class indices to `out_path` as FASTA-style records.

    `predictions` maps a sequence identifier to an iterable of class indices
    (0-19). Each index is converted to its letter, and every entry is written
    as a '>identifier' line followed by the letter string. Records are
    separated by a newline, with no trailing newline. Returns None.
    """
    # Class index -> letter; the position in the string is the index.
    ss_mapping = dict(enumerate("ACDEFGHIKLMNPQRSTVWY"))

    with open(out_path, 'w+') as out_f:
        records = []
        for seq_id, yhats in predictions.items():
            letters = "".join(ss_mapping[int(yhat)] for yhat in yhats)
            records.append(">{}\n{}".format(seq_id, letters))
        out_f.write('\n'.join(records))
    print(f"Finished writing results to {out_path}")
    return None
def predictions_to_dict(predictions):
    """Convert predicted class indices to letter strings.

    `predictions` maps a sequence identifier to an iterable of class indices
    (0-19). Returns a dictionary mapping the same identifiers to the string
    of corresponding letters.
    """
    # Class index -> letter; the position in the string is the index.
    ss_mapping = dict(enumerate("ACDEFGHIKLMNPQRSTVWY"))

    results = {}
    for seq_id, yhats in predictions.items():
        letters = [ss_mapping[int(yhat)] for yhat in yhats]
        results[seq_id] = "".join(letters)
    return results
def toCPU(tensor):
    """Detach `tensor`, move it to the CPU and return it as a NumPy array.

    For tensors with more than one dimension the last dimension is squeezed
    (which only has an effect when that dimension has size 1).
    """
    host_tensor = tensor.detach().cpu()
    if host_tensor.dim() > 1:
        host_tensor = host_tensor.squeeze(dim=-1)
    return host_tensor.numpy()
def download_file(url,local_path):
    """Download `url` and store the response body at `local_path`.

    Missing parent directories of `local_path` are created. The data is first
    written to a sibling '<name>.part' file and only moved to `local_path`
    once the transfer has finished, so an interrupted download never leaves a
    truncated file under the final name. Returns None.
    """
    local_path = Path(local_path)
    # Create all missing directory levels; tolerate an existing directory.
    local_path.parent.mkdir(parents=True, exist_ok=True)
    print("Downloading: {}".format(url))
    req = request.Request(url, headers={
          'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
      })
    partial_path = local_path.with_name(local_path.name + ".part")
    try:
        with request.urlopen(req) as response, open(partial_path, 'wb') as outfile:
            shutil.copyfileobj(response, outfile)
        partial_path.replace(local_path)
    finally:
        # Only still present when the download or the move failed.
        if partial_path.exists():
            partial_path.unlink()
    return None
def load_predictor( weights_link="https://rostlab.org/~deepppi/prostt5/cnn_chkpnt/model.pt" , device=torch.device("cpu")):
    """Build the CNN predictor and load its weights.

    The checkpoint is expected at '<current working directory>/cnn_chkpnt/model.pt'
    and is downloaded from `weights_link` when that file does not exist.
    Returns the model in evaluation mode on `device`.
    """
    predictor = CNN()
    checkpoint_p = Path.cwd().joinpath("cnn_chkpnt", "model.pt")
    # Fetch the pre-trained weights on first use.
    if not checkpoint_p.exists():
        download_file(weights_link, checkpoint_p)
    checkpoint = torch.load(checkpoint_p, map_location=device)
    predictor.load_state_dict(checkpoint["state_dict"])
    return predictor.eval().to(device)
def get_3di_sequences( seq_dict, model_dir, device,
                   max_residues=4000, max_seq_len=1000, max_batch=100,report_fn=print,error_fn=print,half_precision=False):
    """Predict a class index per residue for every sequence in `seq_dict`.

    Args:
        seq_dict: Mapping of identifier -> amino-acid sequence (str).
        model_dir: Location passed to `get_T5_model` to load encoder and tokenizer.
        device: torch device on which the encoder and the CNN predictor run.
        max_residues, max_seq_len, max_batch: Batching limits. A batch is
            processed when it holds `max_batch` sequences, when its residue
            count (with the newest sequence counted twice) reaches
            `max_residues`, when the newest sequence is longer than
            `max_seq_len`, or when the last sequence has been added.
        report_fn: Callable receiving progress messages.
        error_fn: Callable receiving error messages.
        half_precision: Run encoder and predictor in fp16 when True.

    Returns:
        dict mapping identifier -> 1-D NumPy array (dtype np.byte) holding one
        class index per residue. Sequences of a batch whose embedding raised
        a RuntimeError are reported through `error_fn` and left out.

    Raises:
        RuntimeError: If a prediction does not have the length of its sequence.
    """
    predictions = dict()
    report_fn('Total number of sequences: {}'.format(len(seq_dict)))
    if not seq_dict:
        # Nothing to do; this also avoids dividing by zero for the statistics.
        return predictions

    prefix = "<AA2fold>"
    model, vocab = get_T5_model(model_dir)
    # get_T5_model uses the module-level device; honour the caller's choice.
    model = model.to(device)
    predictor = load_predictor(device=device)

    if half_precision:
        model = model.half()
        predictor = predictor.half()

    avg_length = sum([ len(seq) for _, seq in seq_dict.items()]) / len(seq_dict)
    n_long     = sum([ 1 for _, seq in seq_dict.items() if len(seq)>max_seq_len])
    # sort sequences by length to trigger OOM at the beginning
    seq_dict   = sorted( seq_dict.items(), key=lambda kv: len( kv[1] ), reverse=True )

    report_fn("Average sequence length: {}".format(avg_length))
    report_fn("Number of sequences >{}: {}".format(max_seq_len, n_long))

    start = time.time()
    batch = list()
    for seq_idx, (pdb_id, seq) in enumerate(seq_dict,1):
        # replace non-standard AAs
        seq = seq.replace('U','X').replace('Z','X').replace('O','X')
        seq_len = len(seq)
        seq = prefix + ' ' + ' '.join(list(seq))
        batch.append((pdb_id,seq,seq_len))

        # count residues in current batch and add the last sequence length to
        # avoid that batches with (n_res_batch > max_residues) get processed
        n_res_batch = sum([ s_len for  _, _, s_len in batch ]) + seq_len
        if len(batch) >= max_batch or n_res_batch>=max_residues or seq_idx==len(seq_dict) or seq_len>max_seq_len:
            pdb_ids, seqs, seq_lens = zip(*batch)
            batch = list()

            token_encoding = vocab.batch_encode_plus(seqs,
                                                     add_special_tokens=True,
                                                     padding="longest",
                                                     return_tensors='pt'
                                                     ).to(device)
            try:
                with torch.no_grad():
                    embedding_repr = model(token_encoding.input_ids,
                                           attention_mask=token_encoding.attention_mask
                                           )
            except RuntimeError:
                error_fn("RuntimeError during embedding for {} (L={})".format(
                    pdb_id, seq_len)
                    )
                continue

            # ProtT5 appends a special tokens at the end of each sequence
            # Mask this also out during inference while taking into account the prefix
            for idx, s_len in enumerate(seq_lens):
                token_encoding.attention_mask[idx,s_len+1] = 0

            # extract last hidden states (=embeddings)
            residue_embedding = embedding_repr.last_hidden_state.detach()
            # mask out padded elements in the attention output (can be non-zero) for further processing/prediction
            residue_embedding = residue_embedding*token_encoding.attention_mask.unsqueeze(dim=-1)
            # slice off embedding of special token prepended before to each sequence
            residue_embedding = residue_embedding[:,1:]

            # Inference only: no autograd graph is needed for the CNN either.
            with torch.no_grad():
                prediction = predictor(residue_embedding)
            # Index of the highest-scoring class per position: (batch, length).
            prediction = torch.max( prediction, dim=1 )[1].detach().cpu().numpy().astype(np.byte)

            for batch_idx, identifier in enumerate(pdb_ids):
                s_len = seq_lens[batch_idx]
                # slice off padding and special token appended to the end of the sequence;
                # plain indexing keeps a 1-D array even for a single residue
                predictions[identifier] = prediction[batch_idx, :s_len]
                if len(predictions[identifier]) != s_len:
                    message = f"Length mismatch for {identifier}: is:{len(predictions[identifier])} vs should:{s_len}"
                    error_fn(message)
                    raise RuntimeError(message)

    end = time.time()
    report_fn('Total number of predictions: {}'.format(len(predictions)))
    # Guard against every batch having failed.
    time_per_protein = (end-start)/len(predictions) if predictions else 0.0
    report_fn('Total time: {:.2f}[s]; time/prot: {:.4f}[s]; avg. len= {:.2f}'.format(
            end-start, time_per_protein, avg_length))
    return predictions
def create_arg_parser():
    """Creates and returns the ArgumentParser object.

    Options: -i/--input and -o/--output (both required), --model,
    --split_char, --id and --half.
    """
    # Instantiate the parser
    parser = argparse.ArgumentParser(description=(
            'predict_3di.py translates the amino-acid sequence(s) of a ' +
            'FASTA-formatted text file into 3Di sequences using ProstT5. ' +
            'Example: python predict_3di.py --input /path/to/some_AA_sequences.fasta --output /path/to/some_3Di_sequences.fasta --half 1' ) )

    # Required argument
    parser.add_argument( '-i', '--input', required=True, type=str,
                    help='A path to a fasta-formatted text file containing protein sequence(s).')

    # Required argument
    parser.add_argument( '-o', '--output', required=True, type=str,
                    help='A path for saving the predicted 3Di sequences as a FASTA-formatted text file.')

    # Optional argument
    parser.add_argument('--model', required=False, type=str,
                    default="Rostlab/ProstT5",
                    help='Either a path to a directory holding the checkpoint for a pre-trained model or a huggingface repository link.' )

    # Optional argument
    parser.add_argument('--split_char', type=str,
                    default='!',
                    help='The character for splitting the FASTA header in order to retrieve ' +
                        "the protein identifier. Should be used in conjunction with --id. " +
                        "Default: '!' ")

    # Optional argument
    parser.add_argument('--id', type=int,
                    default=0,
                    help='The index of the protein identifier field after splitting the ' +
                        "FASTA header on --split_char. " +
                        'Default: 0')

    # Optional argument
    parser.add_argument('--half', type=int,
                    default=1,
                    help="Whether to use half_precision or not. Default: 1 (half-precision)")
    return parser
def main():
    """Command-line entry point: read a FASTA file, predict, write the result.

    Parses the command line, reads the input sequences with `read_fasta`,
    runs `get_3di_sequences` on the module-level `device` and writes the
    predictions with `write_predictions`.
    """
    parser     = create_arg_parser()
    args       = parser.parse_args()

    seq_path   = Path( args.input ) # path to input FASTAS
    out_path   = Path( args.output) # path where predictions should be written to
    model_dir  = args.model # path/repo_link to checkpoint

    if out_path.is_file():
        print("Output file is already existing and will be overwritten ...")

    split_char = args.split_char
    id_field   = args.id

    half_precision = int(args.half) != 0
    # `device` is a torch.device; comparing it with the string "cpu" is never
    # true, so the device type has to be checked instead.
    if half_precision and device.type == "cpu":
        parser.error("Running fp16 on CPU is not supported, yet (use --half 0)")

    seq_dict = read_fasta( seq_path, split_char, id_field )
    if seq_dict is None:
        # read_fasta has already printed why the input was rejected.
        raise SystemExit(1)

    predictions = get_3di_sequences(
        seq_dict,
        model_dir,
        device,
        half_precision=half_precision,
        )

    print("Writing results now to disk ...")
    write_predictions(predictions,out_path)


if __name__ == '__main__':
    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+dscript>=0.2.6
+biopython
+pandas
+tqdm
+transformers
+sentencepiece
+protobuf