File size: 7,800 Bytes
e09f17f 809fb87 e09f17f 809fb87 ff2b104 e09f17f 809fb87 ff2b104 e09f17f ff2b104 e09f17f a1d4104 c729a21 e09f17f a8ca144 e09f17f a8ca144 e09f17f a8ca144 e09f17f a8ca144 e09f17f a8ca144 e09f17f a8ca144 a73e54e e09f17f a8ca144 e09f17f d43f920 c729a21 e09f17f c729a21 a73e54e c729a21 a73e54e c729a21 a73e54e c729a21 809fb87 c729a21 809fb87 c729a21 a73e54e c729a21 809fb87 e09f17f 809fb87 ff2b104 a73e54e e09f17f a73e54e a8ca144 e09f17f 809fb87 a73e54e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
import time
import gradio as gr
import pandas as pd
import torch
from pathlib import Path
from Bio import SeqIO
from dscript.pretrained import get_pretrained
from dscript.language_model import lm_embed
from tqdm.auto import tqdm
from uuid import uuid4
from predict_3di import get_3di_sequences, predictions_to_dict, one_hot_3di_sequence
# Display name offered in the UI -> identifier handed to get_pretrained().
model_map = {
    "D-SCRIPT": "human_v1",
    "Topsy-Turvy": "human_v2",
    "TT3D": "human_tt3d",
}
# Theme name passed to gr.Interface(theme=...).
theme = "Default"
# Heading passed to gr.Interface(title=...).
title = "D-SCRIPT: Predicting Protein-Protein Interactions"
# Text passed to gr.Interface(description=...); asks users to report usage by email.
description = """
If you use this interface to make predictions, please let us know (by emailing samsl@mit.edu)!
We want to keep this web version free to use with GPU support, and to do that we need to demonstrate to
our funders that it is being used. Thank you!
"""
# article = """
# <hr>
# <img style="margin-left:auto; margin-right:auto" src="https://raw.githubusercontent.com/samsledje/D-SCRIPT/main/docs/source/img/dscript_architecture.png" alt="D-SCRIPT architecture" width="70%"/>
# <hr>
# D-SCRIPT is a deep learning method for predicting a physical interaction between two proteins given just their sequences.
# It generalizes well to new species and is robust to limitations in training data size. Its design reflects the intuition that for two proteins to physically interact,
# a subset of amino acids from each protein should be in contact with the other. The intermediate stages of D-SCRIPT directly implement this intuition, with the penultimate stage
# in D-SCRIPT being a rough estimate of the inter-protein contact map of the protein dimer. This structurally-motivated design enhances the interpretability of the results and,
# since structure is more conserved evolutionarily than sequence, improves generalizability across species.
# <hr>
# Computational methods to predict protein-protein interaction (PPI) typically segregate into sequence-based "bottom-up" methods that infer properties from the characteristics of the
# individual protein sequences, or global "top-down" methods that infer properties from the pattern of already known PPIs in the species of interest. However, a way to incorporate
# top-down insights into sequence-based bottom-up PPI prediction methods has been elusive. Topsy-Turvy builds upon D-SCRIPT by synthesizing both views in a sequence-based,
# multi-scale, deep-learning model for PPI prediction. While Topsy-Turvy makes predictions using only sequence data, during the training phase it takes a transfer-learning approach by
# incorporating patterns from both global and molecular-level views of protein interaction. In a cross-species context, we show it achieves state-of-the-art performance, offering the
# ability to perform genome-scale, interpretable PPI prediction for non-model organisms with no existing experimental PPI data.
# """
# Text passed to gr.Interface(article=...): documents the expected input file
# formats and notes that the "TT3D" option translates sequences to 3di with
# ProstT5 instead of running structure prediction.
article = """
Pairs file should be a comma-separated or tab-separated (.csv/.tsv) file with two columns, "protein1" and "protein2", where each row contains the names of two proteins. The sequences should be a FASTA file with the corresponding protein names as the headers.
Note that running here with the "TT3D" model does not run structure prediction on the sequences, but rather uses the [ProstT5](https://github.com/mheinzinger/ProstT5) language model to
translate amino acid to 3di sequences. This is much faster than running structure prediction, but the results may not be as accurate.
"""
# Letter -> index table handed to one_hot_3di_sequence() when encoding the
# upper-cased 3di sequences. The position of each letter in the string below
# is its index (0..20), with "X" last.
fold_vocab = {
    letter: index for index, letter in enumerate("DPVQAWKEITLFGSMHCRYNX")
}
def _load_pairs(pairs_path):
    """Read the protein-pair table from a .csv or .tsv file.

    The separator is picked from the file extension (case-insensitive) and
    the two columns are renamed to "protein1" and "protein2". Values are read
    as strings so that numeric-looking protein names can still be looked up
    in the FASTA dictionary, whose keys are strings.

    Args:
        pairs_path: Path of the uploaded pairs file.

    Returns:
        A pandas DataFrame with columns "protein1" and "protein2".

    Raises:
        gr.Error: If the extension is not .csv/.tsv, or the table does not
            have exactly two columns.
    """
    separators = {".csv": ",", ".tsv": "\t"}
    suffix = Path(pairs_path).suffix.lower()
    if suffix not in separators:
        # Without this guard an unsupported extension left `pairs` unbound
        # and surfaced to the user as an opaque NameError.
        raise gr.Error("Invalid pairs file - must be a .csv or .tsv file")

    pairs = pd.read_csv(pairs_path, sep=separators[suffix], dtype=str)
    try:
        pairs.columns = ["protein1", "protein2"]
    except ValueError as e:
        print(e)
        raise gr.Error("Invalid pairs file - must have two columns 'protein1' and 'protein2'") from e
    return pairs


def predict(model_name, pairs_file, sequence_file, progress=gr.Progress()):
    """Score every protein pair in `pairs_file` with the selected model.

    Args:
        model_name: Key of `model_map` selecting the pretrained model. When it
            is "TT3D", 3di sequences are generated with get_3di_sequences()
            and passed to the model as one-hot encodings.
        pairs_file: Uploaded pairs file; only its `.name` path is used.
        sequence_file: Uploaded FASTA file; only its `.name` path is used.
        progress: Gradio progress tracker used to report loop progress.

    Returns:
        A tuple `(results, file_path)`: a DataFrame with columns
        "Protein 1", "Protein 2", "Interaction", and the path of a
        tab-separated copy of it written to /tmp.

    Raises:
        gr.Error: For invalid input files, proteins missing from the FASTA
            file, an unknown model name, or any other failure (wrapped with
            its message so the UI can display it).
    """
    try:
        if model_name not in model_map:
            raise gr.Error(f"Unknown model '{model_name}'")

        # Validate the uploads before loading any model so bad input fails fast.
        try:
            seqs = SeqIO.to_dict(SeqIO.parse(sequence_file.name, "fasta"))
        except ValueError as e:
            print(e)
            raise gr.Error("Invalid FASTA file - duplicate entry") from e

        pairs = _load_pairs(pairs_file.name)
        pair_list = list(zip(pairs["protein1"], pairs["protein2"]))

        # Unique protein names in first-seen order.
        names = list(dict.fromkeys(name for pair in pair_list for name in pair))
        missing = [str(name) for name in names if name not in seqs]
        if missing:
            # A missing sequence used to raise a bare KeyError mid-run.
            shown = ", ".join(missing[:10])
            extra = f" (and {len(missing) - 10} more)" if len(missing) > 10 else ""
            raise gr.Error(f"Proteins in pairs file not found in FASTA file: {shown}{extra}")

        run_id = uuid4()
        device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

        # The result is discarded; presumably this warms up / downloads the
        # language model before the main loop -- TODO confirm.
        _ = lm_embed("M", use_cuda=(device.type == "cuda"))
        model = get_pretrained(model_map[model_name]).to(device)
        model.eval()  # inference only

        do_foldseek = model_name == "TT3D"
        foldseek_embeddings = {}
        if do_foldseek:
            seqs_to_translate = {name: str(seqs[name].seq) for name in names}
            gr.Info(f"Loading Foldseek embeddings -- this may take some time ({len(seqs_to_translate)} embeddings)...")
            predictions = get_3di_sequences(
                seqs_to_translate,
                model_dir="Rostlab/ProstT5",
                report_fn=gr.Info,
                error_fn=gr.Error,
                device=device,
            )
            foldseek_sequences = predictions_to_dict(predictions)
            foldseek_embeddings = {
                k: one_hot_3di_sequence(s.upper(), fold_vocab)
                for k, s in foldseek_sequences.items()
            }

        print("Starting predictions")
        progress(0, desc="Starting...")
        results = []
        # No gradients are needed for prediction; skipping them saves memory.
        with torch.no_grad():
            for prot1, prot2 in progress.tqdm(pair_list):
                fold1 = foldseek_embeddings[prot1].to(device) if do_foldseek else None
                fold2 = foldseek_embeddings[prot2].to(device) if do_foldseek else None
                # NOTE(review): embeddings are recomputed for every pair, as
                # before; caching per protein would trade memory for speed.
                lm1 = lm_embed(str(seqs[prot1].seq)).to(device)
                lm2 = lm_embed(str(seqs[prot2].seq)).to(device)
                interaction = model.predict(
                    lm1, lm2, embed_foldseek=do_foldseek, f0=fold1, f1=fold2
                ).item()
                results.append([prot1, prot2, interaction])

        results = pd.DataFrame(results, columns=["Protein 1", "Protein 2", "Interaction"])
        file_path = f"/tmp/{run_id}.tsv"
        results.to_csv(file_path, sep="\t", index=False, header=True)
        return results, file_path
    except gr.Error:
        # Already a user-facing error; do not wrap it a second time.
        raise
    except Exception as e:
        print(e)
        raise gr.Error(str(e)) from e
# Gradio interface wiring predict() to a model dropdown and two file uploads,
# with a results table and a downloadable results file as outputs.
demo = gr.Interface(
    fn=predict,
    inputs=[
        # Choices are taken from model_map so the dropdown can never offer a
        # name that predict() cannot resolve (same names, same order).
        gr.Dropdown(label="Model", choices=list(model_map), value="Topsy-Turvy"),
        gr.File(label="Pairs (.csv/.tsv)", file_types=[".csv", ".tsv"]),
        gr.File(label="Sequences (.fasta)", file_types=[".fasta"]),
    ],
    outputs=[
        gr.DataFrame(label="Results", headers=["Protein 1", "Protein 2", "Interaction"]),
        gr.File(label="Download results", type="filepath"),
    ],
    title=title,
    description=description,
    article=article,
    theme=theme,
)
if __name__ == "__main__":
    # Enable request queuing (max_size=20), then start the app.
    queued_demo = demo.queue(max_size=20)
    queued_demo.launch()
|