Spaces:
Sleeping
Sleeping
import time | |
import numpy as np | |
import gradio as gr | |
import pandas as pd | |
import torch | |
from pathlib import Path | |
from Bio import SeqIO | |
from tqdm.auto import tqdm | |
from uuid import uuid4 | |
from tempfile import TemporaryDirectory | |
from torch.utils.data import DataLoader | |
from pathvalidate import sanitize_filename | |
from conplex_dti.featurizer import MorganFeaturizer, ProtBertFeaturizer | |
from publish_model import ConPLex_DTI | |
# Static look-and-feel settings and page copy consumed by the Gradio interface.
theme = "Default"
title = "ConPLex: Predicting Drug-Target Interactions"

# Usage notice displayed with the app. Assembled from one literal per line so
# the leading/trailing newlines of the text are explicit.
description = (
    "\n"
    "If you use this interface to make predictions, please let us know (by emailing samsl@mit.edu)!\n"
    "We want to keep this web version free to use with GPU support, and to do that we need to demonstrate to\n"
    "our funders that it is being used. Thank you!\n"
)
# article = """ | |
# <hr> | |
# <img style="margin-left:auto; margin-right:auto" src="https://raw.githubusercontent.com/samsledje/D-SCRIPT/main/docs/source/img/dscript_architecture.png" alt="D-SCRIPT architecture" width="70%"/> | |
# <hr> | |
# D-SCRIPT is a deep learning method for predicting a physical interaction between two proteins given just their sequences. | |
# It generalizes well to new species and is robust to limitations in training data size. Its design reflects the intuition that for two proteins to physically interact, | |
# a subset of amino acids from each protein should be in contact with the other. The intermediate stages of D-SCRIPT directly implement this intuition, with the penultimate stage | |
# in D-SCRIPT being a rough estimate of the inter-protein contact map of the protein dimer. This structurally-motivated design enhances the interpretability of the results and, | |
# since structure is more conserved evolutionarily than sequence, improves generalizability across species. | |
# <hr> | |
# Computational methods to predict protein-protein interaction (PPI) typically segregate into sequence-based "bottom-up" methods that infer properties from the characteristics of the | |
# individual protein sequences, or global "top-down" methods that infer properties from the pattern of already known PPIs in the species of interest. However, a way to incorporate | |
# top-down insights into sequence-based bottom-up PPI prediction methods has been elusive. Topsy-Turvy builds upon D-SCRIPT by synthesizing both views in a sequence-based, | |
# multi-scale, deep-learning model for PPI prediction. While Topsy-Turvy makes predictions using only sequence data, during the training phase it takes a transfer-learning approach by | |
# incorporating patterns from both global and molecular-level views of protein interaction. In a cross-species context, we show it achieves state-of-the-art performance, offering the | |
# ability to perform genome-scale, interpretable PPI prediction for non-model organisms with no existing experimental PPI data. | |
# """ | |
# Input-format help text for the page. The string is NOT raw, so each "\t"
# below is an actual tab character in the resulting value.
article = (
    "\n"
    "The pairs file should be a tab-separated values file where each row is a candidate pair, "
    "formatted as `[protein ID]\t[molecule ID]\t[protein Sequence]\t[molecule SMILES]`"
    "\n"
)
def predict(run_name, model_name, csv_file, progress=gr.Progress()):
    """Score every protein/molecule pair in an uploaded TSV with a ConPLex model.

    Args:
        run_name: User-supplied label. It is passed through
            ``sanitize_filename`` and embedded in the output file name.
        model_name: Checkpoint name; the model is loaded from
            ``samsl/{model_name}``.
        csv_file: Uploaded file object whose ``.name`` is read as a
            header-less, tab-separated file with the columns protein ID,
            molecule ID, protein sequence, molecule SMILES (in that order).
        progress: Gradio progress tracker. It is deliberately created as the
            default value, which is how Gradio injects progress reporting.

    Returns:
        Path of a TSV file with the columns ``Protein``, ``Small Molecule``
        and ``Predicted Interaction``, one row per input pair.

    Raises:
        gr.Error: If no file was uploaded, the file has no rows, or any step
            of loading/featurizing/predicting fails. The message is shown to
            the user by Gradio.
    """
    try:
        if csv_file is None:
            raise gr.Error("No pairs file was uploaded; please provide a .tsv file.")

        # The temporary directory only holds featurizer caches; it is removed
        # when the block exits.
        with TemporaryDirectory() as tmpdir:
            run_id = uuid4()
            run_name = sanitize_filename(run_name)
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

            gr.Info("Loading data...")
            query_df = pd.read_csv(
                csv_file.name,
                sep="\t",
                names=["proteinID", "moleculeID", "proteinSequence", "moleculeSmiles"],
            )
            if query_df.empty:
                # Without this guard np.concatenate([]) below fails with an
                # error message that means nothing to the user.
                raise gr.Error("The pairs file contains no rows.")

            # Loading model
            gr.Info("Loading model -- this may take a while, as the ProtBert language model must be downloaded...")
            target_featurizer = ProtBertFeaturizer(
                save_dir=tmpdir, per_tok=False
            ).to(device)
            drug_featurizer = MorganFeaturizer(save_dir=tmpdir).to(device)

            gr.Info("Preloading embeddings...")
            drug_featurizer.preload(query_df["moleculeSmiles"].unique())
            target_featurizer.preload(query_df["proteinSequence"].unique())

            model = ConPLex_DTI.from_pretrained(f"samsl/{model_name}")
            model = model.eval()
            model = model.to(device)

            # One (drug features, target features) tuple per input row, in
            # file order so predictions line up with query_df.
            dt_feature_pairs = [
                (drug_featurizer(smiles), target_featurizer(sequence))
                for smiles, sequence in zip(
                    query_df["moleculeSmiles"], query_df["proteinSequence"]
                )
            ]
            dloader = DataLoader(dt_feature_pairs, batch_size=1024, shuffle=False)

            progress(0, desc="Starting...")
            preds = []
            # Inference only: skip autograd bookkeeping.
            with torch.no_grad():
                for drug_batch, target_batch in progress.tqdm(dloader):
                    scores = model(drug_batch, target_batch)
                    # reshape(-1) keeps a one-element batch concatenable in
                    # case the model returns a 0-d tensor for it
                    # (NOTE(review): depends on the model's output shape).
                    preds.append(scores.detach().cpu().numpy().reshape(-1))
            preds = np.concatenate(preds)

            # Column labels must match the IDs they hold: protein IDs under
            # "Protein", molecule IDs under "Small Molecule".
            results = pd.DataFrame(
                {
                    "Protein": query_df["proteinID"],
                    "Small Molecule": query_df["moleculeID"],
                    "Predicted Interaction": preds,
                }
            )

            # Written outside tmpdir so the file still exists after the
            # temporary directory is cleaned up and Gradio serves it.
            file_path = f"/tmp/conplex_{run_name}_{run_id}.tsv"
            results.to_csv(file_path, sep="\t", index=False, header=True)
            return file_path
    except gr.Error:
        # Already a user-facing error; let Gradio display it unchanged.
        raise
    except Exception as e:
        # Keep the message in the server log, then surface it in the UI.
        # gr.Error only has an effect when it is raised.
        print(e)
        raise gr.Error(str(e)) from e
# Web UI definition: a run label, a model selector and an uploaded pairs file
# are passed to `predict`; the path it returns is offered as a download.
demo = gr.Interface(
    title=title,
    description=description,
    article=article,
    theme=theme,
    fn=predict,
    inputs=[
        gr.Textbox(
            type="text",
            label="Run Name",
            placeholder="predictions",
        ),
        gr.Dropdown(
            choices=["ConPLex_V1_BindingDB"],
            value="ConPLex_V1_BindingDB",
            label="Model",
        ),
        gr.File(
            file_types=[".tsv"],
            label="Pairs (.tsv)",
        ),
    ],
    outputs=[
        # The only output is a downloadable file; an inline gr.DataFrame
        # preview of the results is currently disabled.
        gr.File(type="filepath", label="Download results"),
    ],
)

if __name__ == "__main__":
    # Turn on request queuing (max_size=20) and start the app.
    demo.queue(max_size=20).launch()