Spaces:
Sleeping
Sleeping
File size: 6,359 Bytes
84bfd88 005026b 84bfd88 |
|
import sys
from typing import List
from tqdm import tqdm
import pandas as pd
import numpy as np
import threading
from concurrent.futures import ProcessPoolExecutor, as_completed, TimeoutError
import time
import requests
import joblib
# from bio_embeddings.embed import SeqVecEmbedder, ProtTransBertBFDEmbedder, ProtTransT5XLU50Embedder
from Bio import SeqIO
import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import torch
from typing import *
from rdkit import RDLogger
RDLogger.DisableLog("rdApp.*")
from xgboost import XGBClassifier, DMatrix
from model.barlow_twins import BarlowTwins
# sys.path.append("../utils/")
from utils.sequence import uniprot2sequence, encode_sequences
class DTIModel:
    """Drug-target interaction predictor.

    Wraps a ``BarlowTwins`` model (used through ``zero_shot`` to build a joint
    drug/target embedding) and an ``XGBClassifier`` that scores that
    embedding. Drugs are given as SMILES strings and encoded as Morgan
    fingerprints; targets are given as amino-acid sequences and encoded with
    ``encode_sequences``. Both encodings are memoised per instance.
    """

    def __init__(self, bt_model_path: str, gbm_model_path: str, encoder: str = "prost_t5"):
        """Load both models from disk and initialise the empty caches.

        Args:
            bt_model_path: Path passed to ``BarlowTwins.load_model``.
            gbm_model_path: Path passed to ``XGBClassifier.load_model``.
            encoder: Encoder name forwarded to ``encode_sequences``.
        """
        self.bt_model = BarlowTwins()
        self.bt_model.load_model(bt_model_path)
        self.gbm_model = XGBClassifier()
        self.gbm_model.load_model(gbm_model_path)
        self.encoder = encoder
        # Per-instance memoisation: SMILES -> fingerprint, sequence -> embedding.
        self.smiles_cache = {}
        self.sequence_cache = {}

    def _encode_smiles(self, smiles: str, radius: int = 2, bits: int = 1024, features: bool = False):
        """Return the Morgan fingerprint of ``smiles`` as a NumPy array.

        Args:
            smiles: SMILES string, or ``None``.
            radius: Morgan fingerprint radius.
            bits: Length of the bit vector.
            features: Forwarded as ``useFeatures`` to RDKit.

        Returns:
            The fingerprint array, or ``None`` when ``smiles`` is ``None`` or
            encoding fails (the failure is printed, not raised).
        """
        if smiles is None:
            return None
        # NOTE: the cache is keyed by the SMILES string only, so a cached entry
        # is returned even if radius/bits/features differ from the first call.
        if smiles in self.smiles_cache:
            return self.smiles_cache[smiles]
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                # RDKit signals an unparsable SMILES by returning None.
                raise ValueError(f"RDKit could not parse SMILES: {smiles}")
            morgan = AllChem.GetMorganFingerprintAsBitVect(
                mol,
                radius=radius,
                nBits=bits,
                useFeatures=features,
            )
            morgan = np.array(morgan)
        except Exception as e:
            # Best-effort by design: report and let the caller see None.
            print(f"Failed to encode SMILES: {smiles}")
            print(e)
            return None
        self.smiles_cache[smiles] = morgan
        return morgan

    def _encode_smiles_mult(self, smiles: List[str], radius: int = 2, bits: int = 1024, features: bool = False):
        """Encode several SMILES strings and stack the results with ``np.array``.

        Entries that fail to encode contribute ``None`` to the stacked input.
        """
        morgan = [self._encode_smiles(s, radius, bits, features) for s in smiles]
        return np.array(morgan)

    def _encode_sequence(self, sequence: str):
        """Return the embedding of a single protein sequence.

        Args:
            sequence: Amino-acid sequence, or ``None``.

        Returns:
            Whatever ``encode_sequences([sequence], encoder=self.encoder)``
            returns, or ``None`` when ``sequence`` is ``None`` or encoding
            fails (the failure is printed, not raised).
        """
        if sequence is None:
            return None
        if sequence in self.sequence_cache:
            return self.sequence_cache[sequence]
        # Release cached CUDA memory only when the encoder is about to run;
        # cache hits do not need it.
        torch.cuda.empty_cache()
        try:
            encoded_sequence = encode_sequences([sequence], encoder=self.encoder)
        except Exception as e:
            # Best-effort by design: report and let the caller see None.
            print(f"Failed to encode sequence: {sequence}")
            print(e)
            return None
        self.sequence_cache[sequence] = encoded_sequence
        return encoded_sequence

    def _encode_sequence_mult(self, sequences: List[str]):
        """Encode several sequences and stack the results with ``np.array``."""
        seq = [self._encode_sequence(sequence) for sequence in sequences]
        return np.array(seq)

    def __predict_pair(self, drug_emb: np.ndarray, target_emb: np.ndarray, pred_leaf: bool = False):
        """Score drug/target embeddings with the Barlow Twins + XGBoost stack.

        The shorter of the two inputs (by first dimension) is tiled so both
        have the same number of rows before ``zero_shot`` is called.

        Args:
            drug_emb: Drug fingerprint(s).
            target_emb: Target embedding(s).
            pred_leaf: When true, return the booster's leaf indices instead of
                the positive-class probability. Defaults to ``False`` so the
                FASTA helpers can call this with two arguments.

        Returns:
            ``predict_proba(...)[:, 1]`` or, with ``pred_leaf``, the booster's
            ``predict(..., pred_leaf=True)`` output.

        Raises:
            ValueError: If either embedding is ``None`` (encoding failed).
        """
        if drug_emb is None or target_emb is None:
            raise ValueError(
                "Cannot predict: drug or target embedding is None (encoding failed)"
            )
        # NOTE(review): for a single 1-D drug fingerprint, shape[0] is the
        # number of bits rather than a row count, so the target gets tiled that
        # many times. Confirm zero_shot expects this before changing it.
        if drug_emb.shape[0] < target_emb.shape[0]:
            drug_emb = np.tile(drug_emb, (len(target_emb), 1))
        elif len(drug_emb) > len(target_emb):
            target_emb = np.tile(target_emb, (len(drug_emb), 1))
        emb = self.bt_model.zero_shot(drug_emb, target_emb)
        if pred_leaf:
            d_emb = DMatrix(emb)
            return self.gbm_model.get_booster().predict(d_emb, pred_leaf=True)
        return self.gbm_model.predict_proba(emb)[:, 1]

    def predict(self, drug: Union[List[str], str], target: str, pred_leaf: bool = False):
        """Predict the interaction of one or more drugs with one target.

        Args:
            drug: A SMILES string or a list of SMILES strings.
            target: Amino-acid sequence of the target.
            pred_leaf: Forwarded to the pair predictor; see ``__predict_pair``.

        Returns:
            The pair predictor's output for the encoded inputs.
        """
        if isinstance(drug, str):
            drug_emb = self._encode_smiles(drug)
        else:
            drug_emb = self._encode_smiles_mult(drug)
        target_emb = self._encode_sequence(target)
        return self.__predict_pair(drug_emb, target_emb, pred_leaf)

    def get_leaf_weights(self):
        """Return the booster's feature scores with ``importance_type="weight"``."""
        return self.gbm_model.get_booster().get_score(importance_type="weight")

    def _predict_record(self, drug: str, drug_emb: np.ndarray, target) -> dict:
        """Score one FASTA record against an already-encoded drug.

        Args:
            drug: SMILES string, stored verbatim in the result row.
            drug_emb: Fingerprint of ``drug``.
            target: A record yielded by ``SeqIO.parse`` (uses ``seq``, ``id``,
                ``name`` and ``description``).

        Returns:
            One result row as a dict.
        """
        target_emb = self._encode_sequence(str(target.seq))
        pred = self.__predict_pair(drug_emb, target_emb)
        return {
            "drug": drug,
            "target": target.id,
            "name": target.name,
            "description": target.description,
            "prediction": pred[0],
        }

    def _predict_fasta(self, drug: str, fasta_path: str):
        """Score ``drug`` against every record of a FASTA file, sequentially.

        Returns:
            A ``pandas.DataFrame`` with one row per FASTA record.
        """
        drug_emb = self._encode_smiles(drug)
        results = [
            self._predict_record(drug, drug_emb, target)
            for target in tqdm(SeqIO.parse(fasta_path, "fasta"), desc="Predicting targets")
        ]
        return pd.DataFrame(results)

    def predict_fasta(self, drug: str, fasta_path: str, timeout_seconds: int = 120):
        """Score ``drug`` against every record of a FASTA file with a timeout.

        Each record is processed in its own thread, one at a time. A record
        whose thread is still running after ``timeout_seconds`` is reported
        and skipped. The timed-out thread is not cancelled and keeps running
        in the background. A record whose thread raises contributes no row.

        Args:
            drug: SMILES string of the drug.
            fasta_path: Path to the FASTA file of targets.
            timeout_seconds: Per-record time budget in seconds.

        Returns:
            A ``pandas.DataFrame`` with one row per successfully scored record.
        """
        drug_emb = self._encode_smiles(drug)

        def process_target(target, results):
            results.append(self._predict_record(drug, drug_emb, target))

        results = []
        # The file is parsed once up front only to give tqdm a total.
        total_records = sum(1 for _ in SeqIO.parse(fasta_path, "fasta"))
        for target in tqdm(SeqIO.parse(fasta_path, "fasta"), total=total_records, desc="Predicting targets"):
            thread_results = []
            thread = threading.Thread(target=process_target, args=(target, thread_results))
            thread.start()
            thread.join(timeout_seconds)
            if thread.is_alive():
                print(f"Skipping target {target.id} due to timeout")
                continue
            results.extend(thread_results)
        return pd.DataFrame(results)

    def predict_uniprot(self, drug: Union[List[str], str], uniprot_id: str):
        """Resolve ``uniprot_id`` with ``uniprot2sequence`` and call ``predict``."""
        return self.predict(drug, uniprot2sequence(uniprot_id))
|