Spaces:

mschuh
/

BarlowDTI

Running on Zero

App Files Files Community

mschuh commited on Aug 13

Commit

84bfd88

•

1 Parent(s): 777321b

Upload 37 files

Browse files

Files changed (38) hide show

.gitattributes +10 -0
README.md +5 -8
app.py +125 -0
model/__init__.py +0 -0
model/barlow_twins.py +525 -0
model/base_model.py +75 -0
model/model.py +169 -0
model/preprocessor.py +180 -0
model/stash/14062024_0910/history.json +0 -0
model/stash/14062024_0910/log.txt +41 -0
model/stash/14062024_0910/params.pkl +3 -0
model/stash/14062024_0910/weights.pt +3 -0
model/xgb_models/14062024_0910_barlowdti_xxl_model.json +0 -0
model/xgb_models/xgb_model_BIOSNAP_full_data_14062024_0910_bt_optimized_0.json +3 -0
model/xgb_models/xgb_model_BIOSNAP_missing_data_70_14062024_0910_bt_optimized_0.json +0 -0
model/xgb_models/xgb_model_BIOSNAP_missing_data_80_14062024_0910_bt_optimized_0.json +3 -0
model/xgb_models/xgb_model_BIOSNAP_missing_data_90_14062024_0910_bt_optimized_0.json +0 -0
model/xgb_models/xgb_model_BIOSNAP_missing_data_95_14062024_0910_bt_optimized_0.json +0 -0
model/xgb_models/xgb_model_BIOSNAP_unseen_drug_14062024_0910_bt_optimized_0.json +3 -0
model/xgb_models/xgb_model_BIOSNAP_unseen_protein_14062024_0910_bt_optimized_0.json +3 -0
model/xgb_models/xgb_model_BindingDB_14062024_0910_bt_optimized_0.json +3 -0
model/xgb_models/xgb_model_DAVIS_14062024_0910_bt_optimized_0.json +0 -0
model/xgb_models/xgb_model_nature_mach_intel_BindingDB_cluster_14062024_0910_bt_optimized_0.json +0 -0
model/xgb_models/xgb_model_nature_mach_intel_BindingDB_protein_14062024_0910_bt_optimized_0.json +3 -0
model/xgb_models/xgb_model_nature_mach_intel_BindingDB_random_14062024_0910_bt_optimized_0.json +3 -0
model/xgb_models/xgb_model_nature_mach_intel_BindingDB_scaffold_14062024_0910_bt_optimized_0.json +3 -0
model/xgb_models/xgb_model_nature_mach_intel_BioSNAP_cluster_14062024_0910_bt_optimized_0.json +0 -0
model/xgb_models/xgb_model_nature_mach_intel_BioSNAP_protein_14062024_0910_bt_optimized_0.json +0 -0
model/xgb_models/xgb_model_nature_mach_intel_BioSNAP_random_14062024_0910_bt_optimized_0.json +3 -0
model/xgb_models/xgb_model_nature_mach_intel_BioSNAP_scaffold_14062024_0910_bt_optimized_0.json +0 -0
model/xgb_models/xgb_model_nature_mach_intel_Human_protein_14062024_0910_bt_optimized_0.json +3 -0
model/xgb_models/xgb_model_nature_mach_intel_Human_random_14062024_0910_bt_optimized_0.json +0 -0
model/xgb_models/xgb_model_nature_mach_intel_Human_scaffold_14062024_0910_bt_optimized_0.json +0 -0
requirements.txt +25 -0
utils/__init__.py +0 -0
utils/chem.py +64 -0
utils/parallel.py +78 -0
utils/sequence.py +339 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model/xgb_models/xgb_model_BindingDB_14062024_0910_bt_optimized_0.json filter=lfs diff=lfs merge=lfs -text
+model/xgb_models/xgb_model_BIOSNAP_full_data_14062024_0910_bt_optimized_0.json filter=lfs diff=lfs merge=lfs -text
+model/xgb_models/xgb_model_BIOSNAP_missing_data_80_14062024_0910_bt_optimized_0.json filter=lfs diff=lfs merge=lfs -text
+model/xgb_models/xgb_model_BIOSNAP_unseen_drug_14062024_0910_bt_optimized_0.json filter=lfs diff=lfs merge=lfs -text
+model/xgb_models/xgb_model_BIOSNAP_unseen_protein_14062024_0910_bt_optimized_0.json filter=lfs diff=lfs merge=lfs -text
+model/xgb_models/xgb_model_nature_mach_intel_BindingDB_protein_14062024_0910_bt_optimized_0.json filter=lfs diff=lfs merge=lfs -text
+model/xgb_models/xgb_model_nature_mach_intel_BindingDB_random_14062024_0910_bt_optimized_0.json filter=lfs diff=lfs merge=lfs -text
+model/xgb_models/xgb_model_nature_mach_intel_BindingDB_scaffold_14062024_0910_bt_optimized_0.json filter=lfs diff=lfs merge=lfs -text
+model/xgb_models/xgb_model_nature_mach_intel_BioSNAP_random_14062024_0910_bt_optimized_0.json filter=lfs diff=lfs merge=lfs -text
+model/xgb_models/xgb_model_nature_mach_intel_Human_protein_14062024_0910_bt_optimized_0.json filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,13 +1,10 @@
 ---
 title: BarlowDTI
-emoji: 🔥
-colorFrom: purple
-colorTo: red
 sdk: gradio
 sdk_version: 4.41.0
 app_file: app.py
-pinned: false
-license: cc-by-4.0
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: BarlowDTI
+emoji: 💊 ↔️ 🎯
+colorFrom: blue
+colorTo: pink
 sdk: gradio
 sdk_version: 4.41.0
 app_file: app.py
+pinned: true
+---

app.py ADDED Viewed

	@@ -0,0 +1,125 @@

+import gradio as gr
+import plotly.graph_objects as go
+import numpy as np
+import pandas as pd
+from model.model import DTIModel
+dt_str = "14062024_0910"
+def make_spider_plot(predictions, model_names, smiles_list):
+    fig = go.Figure()
+    for i, (prediction, smiles) in enumerate(zip(predictions, smiles_list)):
+        fig.add_trace(go.Scatterpolar(
+            r=prediction,
+            theta=model_names,
+            fill='toself',
+            name=smiles
+        ))
+    fig.update_layout(
+        polar=dict(
+            radialaxis=dict(
+                visible=True,
+                range=[0, 1]
+            )),
+        showlegend=True
+    )
+    return fig
+def predict_and_plot(amino_acid_sequence, smiles_input, datasets):
+    model_ensemble = {}
+    gbm_model_paths = {
+        "BindingDB": f"model/xgb_models/xgb_model_BindingDB_{dt_str}_bt_optimized_0.json",
+        "BioSNAP": f"model/xgb_models/xgb_model_BIOSNAP_full_data_{dt_str}_bt_optimized_0.json",
+        "DAVIS": f"model/xgb_models/xgb_model_DAVIS_{dt_str}_bt_optimized_0.json",
+        "BarlowDTI XXL": f"model/xgb_models/{dt_str}_barlowdti_xxl_model.json",
+    }
+    for model in datasets:
+        print(f"Loading model {model}")
+        model_ensemble[model] = DTIModel(
+            bt_model_path=f"model/stash/{dt_str}",
+            gbm_model_path=gbm_model_paths[model],
+        )
+    smiles_list = smiles_input.strip().split('\n')
+    predictions = []
+    for model in model_ensemble.values():
+        model_predictions = model.predict(smiles_list, amino_acid_sequence)
+        predictions.append(model_predictions)
+    predictions = np.array(predictions).transpose().tolist()
+    df = pd.DataFrame(predictions, index=smiles_list, columns=datasets).reset_index()
+    df.columns = ["SMILES"] + datasets
+    fig = make_spider_plot(predictions, datasets, smiles_list)
+    return fig, df
+dataset_names = [
+    "BarlowDTI XXL",
+    "BindingDB",
+    "BioSNAP",
+    "DAVIS",
+]
+title = "Predict Drug-Target Interactions with <span style='font-variant:small-caps;'>BarlowDTI</span>"
+description = """
+Input Amino Acid Sequence and SMILES to get interaction predictions visualized as a spider graph and in a table.
+The values ca be interpreted as the probability of interaction between the drug and target (0 = no interaction, 1 = interaction).
+__Note: Inference may take a loger time, you can upgrade to a paid GPU-enabled plan for faster inference.__
+"""
+# Long-form text passed to the interface as `article`, including the
+# citation for the accompanying paper.
+# NOTE(review): the text below says the ensemble has three models, while the
+# checkbox group in this file offers four choices (it also lists
+# "BarlowDTI XXL") - confirm which wording is intended.
+article = """
+This interface enables the use of <span style='font-variant:small-caps;'>BarlowDTI</span> to predict drug-target interactions.
+The model ensemble consists of three models trained on different datasets: BindingDB, BIOSNAP, and DAVIS.
+If you use this interface in your research, please cite our paper:
+```
+@misc{schuh2024barlowtwinsdeepneural,
+      title={Barlow Twins Deep Neural Network for Advanced 1D Drug-Target Interaction Prediction},
+      author={Maximilian G. Schuh and Davide Boldini and Stephan A. Sieber},
+      year={2024},
+      eprint={2408.00040},
+      archivePrefix={arXiv},
+      primaryClass={q-bio.BM},
+      url={https://arxiv.org/abs/2408.00040},
+}
+```
+"""
+# Visual theme: violet primary hue, IBM Plex Sans with generic fallbacks.
+theme = gr.themes.Base(
+    primary_hue="violet",
+    font=[gr.themes.GoogleFont('IBM Plex Sans'), 'ui-sans-serif', 'system-ui', 'sans-serif'],
+)
+# Wire predict_and_plot to the UI: the three inputs map positionally onto its
+# parameters (sequence, SMILES text, selected models) and the two outputs
+# onto its returned (figure, dataframe) pair.
+iface = gr.Interface(
+    fn=predict_and_plot,
+    inputs=[
+        gr.Textbox(label="Protein Sequence", info="Just one sequence is allowed. Remove FASTA syntax (e.g. >ABC)."),
+        gr.Textbox(label="Molecule SMILES", info="One per line, multiple allowed."),
+        gr.CheckboxGroup(choices=dataset_names, label="Select Models for Prediction", value="BarlowDTI XXL")
+    ],
+    outputs=[
+        gr.Plot(label="Predictions Visualization"),
+        gr.DataFrame(label="Predictions DataFrame"),
+        # gr.DownloadButton(label="Download Predictions")
+    ],
+    title=title,
+    description=description,
+    article=article,
+    theme=theme
+)
+# Start the app as a module-level side effect when this file is executed.
+iface.launch()

model/__init__.py ADDED Viewed

File without changes

model/barlow_twins.py ADDED Viewed

	@@ -0,0 +1,525 @@

+import torch
+torch.manual_seed(42)
+torch.backends.cudnn.deterministic = True
+from torch import nn
+import numpy as np
+from typing import *
+from datetime import datetime
+import os
+import pickle
+import inspect
+from tqdm.auto import trange
+from model.base_model import BaseModel
+class BarlowTwins(BaseModel):
+    def __init__(
+            self,
+            n_bits: int = 1024,
+            aa_emb_size: int = 1024,
+            enc_n_neurons: int = 512,
+            enc_n_layers: int = 2,
+            proj_n_neurons: int = 2048,
+            proj_n_layers: int = 2,
+            embedding_dim: int = 512,
+            act_function: str = "relu",
+            loss_weight: float = 0.005,
+            batch_size: int = 512,
+            optimizer: str = "adamw",
+            momentum: float = 0.9,
+            learning_rate: float = 0.0001,
+            betas: tuple = (0.9, 0.999),
+            weight_decay: float = 1e-3,
+            step_size: int = 10,
+            gamma: float = 0.1,
+            verbose: bool = True,
+    ):
+        """Store the hyperparameters and build networks, optimizer and scheduler.
+        ``act_function`` and ``optimizer`` are string keys that are resolved
+        to classes through ``self.activation_dict`` / ``self.optimizer_dict``;
+        an unknown key raises ``KeyError``. The remaining arguments are
+        copied into ``self.param_dict`` (see the per-key comments below).
+        Construction order matters: the three MLPs are created first, then
+        the optimizer/scheduler that reference their parameters.
+        """
+        super().__init__()
+        # placeholders, filled by construct_model() / construct_scheduler()
+        self.enc_aa = None
+        self.enc_mol = None
+        self.proj = None
+        self.scheduler = None
+        self.optimizer = None
+        # store input in dict
+        self.param_dict = {
+            "act_function": self.activation_dict[
+                act_function
+            ],  # which activation function to use among dict options
+            "loss_weight": loss_weight,  # off-diagonal cross correlation loss weight
+            "batch_size": batch_size,  # samples per gradient step
+            "learning_rate": learning_rate,  # update step magnitude when training
+            "betas": betas,  # momentum hyperparameter for adam-like optimizers
+            "step_size": step_size,  # decay period for the learning rate
+            "gamma": gamma,  # decay coefficient for the learning rate
+            "optimizer": self.optimizer_dict[
+                optimizer
+            ],  # which optimizer to use among dict options
+            "momentum": momentum,  # momentum hyperparameter for SGD
+            "enc_n_neurons": enc_n_neurons,  # neurons to use for the mlp encoder
+            "enc_n_layers": enc_n_layers,  # number of hidden layers in the mlp encoder
+            "proj_n_neurons": proj_n_neurons,  # neurons to use for the mlp projector
+            "proj_n_layers": proj_n_layers,  # number of hidden layers in the mlp projector
+            "embedding_dim": embedding_dim,  # latent space dim for downstream tasks
+            "weight_decay": weight_decay,  # l2 regularization for linear layers
+            "verbose": verbose,  # whether to print feedback
+            "radius": "Not defined yet",  # fingerprint radius
+            "n_bits": n_bits,  # fingerprint bit size
+            "aa_emb_size": aa_emb_size,  # aa embedding size
+        }
+        # create history dictionary
+        # (replaces the one set up by the base class, which also carries a
+        # "learning_rate" key)
+        self.history = {
+            "train_loss": [],
+            "on_diag_loss": [],
+            "off_diag_loss": [],
+            "validation_loss": [],
+        }
+        # run NN architecture construction method
+        self.construct_model()
+        # run scheduler construction method
+        self.construct_scheduler()
+        # print if necessary
+        if self.param_dict["verbose"] is True:
+            self.print_config()
+    @staticmethod
+    def __validate_inputs(locals_dict) -> None:
+        # get signature types from __init__
+        init_signature = inspect.signature(BarlowTwins.__init__)
+        # loop over all chosen arguments
+        for param_name, param_value in locals_dict.items():
+            # skip self
+            if param_name != "self":
+                # check that parameter exists
+                if param_name in init_signature.parameters:
+                    # check that param is correct type
+                    expected_type = init_signature.parameters[param_name].annotation
+                    assert isinstance(
+                        param_value, expected_type
+                    ), f"[BT]: Type mismatch for parameter '{param_name}'"
+                else:
+                    raise ValueError(f"[BT]: Unexpected parameter '{param_name}'")
+    def construct_mlp(self, input_units, layer_units, n_layers, output_units) -> nn.Sequential:
+        # make empty list to fill
+        mlp_list = []
+        # make lists defining layer sizes (input + n_neurons*n_layers + embedding_dim)
+        units = [input_units] + [layer_units] * n_layers
+        # add layer stack (linear -> batchnorm -> dropout -> activation)
+        for i in range(len(units) - 1):
+            mlp_list.append(nn.Linear(units[i], units[i + 1]))
+            mlp_list.append(nn.BatchNorm1d(units[i + 1]))
+            mlp_list.append(self.param_dict["act_function"]())
+        # add final linear layer
+        mlp_list.append(nn.Linear(units[-1], output_units))
+        return nn.Sequential(*mlp_list)
+    def construct_model(self) -> None:
+        # create fingerprint transformer
+        self.enc_mol = self.construct_mlp(
+            self.param_dict["n_bits"],
+            self.param_dict["enc_n_neurons"],
+            self.param_dict["enc_n_layers"],
+            self.param_dict["embedding_dim"],
+        )
+        # create aa transformer
+        self.enc_aa = self.construct_mlp(
+            self.param_dict["aa_emb_size"],
+            self.param_dict["enc_n_neurons"],
+            self.param_dict["enc_n_layers"],
+            self.param_dict["embedding_dim"],
+        )
+        # create mlp projector
+        self.proj = self.construct_mlp(
+            self.param_dict["embedding_dim"],
+            self.param_dict["proj_n_neurons"],
+            self.param_dict["proj_n_layers"],
+            self.param_dict["proj_n_neurons"],
+        )
+        # print if necessary
+        if self.param_dict["verbose"] is True:
+            print("[BT]: Model constructed successfully")
+    def construct_scheduler(self):
+        # make optimizer
+        self.optimizer = self.param_dict["optimizer"](
+            list(self.enc_mol.parameters())
+            + list(self.enc_aa.parameters())
+            + list(self.proj.parameters()),
+            lr=self.param_dict["learning_rate"],
+            betas=self.param_dict["betas"],
+            # momentum=self.param_dict["momentum"],
+            weight_decay=self.param_dict["weight_decay"],
+        )
+        # wrap optimizer in scheduler
+        """
+         self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
+            self.optimizer,
+            T_max=self.param_dict["step_size"], # T_0
+            # eta_min=1e-7,
+            verbose=True
+        )
+        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
+            self.optimizer,
+            patience=self.param_dict["step_size"],
+            verbose=True
+        )
+        """
+        self.scheduler = torch.optim.lr_scheduler.StepLR(
+            self.optimizer,
+            step_size=self.param_dict["step_size"],
+            gamma=self.param_dict["gamma"],
+        )
+        # print if necessary
+        if self.param_dict["verbose"] is True:
+            print("[BT]: Optimizer constructed successfully")
+    def switch_mode(self, is_training: bool):
+        if is_training:
+            self.enc_mol.train()
+            self.enc_aa.train()
+            self.proj.train()
+        else:
+            self.enc_mol.eval()
+            self.enc_aa.eval()
+            self.proj.eval()
+    @staticmethod
+    def normalize_projection(tensor: torch.tensor) -> torch.tensor:
+        means = torch.mean(tensor, axis=0)
+        std = torch.std(tensor, axis=0)
+        centered = torch.add(tensor, -means)
+        scaled = torch.div(centered, std)
+        return scaled
+    def compute_loss(
+        self,
+        mol_embedding: torch.tensor,
+        aa_embedding: torch.tensor,
+    ) -> torch.tensor:
+        """Compute the two loss terms from the cross-correlation matrix of both inputs.
+        Returns a pair ``(on_diag, off_diag)`` rather than a single tensor:
+        the summed squared deviation of the diagonal from 1, and the summed
+        squared off-diagonal entries scaled by ``loss_weight``.
+        """
+        # empirical cross-correlation matrix
+        mol_embedding = self.normalize_projection(mol_embedding).T
+        aa_embedding = self.normalize_projection(aa_embedding)
+        c = mol_embedding @ aa_embedding
+        # normalize by number of samples
+        # NOTE(review): this divides by the configured batch_size, not by the
+        # number of rows actually received; a smaller batch would be scaled
+        # differently - confirm the loaders always yield full batches.
+        c.div_(self.param_dict["batch_size"])
+        # compute elements on diagonal
+        # (in-place on a view of c; only diagonal entries are modified, and
+        # those are excluded from the off-diagonal term below)
+        on_diag = torch.diagonal(c).add_(-1).pow_(2).sum()
+        # compute elements off diagonal
+        n, m = c.shape
+        # dropping the last element and viewing the rest as (n - 1, n + 1)
+        # puts every diagonal entry in column 0; slicing that column away
+        # leaves the off-diagonal entries (this relies on c being square)
+        off_diag = c.flatten()[:-1].view(n - 1, n + 1)[:, 1:].flatten()
+        off_diag = off_diag.pow_(2).sum() * self.param_dict["loss_weight"]
+        return on_diag, off_diag
+    def forward(
+        self, mol_data: torch.tensor, aa_data: torch.tensor, is_training: bool = True
+    ) -> torch.tensor:
+        # switch according to input
+        self.switch_mode(is_training)
+        # get embeddings
+        mol_embeddings = self.enc_mol(mol_data)
+        aa_embeddings = self.enc_aa(aa_data)
+        # get projections
+        mol_proj = self.proj(mol_embeddings)
+        aa_proj = self.proj(aa_embeddings)
+        # compute loss
+        on_diag, off_diag = self.compute_loss(mol_proj, aa_proj)
+        return on_diag, off_diag
+    def train(
+        self,
+        train_data: torch.utils.data.DataLoader,
+        val_data: torch.utils.data.DataLoader = None,
+        num_epochs: int = 20,
+        patience: int = None,
+    ):
+        if self.param_dict["verbose"] is True:
+            print("[BT]: Training started")
+        if patience is None:
+            patience = 2 * self.param_dict["step_size"]
+        pbar = trange(num_epochs, desc="[BT]: Epochs", leave=False, colour="blue")
+        for epoch in pbar:
+            # initialize loss containers
+            train_loss = 0.0
+            on_diag_loss = 0.0
+            off_diag_loss = 0.0
+            val_loss = 0.0
+            # loop over training set
+            for _, (mol_data, aa_data) in enumerate(train_data):
+                # reset grad
+                self.optimizer.zero_grad()
+                # compute train loss for batch
+                on_diag, off_diag = self.forward(mol_data, aa_data, is_training=True)
+                t_loss = on_diag + off_diag
+                # backpropagation and optimization
+                t_loss.backward()
+                """
+                nn.utils.clip_grad_norm_(
+                    list(self.enc_mol.parameters()) +
+                    list(self.enc_aa.parameters()) +
+                    list(self.proj.parameters()),
+                    1
+                )
+                """
+                self.optimizer.step()
+                # add i-th loss to training container
+                train_loss += t_loss.item()
+                on_diag_loss += on_diag.item()
+                off_diag_loss += off_diag.item()
+            # add mean epoch loss for train data to history dictionary
+            self.history["train_loss"].append(train_loss / len(train_data))
+            self.history["on_diag_loss"].append(on_diag_loss / len(train_data))
+            self.history["off_diag_loss"].append(off_diag_loss / len(train_data))
+            # define msg to be printed
+            msg = (
+                f"[BT]: Epoch [{epoch + 1}/{num_epochs}], "
+                f"Train loss: {train_loss / len(train_data):.3f}, "
+                f"On diagonal: {on_diag_loss / len(train_data):.3f}, "
+                f"Off diagonal: {off_diag_loss / len(train_data):.3f} "
+            )
+            # loop over validation set (if present)
+            if val_data is not None:
+                for _, (mol_data, aa_data) in enumerate(val_data):
+                    # compute val loss for batch
+                    on_diag_v_loss, off_diag_v_loss = self.forward(
+                        mol_data, aa_data, is_training=False
+                    )
+                    # add i-th loss to val container
+                    v_loss = on_diag_v_loss + off_diag_v_loss
+                    val_loss += v_loss.item()
+                # add mean epoc loss for val data to history dictionary
+                self.history["validation_loss"].append(val_loss / len(val_data))
+                # add val loss to msg
+                msg += f", Val loss: {val_loss / len(val_data):.3f}"
+                # early stopping
+                if self.early_stopping(patience=patience):
+                    break
+                pbar.set_postfix(
+                    {
+                        "train loss": train_loss / len(train_data),
+                        "val loss": val_loss / len(val_data),
+                    }
+                )
+            else:
+                pbar.set_postfix({"train loss": train_loss / len(train_data)})
+            # update scheduler
+            self.scheduler.step()  # val_loss / len(val_data)
+            if self.param_dict["verbose"] is True:
+                print(msg)
+        if self.param_dict["verbose"] is True:
+            print("[BT]: Training finished")
+    def encode(
+        self, vector: np.ndarray, mode: str = "embedding", normalize: bool = True, encoder: str = "mol"
+    ) -> np.ndarray:
+        """
+        Encodes a given vector using the Barlow Twins model.
+        Args:
+        - vector (np.ndarray): the input vector to encode
+        - mode (str): the mode to use for encoding, either "embedding" or "projection"
+        - normalize (bool): whether to L2 normalize the output vector
+        Returns:
+        - np.ndarray: the encoded vector
+        """
+        # set mol encoder to eval mode
+        self.switch_mode(is_training=False)
+        # convert from numpy to tensor
+        if type(vector) is not torch.Tensor:
+            vector = torch.from_numpy(vector)
+        # if oly one molecule pair is passed, add a batch dimension
+        if len(vector.shape) == 1:
+            vector = vector.unsqueeze(0)
+        # get representation
+        if encoder == "mol":
+            embedding = self.enc_mol(vector)
+            if mode == "projection":
+                embedding = self.proj(embedding)
+        elif encoder == "aa":
+            embedding = self.enc_aa(vector)
+            if mode == "projection":
+                embedding = self.proj(embedding)
+        else:
+            raise ValueError("[BT]: Encoder not recognized")
+        # L2 normalize (optional)
+        if normalize:
+            embedding = torch.nn.functional.normalize(embedding)
+        # convert back to numpy
+        return embedding.cpu().detach().numpy()
+    def zero_shot(
+        self, mol_vector: np.ndarray, aa_vector: np.ndarray, l2_norm: bool = True, device: str = "cpu"
+    ) -> np.ndarray:
+        # disable training
+        self.switch_mode(is_training=False)
+        # cast aa vectors (pos and neg) to correct size, force single precision
+        # to both
+        mol_vector = np.array(mol_vector, dtype=np.float32)
+        aa_vector = np.array(aa_vector, dtype=np.float32)
+        # convert to tensors
+        mol_vector = torch.from_numpy(mol_vector).to(device)
+        aa_vector = torch.from_numpy(aa_vector).to(device)
+        # get embeddings
+        mol_embedding = self.encode(mol_vector, normalize=l2_norm, encoder="mol")
+        aa_embedding = self.encode(aa_vector, normalize=l2_norm, encoder="aa")
+        # concat mol and aa embeddings
+        concat = np.concatenate((mol_embedding, aa_embedding), axis=1)
+        return concat
+    def zero_shot_explain(
+        self, mol_vector, aa_vector, l2_norm: bool = True, device: str = "cpu"
+    ):
+        self.switch_mode(is_training=False)
+        mol_embedding = self.encode(mol_vector, normalize=l2_norm, encoder="mol")
+        aa_embedding = self.encode(aa_vector, normalize=l2_norm, encoder="aa")
+        return torch.cat((mol_embedding, aa_embedding), dim=1)
+    def consume_preprocessor(self, preprocessor) -> None:
+        # save attributes related to fingerprint generation from
+        # preprocessor object
+        self.param_dict["radius"] = preprocessor.radius
+        self.param_dict["n_bits"] = preprocessor.n_bits
+    def save_model(self, path: str) -> None:
+        # get current date and time for the filename
+        now = datetime.now()
+        formatted_date = now.strftime("%d%m%Y")
+        formatted_time = now.strftime("%H%M")
+        folder_name = f"{formatted_date}_{formatted_time}"
+        # make full path string and folder
+        folder_path = path + "/" + folder_name
+        os.makedirs(folder_path)
+        # make paths for weights, config and history
+        weight_path = folder_path + "/weights.pt"
+        param_path = folder_path + "/params.pkl"
+        history_path = folder_path + "/history.json"
+        # save each Sequential state dict in one object to the path
+        torch.save(
+            {
+                "enc_mol": self.enc_mol.state_dict(),
+                "enc_aa": self.enc_aa.state_dict(),
+                "proj": self.proj.state_dict(),
+            },
+            weight_path,
+        )
+        # dump params in pkl
+        with open(param_path, "wb") as file:
+            pickle.dump(self.param_dict, file)
+        # dump history in json
+        with open(history_path, "wb") as file:
+            pickle.dump(self.history, file)
+        # print if verbose is True
+        if self.param_dict["verbose"] is True:
+            print(f"[BT]: Model saved at {folder_path}")
+    def load_model(self, path: str) -> None:
+        """Restore parameters, history and weights from a folder written by ``save_model``.
+        The networks are rebuilt from the stored parameters before the
+        weights are loaded, and the optimizer/scheduler are recreated
+        afterwards. The instance's current ``verbose`` setting is kept.
+        SECURITY: the files are read with ``torch.load`` and ``pickle.load``,
+        which can execute arbitrary code - only load folders you trust.
+        """
+        # make weights, config and history paths
+        weights_path = path + "/weights.pt"
+        param_path = path + "/params.pkl"
+        # (pickle content despite the .json extension, see save_model)
+        history_path = path + "/history.json"
+        # load weights, history and params
+        checkpoint = torch.load(weights_path, map_location=self.device)
+        with open(param_path, "rb") as file:
+            param_dict = pickle.load(file)
+        with open(history_path, "rb") as file:
+            history = pickle.load(file)
+        # construct model again, overriding old verbose key with new instance
+        verbose = self.param_dict["verbose"]
+        self.param_dict = param_dict
+        self.param_dict["verbose"] = verbose
+        self.history = history
+        self.construct_model()
+        # NOTE(review): the rebuilt modules are not moved to self.device in
+        # this method - confirm callers use move_to_device when needed.
+        # set weights in Sequential models
+        self.enc_mol.load_state_dict(checkpoint["enc_mol"])
+        self.enc_aa.load_state_dict(checkpoint["enc_aa"])
+        self.proj.load_state_dict(checkpoint["proj"])
+        # recreate scheduler and optimizer in order to add new weights
+        # to graph
+        self.construct_scheduler()
+        # print if verbose is True
+        if self.param_dict["verbose"] is True:
+            print(f"[BT]: Model loaded from {path}")
+            print("[BT]: Loaded parameters:")
+            print(self.param_dict)
+    def move_to_device(self, device) -> None:
+        # move each Sequential model to device
+        self.enc_mol.to(device)
+        self.enc_aa.to(device)
+        self.proj.to(device)
+        self.device = device

model/base_model.py ADDED Viewed

	@@ -0,0 +1,75 @@

+from typing import Tuple, Any, Union
+import torch
+from torch import nn
+import numpy as np
+class BaseModel(nn.Module):
+    def __init__(self):
+        super(BaseModel, self).__init__()
+        # set device (gpu 0 or 1 if available or cpu)
+        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        # make empty param dict
+        self.param_dict = {}
+        # make optimizer options dict
+        self.optimizer_dict = {
+            "adam": torch.optim.Adam,
+            "nadam": torch.optim.NAdam,
+            "adamax": torch.optim.Adamax,
+            "adamw": torch.optim.AdamW,
+            "sgd": torch.optim.SGD,
+        }
+        # make loss options dict
+        self.loss_dict = {
+            "mse": nn.MSELoss,
+            "l1": nn.L1Loss,
+            "smoothl1": nn.SmoothL1Loss,
+            "huber": nn.HuberLoss,
+            "cel": nn.CrossEntropyLoss,  # Suitable for classification tasks
+            "bcel": nn.BCELoss,  # Suitable for classification tasks
+        }
+        # make activation function options dictionary
+        self.activation_dict = {
+            "relu": nn.ReLU,
+            "swish": nn.Hardswish,
+            "leaky_relu": nn.LeakyReLU,
+            "elu": nn.ELU,
+            "selu": nn.SELU,
+        }
+        # make tokenizer placeholder
+        self.tokenizer = None
+        # create history dictionary
+        self.history = {
+            "train_loss": [],
+            "on_diag_loss": [],
+            "off_diag_loss": [],
+            "validation_loss": [],
+            "learning_rate": [],
+        }
+        # create early stopping params
+        self.count = 0
+    def print_config(self) -> None:
+        print("[BT]: Current parameter config:")
+        print(self.param_dict)
+    def early_stopping(self, patience: int) -> bool:
+        # count every epoch that's worse than the best for patience times
+        if len(self.history["validation_loss"]) > patience:
+            best_loss = min(self.history["validation_loss"])
+            if self.history["validation_loss"][-1] > best_loss:
+                self.count += 1
+            else:
+                self.count = 0
+            if self.count >= patience:
+                if self.param_dict["verbose"] is True:
+                    print("[BT]: Early stopping")
+                return True
+        return False

model/model.py ADDED Viewed

	@@ -0,0 +1,169 @@

+import sys
+from typing import List
+from tqdm import tqdm
+import pandas as pd
+import numpy as np
+import threading
+from concurrent.futures import ProcessPoolExecutor, as_completed, TimeoutError
+import time
+import requests
+import joblib
+# from bio_embeddings.embed import SeqVecEmbedder, ProtTransBertBFDEmbedder, ProtTransT5XLU50Embedder
+from Bio import SeqIO
+import rdkit
+from rdkit import Chem, DataStructs
+from rdkit.Chem import AllChem
+import torch
+from typing import *
+from rdkit import RDLogger
+RDLogger.DisableLog("rdApp.*")
+from xgboost import XGBClassifier, DMatrix
+from model.barlow_twins import BarlowTwins
+# sys.path.append("../utils/")
+from utils.sequence import uniprot2sequence, encode_sequences
+class DTIModel:
+    def __init__(self, bt_model_path: str, gbm_model_path: str, encoder: str = "prost_t5"):
+        self.bt_model = BarlowTwins()
+        self.bt_model.load_model(bt_model_path)
+        self.gbm_model = XGBClassifier()
+        self.gbm_model.load_model(gbm_model_path)
+        self.encoder = encoder
+        self.smiles_cache = {}
+        self.sequence_cache = {}
+    def _encode_smiles(self, smiles: str, radius: int = 2, bits: int = 1024, features: bool = False):
+        if smiles is None:
+            return None
+        # Check if the SMILES is already in the cache
+        if smiles in self.smiles_cache:
+            return self.smiles_cache[smiles]
+        else:
+            # Encode the SMILES and store it in the cache
+            try:
+                mol = Chem.MolFromSmiles(smiles)
+                morgan = AllChem.GetMorganFingerprintAsBitVect(
+                    mol,
+                    radius=radius,
+                    nBits=bits,
+                    useFeatures=features,
+                )
+                morgan = np.array(morgan)
+                self.smiles_cache[smiles] = morgan
+                return morgan
+            except Exception as e:
+                print(f"Failed to encode SMILES: {smiles}")
+                print(e)
+                return None
+    def _encode_smiles_mult(self, smiles: List[str], radius: int = 2, bits: int = 1024, features: bool = False):
+        morgan = [self._encode_smiles(s, radius, bits, features) for s in smiles]
+        return np.array(morgan)
+    def _encode_sequence(self, sequence: str):
+        # Clear torch cache
+        torch.cuda.empty_cache()
+        if sequence is None:
+            return None
+        # Check if the sequence is already in the cache
+        if sequence in self.sequence_cache:
+            return self.sequence_cache[sequence]
+        else:
+            # Encode the sequence and store it in the cache
+            try:
+                encoded_sequence = encode_sequences([sequence], encoder=self.encoder)
+                self.sequence_cache[sequence] = encoded_sequence
+                return encoded_sequence
+            except Exception as e:
+                print(f"Failed to encode sequence: {sequence}")
+                print(e)
+                return None
+    def _encode_sequence_mult(self, sequences: List[str]):
+        seq = [self._encode_sequence(sequence) for sequence in sequences]
+        return np.array(seq)
+    def __predict_pair(self, drug_emb: np.ndarray, target_emb: np.ndarray, pred_leaf: bool):
+        if drug_emb.shape[0] < target_emb.shape[0]:
+            drug_emb = np.tile(drug_emb, (len(target_emb), 1))
+        elif len(drug_emb) > len(target_emb):
+            target_emb = np.tile(target_emb, (len(drug_emb), 1))
+        emb = self.bt_model.zero_shot(drug_emb, target_emb)
+        if pred_leaf:
+            d_emb = DMatrix(emb)
+            return self.gbm_model.get_booster().predict(d_emb, pred_leaf=True)
+        else:
+            return self.gbm_model.predict_proba(emb)[:, 1]
+    def predict(self, drug: List[str] or str, target: str, pred_leaf: bool = False):
+        if isinstance(drug, str):
+            drug_emb = self._encode_smiles(drug)
+        else:
+            drug_emb = self._encode_smiles_mult(drug)
+        target_emb = self._encode_sequence(target)
+        return self.__predict_pair(drug_emb, target_emb, pred_leaf)
+    def get_leaf_weights(self):
+        return self.gbm_model.get_booster().get_score(importance_type="weight")
+    def _predict_fasta(self, drug: str, fasta_path: str):
+        drug_emb = self._encode_smiles(drug)
+        results = []
+        # Extract targets from fasta
+        for target in tqdm(SeqIO.parse(fasta_path, "fasta"), desc="Predicting targets"):
+            target_emb = self._encode_sequence(str(target.seq))
+            pred = self.__predict_pair(drug_emb, target_emb)
+            results.append(
+                {
+                    "drug": drug,
+                    "target": target.id,
+                    "name": target.name,
+                    "description": target.description,
+                    "prediction": pred[0]
+                }
+            )
+        return pd.DataFrame(results)
+    def predict_fasta(self, drug: str, fasta_path: str, timeout_seconds: int = 120):
+        def process_target(target, results):
+            target_emb = self._encode_sequence(str(target.seq))
+            pred = self.__predict_pair(drug_emb, target_emb)
+            results.append({
+                "drug": drug,
+                "target": target.id,
+                "name": target.name,
+                "description": target.description,
+                "prediction": pred[0]
+            })
+        drug_emb = self._encode_smiles(drug)
+        results = []
+        # First, count the total number of records for the progress bar
+        total_records = sum(1 for _ in SeqIO.parse(fasta_path, "fasta"))
+        # Extract targets from fasta with a properly initialized tqdm progress bar
+        for target in tqdm(SeqIO.parse(fasta_path, "fasta"), total=total_records, desc="Predicting targets"):
+            thread_results = []
+            thread = threading.Thread(target=process_target, args=(target, thread_results))
+            thread.start()
+            thread.join(timeout_seconds)
+            if thread.is_alive():
+                print(f"Skipping target {target.id} due to timeout")
+                continue
+            results.extend(thread_results)
+        return pd.DataFrame(results)
+    def predict_uniprot(self, drug: List[str] or str, uniprot_id: str):
+        return self.predict(drug, uniprot2sequence(uniprot_id))

model/preprocessor.py ADDED Viewed

	@@ -0,0 +1,180 @@

+import os
+from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
+import torch
+from rdkit import Chem, DataStructs
+import pandas as pd
+import pickle as pkl
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+import sys
+# sys.path.append("../utils/")
+from utils.parallel import *
+from utils.chem import *
+from utils.sequence import *
+class Preprocessor:
+    """
+    Reads a CSV of drug/protein pairs and converts it to fingerprint and sequence-embedding arrays.
+    Construction immediately loads the file and encodes every row (load_data, then process_data).
+    """
+    def __init__(
+            self,
+            path: str,
+            radius: int = 2,
+            n_bits: int = 1024,
+            aa_embedding: str = "prottrans_t5_xl_u50",
+            num_workers: int = 1,
+    ):
+        """
+        Args:
+            path: CSV file to read.
+            radius: fingerprint radius passed to get_fp.
+            n_bits: fingerprint length passed to get_fp / store_fp.
+            aa_embedding: encoder name passed to encode_sequences.
+            num_workers: values above 1 route molecule parsing and fingerprinting through parallel().
+        """
+        self.path = path
+        self.radius = radius
+        self.n_bits = n_bits
+        self.aa_embedding = aa_embedding
+        self.num_workers = num_workers
+        # Filled by load_data / process_data below
+        self.data = None
+        self.fp = None
+        self.aa = None
+        self.split = None
+        self.label = None
+        self.load_data()
+        self.process_data()
+    def load_data(self):
+        """Read self.path into self.data; raises ValueError when the path is not a file."""
+        if os.path.isfile(self.path):
+            self.data = pd.read_csv(self.path, low_memory=False)
+        else:
+            raise ValueError("No data file found in the specified path")
+    def process_data(self):
+        """
+        Compute fingerprints (self.fp) and sequence embeddings (self.aa) from self.data.
+        Requires "smiles" and "sequence" columns (ValueError otherwise); optional "split"
+        and "label" columns are copied to self.split / self.label.
+        """
+        if "smiles" not in self.data.columns:
+            raise ValueError("No smiles column found in the data")
+        if "sequence" not in self.data.columns:
+            raise ValueError("No sequence column found in the data")
+        smiles = self.data.smiles.tolist()
+        seq = self.data.sequence.tolist()
+        if "split" in self.data.columns:
+            self.split = self.data.split.tolist()
+        if "label" in self.data.columns:
+            self.label = self.data.label.tolist()
+        if self.num_workers > 1:
+            mols = parallel(get_mols, self.num_workers, smiles)
+            fps = parallel(get_fp, self.num_workers, mols, self.radius, self.n_bits)
+        else:
+            mols = get_mols(smiles)
+            fps = get_fp(mols, self.radius, self.n_bits)
+        self.fp = store_fp(fps, self.n_bits)
+        self.aa = encode_sequences(seq, self.aa_embedding)
+    def return_generator(
+            self,
+            device,
+            batch_size: int = 512,
+            include_negatives: bool = False,
+            shuffle: bool = True,
+            validation_split: float = None,
+    ) -> (DataLoader, DataLoader):
+        """
+        Build (train_loader, validation_loader) from the processed data.
+        With a split column, rows marked "train" / "val" form the two loaders. Without split
+        and label columns, the whole dataset is used and, if validation_split is given, that
+        fraction of the indices becomes the validation set; otherwise the second element
+        returned is None.
+        """
+        if self.split is None and self.label is None:
+            print("No split or label columns found in the dataset")
+            dataset = MolAADataset(device, self.fp, self.aa)
+        elif self.split is not None:
+            # Taken whenever a split column exists, whether or not labels exist
+            print("Splitting data into train and validation sets from the dataset without considering labels")
+            train_fp, train_aa, val_fp, val_aa = [], [], [], []
+            for i in range(len(self.fp)):
+                if self.split[i] == "train":
+                    train_fp.append(self.fp[i])
+                    train_aa.append(self.aa[i])
+                elif self.split[i] == "val":
+                    val_fp.append(self.fp[i])
+                    val_aa.append(self.aa[i])
+            train_dataset = MolAADataset(device, train_fp, train_aa)
+            val_dataset = MolAADataset(device, val_fp, val_aa)
+            print(f"Train: {len(train_fp)}, Validation: {len(val_fp)}")
+            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)
+            validation_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=shuffle)
+            return train_loader, validation_loader
+        else:
+            # NOTE(review): the elif above catches every case with a split column, so this
+            # branch only runs when self.split is None and self.label is not None; indexing
+            # self.split[i] below would then raise TypeError. Confirm the intended branch order.
+            print("Splitting data into train and validation sets from the dataset")
+            train_fp, train_aa, val_fp, val_aa = [], [], [], []
+            for i in range(len(self.fp)):
+                if self.split[i] == "train":
+                    # Label-0 rows are kept only on request, with the protein embedding negated
+                    if include_negatives and self.label[i] == 0:
+                        train_fp.append(self.fp[i])
+                        train_aa.append(self.aa[i] * -1)
+                    elif self.label[i] == 1:
+                        train_fp.append(self.fp[i])
+                        train_aa.append(self.aa[i])
+                elif self.split[i] == "val":
+                    if include_negatives and self.label[i] == 0:
+                        val_fp.append(self.fp[i])
+                        val_aa.append(self.aa[i] * -1)
+                    elif self.label[i] == 1:
+                        val_fp.append(self.fp[i])
+                        val_aa.append(self.aa[i])
+            train_dataset = MolAADataset(device, train_fp, train_aa)
+            val_dataset = MolAADataset(device, val_fp, val_aa)
+            print(f"Train: {len(train_fp)}, Validation: {len(val_fp)}")
+            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)
+            validation_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=shuffle)
+            return train_loader, validation_loader
+        # Only the first branch (no split, no label) reaches this point; the others return above
+        if validation_split is not None:
+            print("Splitting data into train and validation by fractionation from the dataset")
+            dataset_size = len(dataset)
+            indices = list(range(dataset_size))
+            split = int(np.floor(validation_split * dataset_size))
+            if shuffle:
+                np.random.shuffle(indices)
+            # The first `split` indices go to validation, the remainder to training
+            train_indices, val_indices = indices[split:], indices[:split]
+            train_sampler = SubsetRandomSampler(train_indices)
+            valid_sampler = SubsetRandomSampler(val_indices)
+            train_loader = DataLoader(
+                dataset, batch_size=batch_size, sampler=train_sampler
+            )
+            validation_loader = DataLoader(
+                dataset, batch_size=batch_size, sampler=valid_sampler
+            )
+            return train_loader, validation_loader
+        else:
+            train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
+            return train_loader, None
+class MolAADataset(Dataset):
+    def __init__(self, device, mol, aa):
+        self.mol = mol
+        self.aa = aa
+        self.device = device
+    def __len__(self):
+        """
+        Method necessary for Pytorch training
+        """
+        return len(self.mol)
+    def __getitem__(self, idx):
+        """
+        Method necessary for Pytorch training
+        """
+        mol_sample = torch.tensor(self.mol[idx], dtype=torch.float32)
+        aa_sample = torch.tensor(self.aa[idx], dtype=torch.float32)
+        mol_sample = mol_sample.to(self.device)
+        aa_sample = aa_sample.to(self.device)
+        return mol_sample, aa_sample

model/stash/14062024_0910/history.json ADDED Viewed

Binary file (3.33 kB). View file

model/stash/14062024_0910/log.txt ADDED Viewed

	@@ -0,0 +1,41 @@

+----------------
+Run description: Manual param optim
+----------------
+message: yes
+path: all_drugbank_smiles_sequence_prost_preprocessor.pkl
+load_preprocessor: True
+radius: 2
+n_bits: 1024
+num_workers: 64
+enc_n_neurons: 4096
+enc_n_layers: 3
+proj_n_neurons: 2048
+proj_n_layers: 1
+embedding_dim: 512
+act_function: relu
+aa_emb_size: 1024
+loss_weight: 0.005
+batch_size: 4096
+epochs: 250
+optimizer: adamw
+learning_rate: 0.0003
+beta_1: 0.9
+beta_2: 0.999
+weight_decay: 5e-05
+step_size: 10
+gamma: 0.1
+include_negatives: False
+hyperparameter_tuning: False
+val_split: 0.1
+aa_embedding: prost_t5
+model_type: barlow_twins
+device: cuda:0
+msg: Manual param optim
+start: 1718356109.3235965
+data: <preprocessor.Preprocessor object at 0x72f2d495eb10>
+train: <torch.utils.data.dataloader.DataLoader object at 0x72f2d3a66d50>
+val: <torch.utils.data.dataloader.DataLoader object at 0x72f2d480e7b0>
+file: <_io.BufferedReader name='all_drugbank_smiles_sequence_prost_preprocessor.pkl'>
+t_preprocessing: 0
+model: <barlow_twins.BarlowTwins object at 0x72f2d7652540>
+t_model: 1

model/stash/14062024_0910/params.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:065b380d18b2c40bfe031b14480665e5603fcaf06a731f8bc0ec92d829bb2169
+size 423

model/stash/14062024_0910/weights.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:55014d6bc054a1aefc22e9c893deaf25939a639efa63f46e2083ff602a5961f1
+size 340300017

model/xgb_models/14062024_0910_barlowdti_xxl_model.json ADDED Viewed

The diff for this file is too large to render. See raw diff

model/xgb_models/xgb_model_BIOSNAP_full_data_14062024_0910_bt_optimized_0.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f1481be9c69558a91c41d65970ba60ace4cb685a4c90b03be37a813b9f1abc96
+size 27471157

model/xgb_models/xgb_model_BIOSNAP_missing_data_70_14062024_0910_bt_optimized_0.json ADDED Viewed

The diff for this file is too large to render. See raw diff

model/xgb_models/xgb_model_BIOSNAP_missing_data_80_14062024_0910_bt_optimized_0.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0ab4553ac67b4d75b85eae69c6a19daaad8c6575c3d01252dc8b58682656551b
+size 12831515

model/xgb_models/xgb_model_BIOSNAP_missing_data_90_14062024_0910_bt_optimized_0.json ADDED Viewed

The diff for this file is too large to render. See raw diff

model/xgb_models/xgb_model_BIOSNAP_missing_data_95_14062024_0910_bt_optimized_0.json ADDED Viewed

The diff for this file is too large to render. See raw diff

model/xgb_models/xgb_model_BIOSNAP_unseen_drug_14062024_0910_bt_optimized_0.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b3992b670436e6e2c728eade63581c1962f7cc546b81fb61cd43b6f9eb426f17
+size 40338690

model/xgb_models/xgb_model_BIOSNAP_unseen_protein_14062024_0910_bt_optimized_0.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:557742dd11578818bbe6454c946ab2d5a5846556457d22c89cdbf5b47bd34831
+size 18191873

model/xgb_models/xgb_model_BindingDB_14062024_0910_bt_optimized_0.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:84e911499ec13f38e1edc4b006faf2ef3e827d1d7d0fd53f481e0e41c82d59c1
+size 24742914

model/xgb_models/xgb_model_DAVIS_14062024_0910_bt_optimized_0.json ADDED Viewed

The diff for this file is too large to render. See raw diff

model/xgb_models/xgb_model_nature_mach_intel_BindingDB_cluster_14062024_0910_bt_optimized_0.json ADDED Viewed

The diff for this file is too large to render. See raw diff

model/xgb_models/xgb_model_nature_mach_intel_BindingDB_protein_14062024_0910_bt_optimized_0.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d4a4b08241bf5779e9ef688b6c5a452ac13f4a67480ec6c17cc203ddd35ab7f7
+size 16983875

model/xgb_models/xgb_model_nature_mach_intel_BindingDB_random_14062024_0910_bt_optimized_0.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ef54574bb754850ec34c0769df1222c6087541fc5e5bb3e17653982e079fb440
+size 64523467

model/xgb_models/xgb_model_nature_mach_intel_BindingDB_scaffold_14062024_0910_bt_optimized_0.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:716944cd5a88e6b7dd062a3c9cc331980908541d1b8039f321bdda0112c6668d
+size 25668977

model/xgb_models/xgb_model_nature_mach_intel_BioSNAP_cluster_14062024_0910_bt_optimized_0.json ADDED Viewed

The diff for this file is too large to render. See raw diff

model/xgb_models/xgb_model_nature_mach_intel_BioSNAP_protein_14062024_0910_bt_optimized_0.json ADDED Viewed

The diff for this file is too large to render. See raw diff

model/xgb_models/xgb_model_nature_mach_intel_BioSNAP_random_14062024_0910_bt_optimized_0.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:faaa3a7fcb8efd23876b23b9a07620bd4ca007d05c354e6bfd2c413f3244402b
+size 18444715

model/xgb_models/xgb_model_nature_mach_intel_BioSNAP_scaffold_14062024_0910_bt_optimized_0.json ADDED Viewed

The diff for this file is too large to render. See raw diff

model/xgb_models/xgb_model_nature_mach_intel_Human_protein_14062024_0910_bt_optimized_0.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3e97db0190ff7a15820982d35191f0092319801ea2992c2ef545b9028a8d2ca1
+size 12630195

model/xgb_models/xgb_model_nature_mach_intel_Human_random_14062024_0910_bt_optimized_0.json ADDED Viewed

The diff for this file is too large to render. See raw diff

model/xgb_models/xgb_model_nature_mach_intel_Human_scaffold_14062024_0910_bt_optimized_0.json ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,25 @@

+Babel==2.14.0
+biopython==1.83
+chembl-structure-pipeline==1.2.2
+ConfigSpace==0.7.1
+cycler==0.12.1
+dask==2024.5.1
+joblib==1.4.0
+keras==3.4.1
+numpy==1.26.4
+optuna==3.6.1
+pandas==2.2.2
+plotly
+rdkit==2023.9.5
+scikit-learn==1.4.2
+scipy==1.13.0
+seaborn==0.13.2
+sentencepiece==0.2.0
+shap==0.46.0
+smac==2.1.0
+tensorflow==2.17.0
+torch==2.4.0
+tqdm==4.66.2
+transformers==4.41.0
+umap==0.1.1
+xgboost==2.0.3

utils/__init__.py ADDED Viewed

File without changes

utils/chem.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import rdkit
+from rdkit import Chem, DataStructs
+from rdkit.Chem import AllChem
+from typing import *
+import numpy as np
+from rdkit import RDLogger
+RDLogger.DisableLog("rdApp.*")
+def try_or_none(func, *args, **kwargs):
+    try:
+        return func(*args, **kwargs)
+    except:
+        return None
+def get_smiles(mols: List[rdkit.Chem.rdchem.Mol]) -> List[str]:
+    """
+    Gets list of smiles from list of rdkit molecules
+    """
+    return [Chem.MolToSmiles(x) for x in mols]
+def get_mols(smiles: List[str]) -> List[rdkit.Chem.rdchem.Mol]:
+    """
+    Gets list of rdkit molecules from list of smiles
+    """
+    return [Chem.MolFromSmiles(x) for x in smiles]
+def get_fp(
+    mols: List[rdkit.Chem.rdchem.Mol],
+    radius: int = 2,
+    nBits: int = 1024,
+    useFeatures: bool = False,
+):
+    """
+    Computes ECFP/FCFP from list of RDKIT mols
+    """
+    output = np.empty(len(mols), dtype=object)
+    for i, mol in enumerate(mols):
+        output[i] = AllChem.GetMorganFingerprintAsBitVect(
+            mol,
+            radius=radius,
+            nBits=nBits,
+            useFeatures=useFeatures,
+        )
+    return output
+def store_fp(fps: List, nBits: int = 1024):
+    """
+    Stores list of RDKIT sparse vectors in numpy array using C data structures
+    """
+    array = np.empty((len(fps), nBits), dtype=np.float32)
+    for i in range(len(array)):
+        DataStructs.ConvertToNumpyArray(fps[i], array[i])
+    return array

utils/parallel.py ADDED Viewed

	@@ -0,0 +1,78 @@

+import multiprocessing
+import numpy as np
+import psutil
+from typing import *
+def parallel(function: Callable, n_jobs: int, x: List, *args) -> List:
+    """Higher order function to run other functions on multiple processes
+    Simple parallelization utility, slices the input list x in chunks and
+    executes the function on each chunk in different processes. Not suited
+    for functions that have already multithreading/processing implemented.
+    Args:
+        function:   callable to run on different processes
+        n_jobs:     how many cores to use
+        x:          list (M,) to use as input for function
+        *args:      optional arguments for function
+    Returns:
+        Object (M,) containing the output of function. Content and type depend
+        on function. If function returns list, then parallel will also return
+        a list. If function returns a numpy array, then parallel will return an
+        array.
+    """
+    # check that parallelization is required. n_jobs might be passed as 1 by
+    # i.e. Dataset methods if they notice that the loaded HTS is too large
+    # to be used on different cores.
+    if n_jobs > 1:
+        # split list in chunks
+        chunks = split_list(x, n_jobs)
+        # create list of tuples containing the chunks and *args
+        args = stitch_args(chunks, args)
+        # create multiprocessing pool and run function on chunks
+        pool = multiprocessing.Pool(n_jobs)
+        output = pool.starmap(function, args)
+        pool.close()
+        # unroll output (list of function outputs) into a single object
+        # of size M
+        if isinstance(output[0], list):
+            unrolled = [x for k in output for x in k]
+        elif isinstance(output[0], np.ndarray):
+            unrolled = np.concatenate(output, axis=0)
+    else:
+        # run function normally
+        unrolled = function(x, *args)
+    return unrolled
+def stitch_args(chunks: List[List], args: Tuple) -> List[Tuple]:
+    """
+    Stitches together the chunks to be run in parallel and optional function
+    arguments into tuples
+    """
+    output = [[x] for x in chunks]
+    for i in range(len(output)):
+        for j in range(len(args)):
+            output[i].append(args[j])
+    return [tuple(x) for x in output]
+def split_list(x: List, n_jobs: int) -> List[List]:
+    """
+    Converts a list into a list of lists of size n_jobs.
+    """
+    idxs = np.array_split(range(len(x)), n_jobs)
+    output = [0] * n_jobs
+    for i in range(n_jobs):
+        output[i] = [x[k] for k in idxs[i]]
+    return output

utils/sequence.py ADDED Viewed

	@@ -0,0 +1,339 @@

+import requests
+import numpy as np
+# from bio_embeddings.embed import SeqVecEmbedder, ProtTransBertBFDEmbedder, ProtTransT5XLU50Embedder
+from transformers import T5Tokenizer, T5EncoderModel
+import torch
+import re
+import concurrent.futures
+from tqdm.auto import tqdm
+import multiprocessing
+from multiprocessing import Pool
+# Registry of protein language models, keyed by encoder name.
+# NOTE: the from_pretrained calls run when this module is imported, so every
+# tokenizer/model listed here is loaded at import time, not on first use.
+ENCODERS = {
+    # "seqvec": SeqVecEmbedder(),
+    # "prottrans_bert_bfd": ProtTransBertBFDEmbedder(),
+    # "prottrans_t5_xl_u50": ProtTransT5XLU50Embedder(),
+    "prot_t5": {
+        "tokenizer": T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_half_uniref50-enc', do_lower_case=False),
+        "model": T5EncoderModel.from_pretrained('Rostlab/prot_t5_xl_half_uniref50-enc')
+    },
+    "prost_t5": {
+        "tokenizer": T5Tokenizer.from_pretrained("Rostlab/ProstT5", do_lower_case=False),
+        "model": T5EncoderModel.from_pretrained("Rostlab/ProstT5")
+    }
+}
+def drugbank2smiles(drugbank_id):
+    url = f"https://go.drugbank.com/drugs/{drugbank_id}.smiles"
+    response = requests.get(url)
+    if response.status_code == 200:
+        return response.text
+    else:
+        # print(f"Failed to get SMILES for {drugbank_id}")
+        return None
+def uniprot2sequence(uniprot_id):
+    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
+    response = requests.get(url)
+    if response.status_code == 200:
+        # Extract sequence from FASTA format
+        sequence = "".join(response.text.split("\n")[1:])
+        return sequence
+    else:
+        # print(f"Failed to get sequence for {uniprot_id}")
+        return None
+def encode_sequences(sequences: list, encoder: str):
+    if encoder not in ENCODERS.keys():
+        raise ValueError(f"Invalid encoder: {encoder}")
+    model = ENCODERS[encoder]["model"]
+    tokenizer = ENCODERS[encoder]["tokenizer"]
+    # Cache for storing encoded sequences
+    cache = {}
+    def encode_sequence(sequence: str):
+        if sequence is None:
+            return None
+        if len(sequence) <= 3:
+            raise ValueError(f"Invalid sequence: {sequence}")
+        # Check if the sequence is already in the cache
+        if sequence in cache:
+            return cache[sequence]
+        else:
+            # Encode the sequence and store it in the cache
+            try:
+                encoded_sequence = model.embed(sequence)
+                encoded_sequence = np.mean(encoded_sequence, axis=0)
+                cache[sequence] = encoded_sequence
+                return encoded_sequence
+            except Exception as e:
+                print(f"Failed to encode sequence: {sequence}")
+                print(e)
+                return None
+    def encode_sequence_device_failover(sequence: str, function, timeout: int = 120):
+        if sequence is None:
+            return None
+        if sequence in cache:
+            return cache[sequence]
+        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        torch.cuda.empty_cache()
+        try:
+            # Try to process using GPU
+            result = function(sequence, device)
+        except RuntimeError as e:
+            print(e)
+            return None
+            if "CUDA out of memory." in str(e):
+                print("Trying on CPU instead.")
+                device = torch.device("cpu")
+                with concurrent.futures.ThreadPoolExecutor() as executor:
+                    future = executor.submit(function, sequence, device)
+                    try:
+                        result = future.result(timeout=timeout)
+                    except concurrent.futures.TimeoutError:
+                        print(f"CPU encoding timed out.")
+                        cache[sequence] = None
+                        return None
+            else:
+                cache[sequence] = None
+                raise Exception(e)
+        except Exception as e:
+            print(f"Failed to encode sequence: {sequence}")
+            cache[sequence] = None
+            return None
+        cache[sequence] = result
+        return result
+    def encode_sequence_hf_3d(sequence, device):
+        sequence_1d_list = [sequence]
+        model.full() if device == "cpu" else model.half()
+        model.to(device)
+        ids = tokenizer.batch_encode_plus(
+            sequence_1d_list,
+            add_special_tokens=True,
+            padding="longest",
+            return_tensors="pt"
+        ).to(device)
+        with torch.no_grad():
+            embedding = model(
+                ids.input_ids,
+                attention_mask=ids.attention_mask
+            )
+        # Skip the first token, which is the special token for the entire sequence and mean pool the rest
+        assert embedding.last_hidden_state.shape[0] == 1
+        encoded_sequence = embedding.last_hidden_state[0, 1:-1, :]
+        encoded_sequence = encoded_sequence.mean(dim=0).cpu().numpy().flatten()
+        assert encoded_sequence.shape[0] == 1024
+        return encoded_sequence
+    def encode_sequence_hf(sequence, device):
+        sequence_1d_list = [sequence]
+        model.full() if device == "cpu" else model.half()
+        model.to(device)
+        ids = tokenizer.batch_encode_plus(
+            sequence_1d_list,
+            add_special_tokens=True,
+            padding="longest",
+            return_tensors="pt"
+        ).to(device)
+        with torch.no_grad():
+            embedding = model(
+                ids.input_ids,
+                attention_mask=ids.attention_mask
+            )
+        assert embedding.last_hidden_state.shape[0] == 1
+        encoded_sequence = embedding.last_hidden_state[0, :-1, :]
+        encoded_sequence = encoded_sequence.mean(dim=0).cpu().numpy().flatten()
+        assert encoded_sequence.shape[0] == 1024
+        return encoded_sequence
+    # Use list comprehension to encode all sequences, utilizing the cache
+    if encoder == "seqvec":
+        raise NotImplementedError("SeqVec is not supported")
+        seq = encoder_function.embed(list(sequences))
+        seq = np.sum(seq, axis=0)
+    if encoder == "prost_t5":
+        sequences = [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequence in sequences]
+        # The direction of the translation is indicated by two special tokens:
+        # if you go from AAs to 3Di (or if you want to embed AAs), you need to prepend "<AA2fold>"
+        # if you go from 3Di to AAs (or if you want to embed 3Di), you need to prepend "<fold2AA>"
+        sequences = ["<AA2fold>" + " " + s if s.isupper() else "<fold2AA>" + " " + s for s in sequences]
+        seq = [encode_sequence_device_failover(sequence, encode_sequence_hf_3d) for sequence in tqdm(sequences, desc="Encoding sequences")]
+    elif encoder == "prot_t5":
+        sequences = [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequence in sequences]
+        seq = [encode_sequence_device_failover(sequence, encode_sequence_hf) for sequence in tqdm(sequences, desc="Encoding sequences")]
+    else:
+        raise NotImplementedError("SeqVec is not supported")
+        seq = [encode_sequence(sequence) for sequence in sequences]
+    return np.array(seq)
+class SequenceEncoder:
+    def __init__(self, encoder: str):
+        if encoder not in ENCODERS:
+            raise ValueError(f"Invalid encoder: {encoder}")
+        self.encoder = encoder
+        self.model = ENCODERS[encoder]["model"]
+        self.tokenizer = ENCODERS[encoder]["tokenizer"]
+        self.cache = {}
+    def encode_sequence(self, sequence: str):
+        if sequence is None:
+            return None
+        if len(sequence) <= 3:
+            raise ValueError(f"Invalid sequence: {sequence}")
+        if sequence in self.cache:
+            return self.cache[sequence]
+        try:
+            encoded_sequence = self.model.embed(sequence)
+            encoded_sequence = np.mean(encoded_sequence, axis=0)
+            self.cache[sequence] = encoded_sequence
+            return encoded_sequence
+        except Exception as e:
+            print(f"Failed to encode sequence: {sequence}")
+            print(e)
+            return None
+    def encode_sequence_device_failover(self, sequence: str, function, timeout: int = 5):
+        if sequence is None:
+            return None
+        if sequence in self.cache:
+            return self.cache[sequence]
+        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        torch.cuda.empty_cache()
+        try:
+            result = function(sequence, device)
+        except RuntimeError as e:
+            return None
+            print(e)
+            if "CUDA out of memory." in str(e):
+                print("Trying on CPU instead.")
+                device = torch.device("cpu")
+                with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                    future = executor.submit(function, sequence, device)
+                    try:
+                        result = future.result(timeout=timeout)
+                    except:
+                        print(f"CPU encoding timed out.")
+                        self.cache[sequence] = None
+                        return None
+                    finally:
+                        executor.shutdown(wait=False)
+            else:
+                self.cache[sequence] = None
+                return None
+        except Exception as e:
+            print(f"Failed to encode sequence: {sequence}")
+            self.cache[sequence] = None
+            return None
+        self.cache[sequence] = result
+        return result
+    def encode_sequence_hf_3d(self, sequence, device):
+        sequence_1d_list = [sequence]
+        self.model.full() if device == "cpu" else self.model.half()
+        self.model.to(device)
+        ids = self.tokenizer.batch_encode_plus(
+            sequence_1d_list,
+            add_special_tokens=True,
+            padding="longest",
+            return_tensors="pt"
+        ).to(device)
+        with torch.no_grad():
+            embedding = self.model(
+                ids.input_ids,
+                attention_mask=ids.attention_mask
+            )
+        assert embedding.last_hidden_state.shape[0] == 1
+        encoded_sequence = embedding.last_hidden_state[0, 1:-1, :]
+        encoded_sequence = encoded_sequence.mean(dim=0).cpu().numpy().flatten()
+        assert encoded_sequence.shape[0] == 1024
+        return encoded_sequence
+    def encode_sequence_hf(self, sequence, device):
+        sequence_1d_list = [sequence]
+        self.model.full() if device == "cpu" else self.model.half()
+        self.model.to(device)
+        ids = self.tokenizer.batch_encode_plus(
+            sequence_1d_list,
+            add_special_tokens=True,
+            padding="longest",
+            return_tensors="pt"
+        ).to(device)
+        with torch.no_grad():
+            embedding = self.model(
+                ids.input_ids,
+                attention_mask=ids.attention_mask
+            )
+        assert embedding.last_hidden_state.shape[0] == 1
+        encoded_sequence = embedding.last_hidden_state[0, :-1, :]
+        encoded_sequence = encoded_sequence.mean(dim=0).cpu().numpy().flatten()
+        assert encoded_sequence.shape[0] == 1024
+        return encoded_sequence
+    def encode_sequences(self, sequences: list):
+        if self.encoder == "seqvec":
+            raise NotImplementedError("SeqVec is not supported")
+            seq = self.encoder_function.embed(list(sequences))
+            seq = np.sum(seq, axis=0)
+        elif self.encoder == "prost_t5":
+            sequences = [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequence in sequences]
+            sequences = ["<AA2fold>" + " " + s if s.isupper() else "<fold2AA>" + " " + s for s in sequences]
+            seq = [self.encode_sequence_device_failover(sequence, self.encode_sequence_hf_3d) for sequence in tqdm(sequences, desc="Encoding sequences")]
+        elif self.encoder == "prot_t5":
+            sequences = [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequence in sequences]
+            seq = [self.encode_sequence_device_failover(sequence, self.encode_sequence_hf) for sequence in tqdm(sequences, desc="Encoding sequences")]
+        else:
+            raise NotImplementedError("SeqVec is not supported")
+            seq = [self.encode_sequence(sequence) for sequence in sequences]
+        if any([x is None for x in seq]):
+            return seq
+        else:
+            return np.array(seq)