ribesstefano committed on
Commit
91692a4
1 Parent(s): 022416b

Polished hparam code and added ablation studies

notebooks/protac_degradation_predictor.py CHANGED
@@ -1,34 +1,34 @@
1
  import optuna
2
  import pandas as pd
3
  from rdkit import Chem
4
  from rdkit.Chem import AllChem
5
  from rdkit import DataStructs
6
  from collections import defaultdict
7
-
8
- import h5py
9
- import numpy as np
10
  from tqdm.auto import tqdm
11
-
12
- import os
13
- import urllib.request
14
-
15
- from sklearn.preprocessing import StandardScaler
16
-
17
- # ## Define Torch Dataset
18
-
19
  from imblearn.over_sampling import SMOTE, ADASYN
20
- from sklearn.preprocessing import LabelEncoder
21
- import pandas as pd
22
- import numpy as np
23
-
24
- from torch.utils.data import Dataset, DataLoader
25
 
26
- import warnings
27
  import torch
28
  import torch.nn as nn
29
  import torch.nn.functional as F
30
  import torch.optim as optim
31
  import pytorch_lightning as pl
 
32
  from torchmetrics import (
33
  Accuracy,
34
  AUROC,
@@ -38,13 +38,11 @@ from torchmetrics import (
38
  )
39
  from torchmetrics import MetricCollection
40
 
41
- import pickle
42
 
43
- from sklearn.model_selection import (
44
- StratifiedKFold,
45
- StratifiedGroupKFold,
46
- )
47
- from sklearn.preprocessing import OrdinalEncoder
48
 
49
 
50
  protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
@@ -71,8 +69,6 @@ print(f'Number of compounds in test set: {len(unlabeled_df)}')
71
  # Protein embeddings downloaded from [Uniprot](https://www.uniprot.org/help/embeddings).
72
  #
73
  # Please note that running the following cell the first time might take a while.
74
-
75
-
76
  download_link = "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/embeddings/UP000005640_9606/per-protein.h5"
77
  embeddings_path = "../data/uniprot2embedding.h5"
78
  if not os.path.exists(embeddings_path):
@@ -82,26 +78,17 @@ if not os.path.exists(embeddings_path):
82
 
83
  protein_embeddings = {}
84
  with h5py.File("../data/uniprot2embedding.h5", "r") as file:
85
- print(f"number of entries: {len(file.items()):,}")
86
  uniprots = protac_df['Uniprot'].unique().tolist()
87
  uniprots += protac_df['E3 Ligase Uniprot'].unique().tolist()
88
  for i, sequence_id in tqdm(enumerate(uniprots), desc='Loading protein embeddings'):
89
  try:
90
  embedding = file[sequence_id][:]
91
  protein_embeddings[sequence_id] = np.array(embedding)
92
- if i < 10:
93
- print(
94
- f"\tid: {sequence_id}, "
95
- f"\tembeddings shape: {embedding.shape}, "
96
- f"\tembeddings mean: {np.array(embedding).mean()}"
97
- )
98
  except KeyError:
99
  print(f'KeyError for {sequence_id}')
100
  protein_embeddings[sequence_id] = np.zeros((1024,))
101
 
102
- # ## Load Cell Embeddings
103
-
104
-
105
  cell2embedding_filepath = '../data/cell2embedding.pkl'
106
  with open(cell2embedding_filepath, 'rb') as f:
107
  cell2embedding = pickle.load(f)
@@ -113,8 +100,7 @@ for cell_line in protac_df['Cell Line Identifier'].unique():
113
  if cell_line not in cell2embedding:
114
  cell2embedding[cell_line] = np.zeros(emb_shape)
115
 
116
- # ## Precompute Molecular Fingerprints
117
-
118
  morgan_fpgen = AllChem.GetMorganGenerator(
119
  radius=15,
120
  fpSize=1024,
@@ -142,7 +128,6 @@ print(f'Number of SMILES with overlapping fingerprints: {len(overlapping_smiles)
142
  print(f'Number of overlapping SMILES in protac_df: {len(protac_df[protac_df["Smiles"].isin(overlapping_smiles)])}')
143
 
144
  # Get the pair-wise tanimoto similarity between the PROTAC fingerprints
145
-
146
  tanimoto_matrix = defaultdict(list)
147
  for i, smiles1 in enumerate(tqdm(protac_df['Smiles'].unique(), desc='Computing Tanimoto similarity')):
148
  fp1 = smiles2fp[smiles1]
@@ -158,11 +143,6 @@ protac_df['Avg Tanimoto'] = protac_df['Smiles'].map(avg_tanimoto)
158
 
159
  smiles2fp = {s: np.array(fp) for s, fp in smiles2fp.items()}
160
 
161
- # ## Set the Column to Predict
162
-
163
- active_col = 'Active'
164
- # active_col = 'Active - OR'
165
-
166
 
167
  class PROTAC_Dataset(Dataset):
168
  def __init__(
@@ -274,25 +254,24 @@ class PROTAC_Dataset(Dataset):
274
  }
275
  return elem
276
 
277
- # Ignore UserWarning from PyTorch Lightning
278
- warnings.filterwarnings("ignore", ".*does not have many workers.*")
279
 
280
  class PROTAC_Model(pl.LightningModule):
281
 
282
  def __init__(
283
  self,
284
- hidden_dim,
285
- smiles_emb_dim=1024,
286
- poi_emb_dim=1024,
287
- e3_emb_dim=1024,
288
- cell_emb_dim=768,
289
- batch_size=32,
290
- learning_rate=1e-3,
291
- dropout=0.2,
292
- train_dataset=None,
293
- val_dataset=None,
294
- test_dataset=None,
295
- disabled_embeddings=[],
 
296
  ):
297
  super().__init__()
298
  self.poi_emb_dim = poi_emb_dim
@@ -302,6 +281,7 @@ class PROTAC_Model(pl.LightningModule):
302
  self.hidden_dim = hidden_dim
303
  self.batch_size = batch_size
304
  self.learning_rate = learning_rate
 
305
  self.train_dataset = train_dataset
306
  self.val_dataset = val_dataset
307
  self.test_dataset = test_dataset
@@ -318,48 +298,18 @@ class PROTAC_Model(pl.LightningModule):
318
 
319
  if 'poi' not in self.disabled_embeddings:
320
  self.poi_emb = nn.Linear(poi_emb_dim, hidden_dim)
321
- # # Set the POI surrogate model as a Sequential model
322
- # self.poi_emb = nn.Sequential(
323
- # nn.Linear(poi_emb_dim, hidden_dim),
324
- # nn.GELU(),
325
- # nn.Dropout(p=dropout),
326
- # nn.Linear(hidden_dim, hidden_dim),
327
- # # nn.ReLU(),
328
- # # nn.Dropout(p=dropout),
329
- # )
330
  if 'e3' not in self.disabled_embeddings:
331
  self.e3_emb = nn.Linear(e3_emb_dim, hidden_dim)
332
- # self.e3_emb = nn.Sequential(
333
- # nn.Linear(e3_emb_dim, hidden_dim),
334
- # # nn.ReLU(),
335
- # nn.Dropout(p=dropout),
336
- # # nn.Linear(hidden_dim, hidden_dim),
337
- # # nn.ReLU(),
338
- # # nn.Dropout(p=dropout),
339
- # )
340
  if 'cell' not in self.disabled_embeddings:
341
  self.cell_emb = nn.Linear(cell_emb_dim, hidden_dim)
342
- # self.cell_emb = nn.Sequential(
343
- # nn.Linear(cell_emb_dim, hidden_dim),
344
- # # nn.ReLU(),
345
- # nn.Dropout(p=dropout),
346
- # # nn.Linear(hidden_dim, hidden_dim),
347
- # # nn.ReLU(),
348
- # # nn.Dropout(p=dropout),
349
- # )
350
  if 'smiles' not in self.disabled_embeddings:
351
  self.smiles_emb = nn.Linear(smiles_emb_dim, hidden_dim)
352
- # self.smiles_emb = nn.Sequential(
353
- # nn.Linear(smiles_emb_dim, hidden_dim),
354
- # # nn.ReLU(),
355
- # nn.Dropout(p=dropout),
356
- # # nn.Linear(hidden_dim, hidden_dim),
357
- # # nn.ReLU(),
358
- # # nn.Dropout(p=dropout),
359
- # )
360
-
361
- self.fc1 = nn.Linear(
362
- hidden_dim * (4 - len(self.disabled_embeddings)), hidden_dim)
363
  self.fc2 = nn.Linear(hidden_dim, hidden_dim)
364
  self.fc3 = nn.Linear(hidden_dim, 1)
365
 
@@ -394,9 +344,16 @@ class PROTAC_Model(pl.LightningModule):
394
  embeddings.append(self.cell_emb(cell_emb))
395
  if 'smiles' not in self.disabled_embeddings:
396
  embeddings.append(self.smiles_emb(smiles_emb))
397
- x = torch.cat(embeddings, dim=1)
398
- x = self.dropout(F.gelu(self.fc1(x)))
399
- x = self.dropout(F.gelu(self.fc2(x)))
400
  x = self.fc3(x)
401
  return x
402
 
@@ -468,178 +425,6 @@ class PROTAC_Model(pl.LightningModule):
468
  shuffle=False,
469
  )
470
 
471
- # ## Test Sets
472
-
473
- # We want a different test set per Cross-Validation (CV) experiment (see further down). We are interested in three scenarios:
474
- # * Randomly splitting the data into training and test sets. Hence, the test set shall contain unique SMILES and Uniprots
475
- # * Splitting the data according to their Uniprot. Hence, the test set shall contain unique Uniprots
476
- # * Splitting the data according to their SMILES, _i.e._, the test set shall contain unique SMILES
477
-
478
- test_indeces = {}
479
-
480
- # Isolating the unique SMILES and Uniprots:
481
-
482
- active_df = protac_df[protac_df[active_col].notna()].copy()
483
-
484
- # Get the unique SMILES and Uniprot
485
- unique_smiles = active_df['Smiles'].value_counts() == 1
486
- unique_uniprot = active_df['Uniprot'].value_counts() == 1
487
- print(f'Number of unique SMILES: {unique_smiles.sum()}')
488
- print(f'Number of unique Uniprot: {unique_uniprot.sum()}')
489
- # Sample 1% of the len(active_df) from unique SMILES and Uniprot and get the
490
- # indices for a test set
491
- n = int(0.05 * len(active_df)) // 2
492
- unique_smiles = unique_smiles[unique_smiles].sample(n=n, random_state=42)
493
- # unique_uniprot = unique_uniprot[unique_uniprot].sample(n=, random_state=42)
494
- unique_indices = active_df[
495
- active_df['Smiles'].isin(unique_smiles.index) &
496
- active_df['Uniprot'].isin(unique_uniprot.index)
497
- ].index
498
- print(f'Number of unique indices: {len(unique_indices)} ({len(unique_indices) / len(active_df):.1%})')
499
-
500
- test_indeces['random'] = unique_indices
501
-
502
- # # Get the test set
503
- # test_df = active_df.loc[unique_indices]
504
- # # Bar plot of the test Active distribution as percentage
505
- # test_df['Active'].value_counts(normalize=True).plot(kind='bar')
506
- # plt.title('Test set Active distribution')
507
- # plt.show()
508
- # # Bar plot of the test Active - OR distribution as percentage
509
- # test_df['Active - OR'].value_counts(normalize=True).plot(kind='bar')
510
- # plt.title('Test set Active - OR distribution')
511
- # plt.show()
512
-
513
- # Isolating the unique Uniprots:
514
-
515
- active_df = protac_df[protac_df[active_col].notna()].copy()
516
-
517
- unique_uniprot = active_df['Uniprot'].value_counts() == 1
518
- print(f'Number of unique Uniprot: {unique_uniprot.sum()}')
519
-
520
- # NOTE: Since they are very few, all unique Uniprot will be used as test set.
521
- # Get the indices for a test set
522
- unique_indices = active_df[active_df['Uniprot'].isin(unique_uniprot.index)].index
523
-
524
-
525
- test_indeces['uniprot'] = unique_indices
526
- print(f'Number of unique indices: {len(unique_indices)} ({len(unique_indices) / len(active_df):.1%})')
527
-
528
- # DEPRECATED: The following results in a test set that is too small. Before starting any training, we isolate a small group of test data. Each element in the test set is selected so that all the following conditions are met:
529
- # * its SMILES is unique
530
- # * its POI is unique
531
- # * its (SMILES, POI) pair is unique
532
-
533
- active_df = protac_df[protac_df[active_col].notna()]
534
-
535
- # Find the samples that:
536
- # * have their SMILES appearing only once in the dataframe
537
- # * have their Uniprot appearing only once in the dataframe
538
- # * have their (Smiles, Uniprot) pair appearing only once in the dataframe
539
- unique_smiles = active_df['Smiles'].value_counts() == 1
540
- unique_uniprot = active_df['Uniprot'].value_counts() == 1
541
- unique_smiles_uniprot = active_df.groupby(['Smiles', 'Uniprot']).size() == 1
542
-
543
- # Get the indices of the unique samples
544
- unique_smiles_idx = active_df['Smiles'].map(unique_smiles)
545
- unique_uniprot_idx = active_df['Uniprot'].map(unique_uniprot)
546
- unique_smiles_uniprot_idx = active_df.set_index(['Smiles', 'Uniprot']).index.map(unique_smiles_uniprot)
547
-
548
- # Cross the indices to get the unique samples
549
- # unique_samples = active_df[unique_smiles_idx & unique_uniprot_idx & unique_smiles_uniprot_idx].index
550
- unique_samples = active_df[unique_smiles_idx & unique_uniprot_idx].index
551
- test_df = active_df.loc[unique_samples]
552
-
553
- warnings.filterwarnings("ignore", ".*FixedLocator*")
554
-
555
- # ## Cross-Validation Training
556
-
557
- # Cross validation training with 5 splits. The split operation is done in three different ways:
558
- #
559
- # * Random split
560
- # * POI-wise: some POIs never in both splits
561
- # * Least Tanimoto similarity PROTAC-wise
562
-
563
- # ### Plotting CV Folds
564
-
565
-
566
- # NOTE: When set to 60, it will result in 29 groups, with nice distributions of
567
- # the number of unique groups in the train and validation sets, together with
568
- # the number of active and inactive PROTACs.
569
- n_bins_tanimoto = 60 if active_col == 'Active' else 400
570
- n_splits = 5
571
- # The train and validation sets will be created from the active PROTACs only,
572
- # i.e., the ones with 'Active' column not NaN, and that are NOT in the test set
573
- active_df = protac_df[protac_df[active_col].notna()]
574
- train_val_df = active_df[~active_df.index.isin(test_df.index)].copy()
575
-
576
- # Make three groups for CV:
577
- # * Random split
578
- # * Split by Uniprot (POI)
579
- # * Split by least tanimoto similarity PROTAC-wise
580
- groups = [
581
- 'random',
582
- 'uniprot',
583
- 'tanimoto',
584
- ]
585
- for group_type in groups:
586
- if group_type == 'random':
587
- kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
588
- groups = None
589
- elif group_type == 'uniprot':
590
- # Split by Uniprot
591
- kf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
592
- encoder = OrdinalEncoder()
593
- groups = encoder.fit_transform(train_val_df['Uniprot'].values.reshape(-1, 1))
594
- print(f'Number of unique groups: {len(encoder.categories_[0])}')
595
- elif group_type == 'tanimoto':
596
- # Split by Tanimoto similarity, i.e., group PROTACs with similar Avg Tanimoto
597
- kf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
598
- tanimoto_groups = pd.cut(train_val_df['Avg Tanimoto'], bins=n_bins_tanimoto).copy()
599
- encoder = OrdinalEncoder()
600
- groups = encoder.fit_transform(tanimoto_groups.values.reshape(-1, 1))
601
- print(f'Number of unique groups: {len(encoder.categories_[0])}')
602
-
603
-
604
- X = train_val_df.drop(columns=active_col)
605
- y = train_val_df[active_col].tolist()
606
-
607
- # print(f'Group: {group_type}')
608
- # fig, ax = plt.subplots(figsize=(6, 3))
609
- # plot_cv_indices(kf, X=X, y=y, group=groups, ax=ax, n_splits=n_splits)
610
- # plt.tight_layout()
611
- # plt.show()
612
-
613
- stats = []
614
- for k, (train_index, val_index) in enumerate(kf.split(X, y, groups)):
615
- train_df = train_val_df.iloc[train_index]
616
- val_df = train_val_df.iloc[val_index]
617
- stat = {
618
- 'fold': k,
619
- 'train_len': len(train_df),
620
- 'val_len': len(val_df),
621
- 'train_perc': len(train_df) / len(train_val_df),
622
- 'val_perc': len(val_df) / len(train_val_df),
623
- 'train_active (%)': train_df[active_col].sum() / len(train_df) * 100,
624
- 'train_inactive (%)': (len(train_df) - train_df[active_col].sum()) / len(train_df) * 100,
625
- 'val_active (%)': val_df[active_col].sum() / len(val_df) * 100,
626
- 'val_inactive (%)': (len(val_df) - val_df[active_col].sum()) / len(val_df) * 100,
627
- 'num_leaking_uniprot': len(set(train_df['Uniprot']).intersection(set(val_df['Uniprot']))),
628
- 'num_leaking_smiles': len(set(train_df['Smiles']).intersection(set(val_df['Smiles']))),
629
- }
630
- if group_type != 'random':
631
- stat['train_unique_groups'] = len(np.unique(groups[train_index]))
632
- stat['val_unique_groups'] = len(np.unique(groups[val_index]))
633
- stats.append(stat)
634
- print('-' * 120)
635
-
636
- # ### Run CV
637
-
638
- import warnings
639
-
640
- # Seed everything in pytorch lightning
641
- pl.seed_everything(42)
642
-
643
 
644
  def train_model(
645
  train_df,
@@ -650,8 +435,9 @@ def train_model(
650
  learning_rate=2e-5,
651
  max_epochs=50,
652
  smiles_emb_dim=1024,
 
653
  smote_k_neighbors=5,
654
- use_ored_activity=False if active_col == 'Active' else True,
655
  fast_dev_run=False,
656
  use_logger=True,
657
  logger_name='protac',
@@ -669,7 +455,7 @@ def train_model(
669
  max_epochs (int): The maximum number of epochs.
670
  smiles_emb_dim (int): The dimension of the SMILES embeddings.
671
  smote_k_neighbors (int): The number of neighbors for the SMOTE oversampler.
672
- use_ored_activity (bool): Whether to use the ORED activity column.
673
  fast_dev_run (bool): Whether to run a fast development run.
674
  disabled_embeddings (list): The list of disabled embeddings.
675
 
@@ -737,6 +523,7 @@ def train_model(
737
  cell_emb_dim=768,
738
  batch_size=batch_size,
739
  learning_rate=learning_rate,
 
740
  train_dataset=train_ds,
741
  val_dataset=val_ds,
742
  test_dataset=test_ds if test_df is not None else None,
@@ -763,12 +550,15 @@ def objective(
763
  max_epochs_options,
764
  smote_k_neighbors_options,
765
  fast_dev_run=False,
 
 
766
  ) -> float:
767
  # Generate the hyperparameters
768
  hidden_dim = trial.suggest_categorical('hidden_dim', hidden_dim_options)
769
  batch_size = trial.suggest_categorical('batch_size', batch_size_options)
770
  learning_rate = trial.suggest_float('learning_rate', *learning_rate_options, log=True)
771
  max_epochs = trial.suggest_categorical('max_epochs', max_epochs_options)
 
772
  smote_k_neighbors = trial.suggest_categorical('smote_k_neighbors', smote_k_neighbors_options)
773
 
774
  # Train the model with the current set of hyperparameters
@@ -777,11 +567,14 @@ def objective(
777
  val_df,
778
  hidden_dim=hidden_dim,
779
  batch_size=batch_size,
 
780
  learning_rate=learning_rate,
781
  max_epochs=max_epochs,
782
  smote_k_neighbors=smote_k_neighbors,
783
  use_logger=False,
784
  fast_dev_run=fast_dev_run,
 
 
785
  )
786
 
787
  # Metrics is a dictionary containing at least the validation loss
@@ -800,6 +593,8 @@ def hyperparameter_tuning_and_training(
800
  fast_dev_run=False,
801
  n_trials=20,
802
  logger_name='protac_hparam_search',
 
 
803
  ) -> tuple:
804
  """ Hyperparameter tuning and training of a PROTAC model.
805
 
@@ -819,9 +614,13 @@ def hyperparameter_tuning_and_training(
819
  max_epochs_options = [10, 20, 50]
820
  smote_k_neighbors_options = list(range(3, 16))
821
 
 
 
822
  # Create an Optuna study object
823
- study = optuna.create_study(direction='minimize')
824
- study.optimize(lambda trial: objective(
 
 
825
  trial,
826
  train_df,
827
  val_df,
@@ -830,117 +629,186 @@ def hyperparameter_tuning_and_training(
830
  learning_rate_options,
831
  max_epochs_options,
832
  smote_k_neighbors_options=smote_k_neighbors_options,
833
- fast_dev_run=fast_dev_run,),
834
  n_trials=n_trials,
835
  )
836
 
837
- # Retrieve the best hyperparameters
838
- best_params = study.best_params
839
- best_hidden_dim = best_params['hidden_dim']
840
- best_batch_size = best_params['batch_size']
841
- best_learning_rate = best_params['learning_rate']
842
- best_max_epochs = best_params['max_epochs']
843
- best_smote_k_neighbors = best_params['smote_k_neighbors']
844
-
845
  # Retrain the model with the best hyperparameters
846
  model, trainer, metrics = train_model(
847
  train_df,
848
  val_df,
849
  test_df,
850
- hidden_dim=best_hidden_dim,
851
- batch_size=best_batch_size,
852
- learning_rate=best_learning_rate,
853
- max_epochs=best_max_epochs,
854
  use_logger=True,
855
  logger_name=logger_name,
856
  fast_dev_run=fast_dev_run,
857
  )
858
 
859
  # Report the best hyperparameters found
860
- metrics['hidden_dim'] = best_hidden_dim
861
- metrics['batch_size'] = best_batch_size
862
- metrics['learning_rate'] = best_learning_rate
863
- metrics['max_epochs'] = best_max_epochs
864
- metrics['smote_k_neighbors'] = best_smote_k_neighbors
865
 
866
  # Return the best metrics
867
  return model, trainer, metrics
868
 
869
- # Example usage
870
- # train_df, val_df, test_df = load_your_data() # You need to load your datasets here
871
- # model, trainer, best_metrics = hyperparameter_tuning_and_training(train_df, val_df, test_df)
872
-
873
- # Loop over the different splits and train the model:
874
- active_name = active_col.replace(' ', '').lower()
875
- active_name = 'active-and' if active_name == 'active' else active_name
876
- n_splits = 5
877
-
878
- report = []
879
- active_df = protac_df[protac_df[active_col].notna()]
880
- train_val_df = active_df[~active_df.index.isin(unique_samples)]
881
-
882
- # Make directory ../reports if it does not exist
883
- if not os.path.exists('../reports'):
884
- os.makedirs('../reports')
885
-
886
- for group_type in ['random', 'uniprot', 'tanimoto']:
887
- print(f'Starting CV for group type: {group_type}')
888
- # Setup CV iterator and groups
889
- if group_type == 'random':
890
- kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
891
- groups = None
892
- elif group_type == 'uniprot':
893
- # Split by Uniprot
894
- kf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
895
- encoder = OrdinalEncoder()
896
- groups = encoder.fit_transform(train_val_df['Uniprot'].values.reshape(-1, 1))
897
- elif group_type == 'tanimoto':
898
- # Split by tanimoto similarity, i.e., group_type PROTACs with similar Avg Tanimoto
899
- kf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
900
- tanimoto_groups = pd.cut(train_val_df['Avg Tanimoto'], bins=n_bins_tanimoto).copy()
901
- encoder = OrdinalEncoder()
902
- groups = encoder.fit_transform(tanimoto_groups.values.reshape(-1, 1))
903
- # Start the CV over the folds
904
- X = train_val_df.drop(columns=active_col)
905
- y = train_val_df[active_col].tolist()
906
- for k, (train_index, val_index) in enumerate(kf.split(X, y, groups)):
907
- train_df = train_val_df.iloc[train_index]
908
- val_df = train_val_df.iloc[val_index]
909
- stats = {
910
- 'fold': k,
911
- 'group_type': group_type,
912
- 'train_len': len(train_df),
913
- 'val_len': len(val_df),
914
- 'train_perc': len(train_df) / len(train_val_df),
915
- 'val_perc': len(val_df) / len(train_val_df),
916
- 'train_active_perc': train_df[active_col].sum() / len(train_df),
917
- 'train_inactive_perc': (len(train_df) - train_df[active_col].sum()) / len(train_df),
918
- 'val_active_perc': val_df[active_col].sum() / len(val_df),
919
- 'val_inactive_perc': (len(val_df) - val_df[active_col].sum()) / len(val_df),
920
- 'test_active_perc': test_df[active_col].sum() / len(test_df),
921
- 'test_inactive_perc': (len(test_df) - test_df[active_col].sum()) / len(test_df),
922
- 'num_leaking_uniprot': len(set(train_df['Uniprot']).intersection(set(val_df['Uniprot']))),
923
- 'num_leaking_smiles': len(set(train_df['Smiles']).intersection(set(val_df['Smiles']))),
924
- }
925
- if group_type != 'random':
926
- stats['train_unique_groups'] = len(np.unique(groups[train_index]))
927
- stats['val_unique_groups'] = len(np.unique(groups[val_index]))
928
- # Train and evaluate the model
929
- # model, trainer, metrics = train_model(train_df, val_df, test_df)
930
- model, trainer, metrics = hyperparameter_tuning_and_training(
931
- train_df,
932
- val_df,
933
- test_df,
934
- fast_dev_run=False,
935
- n_trials=50,
936
- logger_name=f'protac_{active_name}_{group_type}_fold_{k}',
937
- )
938
- stats.update(metrics)
939
- del model
940
- del trainer
941
- report.append(stats)
942
- report = pd.DataFrame(report)
943
- report.to_csv(
944
- f'../reports/cv_report_hparam_search_{n_splits}-splits_{active_name}.csv',
945
- index=False,
946
- )
1
  import optuna
2
+ from optuna.samplers import TPESampler
3
+ import h5py
4
+ import os
5
+ import pickle
6
+ import warnings
7
+ import logging
8
  import pandas as pd
9
+ import numpy as np
10
+ import urllib.request
11
+
12
  from rdkit import Chem
13
  from rdkit.Chem import AllChem
14
  from rdkit import DataStructs
15
  from collections import defaultdict
16
+ from typing import Literal
17
+ from jsonargparse import CLI
 
18
  from tqdm.auto import tqdm
19
  from imblearn.over_sampling import SMOTE, ADASYN
20
+ from sklearn.preprocessing import OrdinalEncoder, StandardScaler, LabelEncoder
21
+ from sklearn.model_selection import (
22
+ StratifiedKFold,
23
+ StratifiedGroupKFold,
24
+ )
25
 
 
26
  import torch
27
  import torch.nn as nn
28
  import torch.nn.functional as F
29
  import torch.optim as optim
30
  import pytorch_lightning as pl
31
+ from torch.utils.data import Dataset, DataLoader
32
  from torchmetrics import (
33
  Accuracy,
34
  AUROC,
 
38
  )
39
  from torchmetrics import MetricCollection
40
 
 
41
 
42
+ # Ignore UserWarning from Matplotlib
43
+ warnings.filterwarnings("ignore", ".*FixedLocator*")
44
+ # Ignore UserWarning from PyTorch Lightning
45
+ warnings.filterwarnings("ignore", ".*does not have many workers.*")
 
46
 
47
 
48
  protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
 
69
  # Protein embeddings downloaded from [Uniprot](https://www.uniprot.org/help/embeddings).
70
  #
71
  # Please note that running the following cell the first time might take a while.
 
 
72
  download_link = "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/embeddings/UP000005640_9606/per-protein.h5"
73
  embeddings_path = "../data/uniprot2embedding.h5"
74
  if not os.path.exists(embeddings_path):
 
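# NOTE (sketch, not part of this commit): the body of the download branch is
# elided by the diff. With urllib.request imported above, it plausibly reduces
# to something like the following (hypothetical reconstruction):
#
#   if not os.path.exists(embeddings_path):
#       print(f'Downloading protein embeddings from {download_link}')
#       urllib.request.urlretrieve(download_link, embeddings_path)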
78
 
79
  protein_embeddings = {}
80
  with h5py.File("../data/uniprot2embedding.h5", "r") as file:
 
81
  uniprots = protac_df['Uniprot'].unique().tolist()
82
  uniprots += protac_df['E3 Ligase Uniprot'].unique().tolist()
83
  for i, sequence_id in tqdm(enumerate(uniprots), desc='Loading protein embeddings'):
84
  try:
85
  embedding = file[sequence_id][:]
86
  protein_embeddings[sequence_id] = np.array(embedding)
87
  except KeyError:
88
  print(f'KeyError for {sequence_id}')
89
  protein_embeddings[sequence_id] = np.zeros((1024,))
90
 
91
+ ## Load Cell Embeddings
 
 
92
  cell2embedding_filepath = '../data/cell2embedding.pkl'
93
  with open(cell2embedding_filepath, 'rb') as f:
94
  cell2embedding = pickle.load(f)
 
100
  if cell_line not in cell2embedding:
101
  cell2embedding[cell_line] = np.zeros(emb_shape)
102
 
103
+ ## Precompute Molecular Fingerprints
 
104
  morgan_fpgen = AllChem.GetMorganGenerator(
105
  radius=15,
106
  fpSize=1024,
 
128
  print(f'Number of overlapping SMILES in protac_df: {len(protac_df[protac_df["Smiles"].isin(overlapping_smiles)])}')
129
 
130
  # Get the pair-wise tanimoto similarity between the PROTAC fingerprints
 
131
  tanimoto_matrix = defaultdict(list)
132
  for i, smiles1 in enumerate(tqdm(protac_df['Smiles'].unique(), desc='Computing Tanimoto similarity')):
133
  fp1 = smiles2fp[smiles1]
 
143
 
144
  smiles2fp = {s: np.array(fp) for s, fp in smiles2fp.items()}
145
 
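# NOTE (sketch, not part of this commit): the diff elides the body of the
# Tanimoto loop above (new-file lines 134-142). Assuming that, at the point of
# the loop, smiles2fp still maps SMILES to RDKit bit vectors (the np.array
# conversion happens only afterwards), the pairwise similarities and the
# 'Avg Tanimoto' column used later for grouping could be filled in like this:
#
#   unique_smiles_list = protac_df['Smiles'].unique().tolist()
#   for i, smiles1 in enumerate(unique_smiles_list):
#       fp1 = smiles2fp[smiles1]
#       for smiles2 in unique_smiles_list[i + 1:]:
#           sim = DataStructs.TanimotoSimilarity(fp1, smiles2fp[smiles2])
#           tanimoto_matrix[smiles1].append(sim)
#           tanimoto_matrix[smiles2].append(sim)
#   avg_tanimoto = {s: np.mean(sims) for s, sims in tanimoto_matrix.items()}
#   protac_df['Avg Tanimoto'] = protac_df['Smiles'].map(avg_tanimoto)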
146
 
147
  class PROTAC_Dataset(Dataset):
148
  def __init__(
 
254
  }
255
  return elem
256
 
 
 
257
 
258
  class PROTAC_Model(pl.LightningModule):
259
 
260
  def __init__(
261
  self,
262
+ hidden_dim: int,
263
+ smiles_emb_dim: int = 1024,
264
+ poi_emb_dim: int = 1024,
265
+ e3_emb_dim: int = 1024,
266
+ cell_emb_dim: int = 768,
267
+ batch_size: int = 32,
268
+ learning_rate: float = 1e-3,
269
+ dropout: float = 0.2,
270
+ join_embeddings: Literal['concat', 'sum'] = 'concat',
271
+ train_dataset: PROTAC_Dataset = None,
272
+ val_dataset: PROTAC_Dataset = None,
273
+ test_dataset: PROTAC_Dataset = None,
274
+ disabled_embeddings: list = [],
275
  ):
276
  super().__init__()
277
  self.poi_emb_dim = poi_emb_dim
 
281
  self.hidden_dim = hidden_dim
282
  self.batch_size = batch_size
283
  self.learning_rate = learning_rate
284
+ self.join_embeddings = join_embeddings
285
  self.train_dataset = train_dataset
286
  self.val_dataset = val_dataset
287
  self.test_dataset = test_dataset
 
298
 
299
  if 'poi' not in self.disabled_embeddings:
300
  self.poi_emb = nn.Linear(poi_emb_dim, hidden_dim)
301
  if 'e3' not in self.disabled_embeddings:
302
  self.e3_emb = nn.Linear(e3_emb_dim, hidden_dim)
303
  if 'cell' not in self.disabled_embeddings:
304
  self.cell_emb = nn.Linear(cell_emb_dim, hidden_dim)
305
  if 'smiles' not in self.disabled_embeddings:
306
  self.smiles_emb = nn.Linear(smiles_emb_dim, hidden_dim)
307
+
308
+ if self.join_embeddings == 'concat':
309
+ joint_dim = hidden_dim * (4 - len(self.disabled_embeddings))
310
+ elif self.join_embeddings == 'sum':
311
+ joint_dim = hidden_dim
312
+ self.fc1 = nn.Linear(joint_dim, hidden_dim)
313
  self.fc2 = nn.Linear(hidden_dim, hidden_dim)
314
  self.fc3 = nn.Linear(hidden_dim, 1)
315
 
 
344
  embeddings.append(self.cell_emb(cell_emb))
345
  if 'smiles' not in self.disabled_embeddings:
346
  embeddings.append(self.smiles_emb(smiles_emb))
347
+ if self.join_embeddings == 'concat':
348
+ x = torch.cat(embeddings, dim=1)
349
+ elif self.join_embeddings == 'sum':
350
+ if len(embeddings) > 1:
351
+ embeddings = torch.stack(embeddings, dim=1)
352
+ x = torch.sum(embeddings, dim=1)
353
+ else:
354
+ x = embeddings[0]
355
+ x = self.dropout(F.relu(self.fc1(x)))
356
+ x = self.dropout(F.relu(self.fc2(x)))
357
  x = self.fc3(x)
358
  return x
359
 
 
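# NOTE (illustration, not part of this commit): a quick, standalone shape check
# of the two join_embeddings modes with hypothetical sizes (batch=2,
# hidden_dim=8, all four embedding branches enabled), matching the joint_dim
# logic in __init__:
import torch
_embs = [torch.randn(2, 8) for _ in range(4)]
_concat = torch.cat(_embs, dim=1)               # shape (2, 32): fc1 in_features = hidden_dim * 4
_summed = torch.stack(_embs, dim=1).sum(dim=1)  # shape (2, 8):  fc1 in_features = hidden_dim
assert _concat.shape == (2, 32) and _summed.shape == (2, 8)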
425
  shuffle=False,
426
  )
427
 
428
 
429
  def train_model(
430
  train_df,
 
435
  learning_rate=2e-5,
436
  max_epochs=50,
437
  smiles_emb_dim=1024,
438
+ join_embeddings='concat',
439
  smote_k_neighbors=5,
440
+ use_ored_activity=True,
441
  fast_dev_run=False,
442
  use_logger=True,
443
  logger_name='protac',
 
455
  max_epochs (int): The maximum number of epochs.
456
  smiles_emb_dim (int): The dimension of the SMILES embeddings.
457
  smote_k_neighbors (int): The number of neighbors for the SMOTE oversampler.
458
+ use_ored_activity (bool): Whether to use the ORED activity, i.e., the "Active - OR" column.
459
  fast_dev_run (bool): Whether to run a fast development run.
460
  disabled_embeddings (list): The list of disabled embeddings.
461
 
 
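# NOTE (illustration, not part of this commit): the hunks above only show the
# smote_k_neighbors argument, not where it is applied. As a standalone reminder
# of what the parameter controls, a minimal SMOTE example on a hypothetical
# feature matrix (the committed code may wire this up differently):
import numpy as np
from imblearn.over_sampling import SMOTE
X_demo = np.random.rand(100, 16)                # 100 hypothetical samples, 16 features
y_demo = np.array([0] * 80 + [1] * 20)          # imbalanced binary labels
X_res, y_res = SMOTE(k_neighbors=5, random_state=42).fit_resample(X_demo, y_demo)
print(X_res.shape, np.bincount(y_res))          # minority class oversampled to 80/80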
523
  cell_emb_dim=768,
524
  batch_size=batch_size,
525
  learning_rate=learning_rate,
526
+ join_embeddings=join_embeddings,
527
  train_dataset=train_ds,
528
  val_dataset=val_ds,
529
  test_dataset=test_ds if test_df is not None else None,
 
550
  max_epochs_options,
551
  smote_k_neighbors_options,
552
  fast_dev_run=False,
553
+ use_ored_activity=True,
554
+ disabled_embeddings=[],
555
  ) -> float:
556
  # Generate the hyperparameters
557
  hidden_dim = trial.suggest_categorical('hidden_dim', hidden_dim_options)
558
  batch_size = trial.suggest_categorical('batch_size', batch_size_options)
559
  learning_rate = trial.suggest_float('learning_rate', *learning_rate_options, log=True)
560
  max_epochs = trial.suggest_categorical('max_epochs', max_epochs_options)
561
+ join_embeddings = trial.suggest_categorical('join_embeddings', ['concat', 'sum'])
562
  smote_k_neighbors = trial.suggest_categorical('smote_k_neighbors', smote_k_neighbors_options)
563
 
564
  # Train the model with the current set of hyperparameters
 
567
  val_df,
568
  hidden_dim=hidden_dim,
569
  batch_size=batch_size,
570
+ join_embeddings=join_embeddings,
571
  learning_rate=learning_rate,
572
  max_epochs=max_epochs,
573
  smote_k_neighbors=smote_k_neighbors,
574
  use_logger=False,
575
  fast_dev_run=fast_dev_run,
576
+ use_ored_activity=use_ored_activity,
577
+ disabled_embeddings=disabled_embeddings,
578
  )
579
 
580
  # Metrics is a dictionary containing at least the validation loss
 
593
  fast_dev_run=False,
594
  n_trials=20,
595
  logger_name='protac_hparam_search',
596
+ use_ored_activity=True,
597
+ disabled_embeddings=[],
598
  ) -> tuple:
599
  """ Hyperparameter tuning and training of a PROTAC model.
600
 
 
614
  max_epochs_options = [10, 20, 50]
615
  smote_k_neighbors_options = list(range(3, 16))
616
 
617
+ # Set the verbosity of Optuna
618
+ optuna.logging.set_verbosity(optuna.logging.WARNING)
619
  # Create an Optuna study object
620
+ sampler = TPESampler(seed=42, multivariate=True)
621
+ study = optuna.create_study(direction='minimize', sampler=sampler)
622
+ study.optimize(
623
+ lambda trial: objective(
624
  trial,
625
  train_df,
626
  val_df,
 
629
  learning_rate_options,
630
  max_epochs_options,
631
  smote_k_neighbors_options=smote_k_neighbors_options,
632
+ fast_dev_run=fast_dev_run,
633
+ use_ored_activity=use_ored_activity,
634
+ disabled_embeddings=disabled_embeddings,
635
+ ),
636
  n_trials=n_trials,
637
  )
638
 
639
  # Retrain the model with the best hyperparameters
640
  model, trainer, metrics = train_model(
641
  train_df,
642
  val_df,
643
  test_df,
644
  use_logger=True,
645
  logger_name=logger_name,
646
  fast_dev_run=fast_dev_run,
647
+ use_ored_activity=use_ored_activity,
648
+ disabled_embeddings=disabled_embeddings,
649
+ **study.best_params,
650
  )
651
 
652
  # Report the best hyperparameters found
653
+ metrics.update({f'hparam_{k}': v for k, v in study.best_params.items()})
654
 
655
  # Return the best metrics
656
  return model, trainer, metrics
657
 
658
+
659
+ def main(
660
+ use_ored_activity: bool = True,
661
+ n_trials: int = 50,
662
+ n_splits: int = 5,
663
+ fast_dev_run: bool = False,
664
+ ):
665
+ """ Train a PROTAC model using the given datasets and hyperparameters.
666
+
667
+ Args:
668
+ use_ored_activity (bool): Whether to use the 'Active - OR' column.
669
+ n_trials (int): The number of hyperparameter optimization trials.
670
+ n_splits (int): The number of cross-validation splits.
671
+ fast_dev_run (bool): Whether to run a fast development run.
672
+ """
673
+ ## Set the Column to Predict
674
+ active_col = 'Active - OR' if use_ored_activity else 'Active'
675
+ active_name = active_col.replace(' ', '').lower()
676
+ active_name = 'active-and' if active_name == 'active' else active_name
677
+
678
+ ## Test Sets
679
+
680
+ active_df = protac_df[protac_df[active_col].notna()]
681
+ # Before starting any training, we isolate a small group of test data. Each element in the test set is selected so that all the following conditions are met:
682
+ # * its SMILES appears only once in the dataframe
683
+ # * its Uniprot appears only once in the dataframe
684
+ # * its (Smiles, Uniprot) pair appears only once in the dataframe
685
+ unique_smiles = active_df['Smiles'].value_counts() == 1
686
+ unique_uniprot = active_df['Uniprot'].value_counts() == 1
687
+ unique_smiles_uniprot = active_df.groupby(['Smiles', 'Uniprot']).size() == 1
688
+
689
+ # Get the indices of the unique samples
690
+ unique_smiles_idx = active_df['Smiles'].map(unique_smiles)
691
+ unique_uniprot_idx = active_df['Uniprot'].map(unique_uniprot)
692
+ unique_smiles_uniprot_idx = active_df.set_index(['Smiles', 'Uniprot']).index.map(unique_smiles_uniprot)
693
+
694
+ # Cross the indices to get the unique samples
695
+ unique_samples = active_df[unique_smiles_idx & unique_uniprot_idx & unique_smiles_uniprot_idx].index
696
+ test_df = active_df.loc[unique_samples]
697
+ train_val_df = active_df[~active_df.index.isin(unique_samples)]
698
+
699
+ ## Cross-Validation Training
700
+
701
+ # Cross-validation training with n_splits folds (5 by default). The split operation is done in three different ways:
702
+ #
703
+ # * Random split
704
+ # * POI-wise: a given POI never appears in both the train and validation splits
705
+ # * Least Tanimoto similarity PROTAC-wise
706
+
707
+ # NOTE: When n_bins_tanimoto is set to 60, it results in 29 groups, with nice distributions of
708
+ # the number of unique groups in the train and validation sets, together with
709
+ # the number of active and inactive PROTACs.
710
+ n_bins_tanimoto = 60 if active_col == 'Active' else 400
711
+
712
+ # Make directory ../reports if it does not exist
713
+ if not os.path.exists('../reports'):
714
+ os.makedirs('../reports')
715
+
716
+ # Seed everything in pytorch lightning
717
+ pl.seed_everything(42)
718
+
719
+ # Loop over the different splits and train the model:
720
+ report = []
721
+ for group_type in ['random', 'uniprot', 'tanimoto']:
722
+ print('-' * 100)
723
+ print(f'Starting CV for group type: {group_type}')
724
+ print('-' * 100)
725
+ # Setup CV iterator and groups
726
+ if group_type == 'random':
727
+ kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
728
+ groups = None
729
+ elif group_type == 'uniprot':
730
+ # Split by Uniprot
731
+ kf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
732
+ encoder = OrdinalEncoder()
733
+ groups = encoder.fit_transform(train_val_df['Uniprot'].values.reshape(-1, 1))
734
+ elif group_type == 'tanimoto':
735
+ # Split by tanimoto similarity, i.e., group_type PROTACs with similar Avg Tanimoto
736
+ kf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
737
+ tanimoto_groups = pd.cut(train_val_df['Avg Tanimoto'], bins=n_bins_tanimoto).copy()
738
+ encoder = OrdinalEncoder()
739
+ groups = encoder.fit_transform(tanimoto_groups.values.reshape(-1, 1))
740
+ # Start the CV over the folds
741
+ X = train_val_df.drop(columns=active_col)
742
+ y = train_val_df[active_col].tolist()
743
+ for k, (train_index, val_index) in enumerate(kf.split(X, y, groups)):
744
+ print('-' * 100)
745
+ print(f'Starting CV for group type: {group_type}, fold: {k}')
746
+ print('-' * 100)
747
+ train_df = train_val_df.iloc[train_index]
748
+ val_df = train_val_df.iloc[val_index]
749
+ stats = {
750
+ 'fold': k,
751
+ 'group_type': group_type,
752
+ 'train_len': len(train_df),
753
+ 'val_len': len(val_df),
754
+ 'train_perc': len(train_df) / len(train_val_df),
755
+ 'val_perc': len(val_df) / len(train_val_df),
756
+ 'train_active_perc': train_df[active_col].sum() / len(train_df),
757
+ 'train_inactive_perc': (len(train_df) - train_df[active_col].sum()) / len(train_df),
758
+ 'val_active_perc': val_df[active_col].sum() / len(val_df),
759
+ 'val_inactive_perc': (len(val_df) - val_df[active_col].sum()) / len(val_df),
760
+ 'test_active_perc': test_df[active_col].sum() / len(test_df),
761
+ 'test_inactive_perc': (len(test_df) - test_df[active_col].sum()) / len(test_df),
762
+ 'num_leaking_uniprot': len(set(train_df['Uniprot']).intersection(set(val_df['Uniprot']))),
763
+ 'num_leaking_smiles': len(set(train_df['Smiles']).intersection(set(val_df['Smiles']))),
764
+ 'disabled_embeddings': np.nan,
765
+ }
766
+ if group_type != 'random':
767
+ stats['train_unique_groups'] = len(np.unique(groups[train_index]))
768
+ stats['val_unique_groups'] = len(np.unique(groups[val_index]))
769
+ # Train and evaluate the model
770
+ model, trainer, metrics = hyperparameter_tuning_and_training(
771
+ train_df,
772
+ val_df,
773
+ test_df,
774
+ fast_dev_run=fast_dev_run,
775
+ n_trials=n_trials,
776
+ logger_name=f'protac_{active_name}_{group_type}_fold_{k}',
777
+ use_ored_activity=use_ored_activity,
778
+ )
779
+ hparams = {p.replace('hparam_', '', 1): v for p, v in metrics.items() if p.startswith('hparam_')}
780
+ stats.update(metrics)
781
+ report.append(stats.copy())
782
+ del model
783
+ del trainer
784
+
785
+ # Ablation study: disable embeddings at a time
786
+ for disabled_embeddings in [['poi'], ['cell'], ['smiles'], ['e3', 'cell'], ['poi', 'e3', 'cell']]:
787
+ print('-' * 100)
788
+ print(f'Ablation study with disabled embeddings: {disabled_embeddings}')
789
+ print('-' * 100)
790
+ stats['disabled_embeddings'] = 'disabled ' + ' '.join(disabled_embeddings)
791
+ model, trainer, metrics = train_model(
792
+ train_df,
793
+ val_df,
794
+ test_df,
795
+ fast_dev_run=fast_dev_run,
796
+ logger_name=f'protac_{active_name}_{group_type}_fold_{k}_disabled-{"-".join(disabled_embeddings)}',
797
+ use_ored_activity=use_ored_activity,
798
+ disabled_embeddings=disabled_embeddings,
799
+ **hparams,
800
+ )
801
+ stats.update(metrics)
802
+ report.append(stats.copy())
803
+ del model
804
+ del trainer
805
+
806
+ report = pd.DataFrame(report)
807
+ report.to_csv(
808
+ f'../reports/cv_report_hparam_search_{n_splits}-splits_{active_name}.csv',
809
+ index=False,
810
+ )
811
+
812
+
813
+ if __name__ == '__main__':
814
+ cli = CLI(main)
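# NOTE (usage, not part of this commit): main() can also be called directly
# from Python, e.g.:
#
#   main(use_ored_activity=False, n_trials=20, n_splits=5, fast_dev_run=True)
#
# and, assuming jsonargparse's usual behavior of turning the signature of
# main() into command-line flags, a shell invocation would look roughly like:
#
#   python protac_degradation_predictor.py --n_trials 20 --n_splits 5 --fast_dev_run true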