ribesstefano committed on
Commit 251060c · 1 Parent(s): 1e811f2

Fixed issue with duplicates + Experiments now rely on predefined datasets + Added experiments on simple embeddings

protac_degradation_predictor/optuna_utils.py CHANGED
@@ -11,7 +11,7 @@ from .protac_dataset import get_datasets
11
 
12
  import torch
13
  import optuna
14
- from optuna.samplers import TPESampler
15
  import joblib
16
  import pandas as pd
17
  from sklearn.model_selection import (
@@ -117,8 +117,6 @@ def pytorch_model_objective(
117
  logger_save_dir: str = 'logs',
118
  logger_name: str = 'cv_model',
119
  enable_checkpointing: bool = False,
120
- use_cells_one_hot: bool = False,
121
- use_amino_acid_count: bool = False,
122
  ) -> float:
123
  """ Objective function for hyperparameter optimization.
124
 
@@ -135,17 +133,24 @@ def pytorch_model_objective(
135
  active_label (str): The active label column.
136
  disabled_embeddings (List[str]): The list of disabled embeddings.
137
  """
138
  # Suggest hyperparameters to be used across the CV folds
139
- hidden_dim = trial.suggest_categorical('hidden_dim', hidden_dim_options)
140
- batch_size = 128 # trial.suggest_categorical('batch_size', batch_size_options)
141
- learning_rate = trial.suggest_float('learning_rate', *learning_rate_options, log=True)
142
- smote_k_neighbors = trial.suggest_categorical('smote_k_neighbors', smote_k_neighbors_options)
143
- use_smote = trial.suggest_categorical('use_smote', [True, False])
144
- # if use_cells_one_hot or use_amino_acid_count:
145
- # use_smote = False
146
- apply_scaling = True # trial.suggest_categorical('apply_scaling', [True, False])
147
- dropout = trial.suggest_float('dropout', *dropout_options)
148
- use_batch_norm = trial.suggest_categorical('use_batch_norm', [True, False])
 
 
149
 
150
  # Start the CV over the folds
151
  X = train_val_df.copy().drop(columns=active_label)
@@ -185,12 +190,13 @@ def pytorch_model_objective(
185
  hidden_dim=hidden_dim,
186
  batch_size=batch_size,
187
  learning_rate=learning_rate,
188
- dropout=dropout,
 
 
189
  use_batch_norm=use_batch_norm,
190
  max_epochs=max_epochs,
191
  smote_k_neighbors=smote_k_neighbors,
192
  apply_scaling=apply_scaling,
193
- use_smote=use_smote,
194
  fast_dev_run=fast_dev_run,
195
  active_label=active_label,
196
  return_predictions=True,
@@ -224,18 +230,6 @@ def pytorch_model_objective(
224
 
225
  # Optuna aims to minimize the pytorch_model_objective
226
  return - val_roc_auc
227
- # # Get the majority vote for the test predictions
228
- # if test_df is not None and not fast_dev_run:
229
- # majority_vote_metrics = get_majority_vote_metrics(test_preds, test_df, active_label)
230
- # majority_vote_metrics.update(get_dataframe_stats(train_df, val_df, test_df, active_label))
231
- # trial.set_user_attr('majority_vote_metrics', majority_vote_metrics)
232
- # logging.info(f'Majority vote metrics: {majority_vote_metrics}')
233
-
234
- # # Get the average validation accuracy and ROC AUC accross the folds
235
- # val_roc_auc = np.mean([r['val_roc_auc'] for r in report])
236
-
237
- # # Optuna aims to minimize the pytorch_model_objective
238
- # return - val_roc_auc
239
 
240
 
241
  def hyperparameter_tuning_and_training(
@@ -256,8 +250,6 @@ def hyperparameter_tuning_and_training(
256
  max_epochs: int = 100,
257
  study_filename: Optional[str] = None,
258
  force_study: bool = False,
259
- use_cells_one_hot: bool = False,
260
- use_amino_acid_count: bool = False,
261
  ) -> tuple:
262
  """ Hyperparameter tuning and training of a PROTAC model.
263
 
@@ -285,10 +277,13 @@ def hyperparameter_tuning_and_training(
285
  """
286
  pl.seed_everything(42)
287
288
  # Define the search space
289
- hidden_dim_options = [16, 32, 64, 128, 256] #, 512]
290
  batch_size_options = [128, 128] # [4, 8, 16, 32, 64, 128]
291
- learning_rate_options = (1e-6, 1e-3) # min and max values for loguniform distribution
292
  smote_k_neighbors_options = list(range(3, 16))
293
  # NOTE: We want Optuna to explore the combination (very low dropout, very
294
  # small hidden_dim)
@@ -296,8 +291,10 @@ def hyperparameter_tuning_and_training(
296
 
297
  # Set the verbosity of Optuna
298
  optuna.logging.set_verbosity(optuna.logging.WARNING)
 
 
 
299
  # Create an Optuna study object
300
- sampler = TPESampler(seed=42, multivariate=True)
301
  study = optuna.create_study(direction='minimize', sampler=sampler)
302
 
303
  study_loaded = False
@@ -328,8 +325,6 @@ def hyperparameter_tuning_and_training(
328
  active_label=active_label,
329
  max_epochs=max_epochs,
330
  disabled_embeddings=[],
331
- use_cells_one_hot=use_cells_one_hot,
332
- use_amino_acid_count=use_amino_acid_count,
333
  ),
334
  n_trials=n_trials,
335
  )
@@ -360,10 +355,8 @@ def hyperparameter_tuning_and_training(
360
  disabled_embeddings=[],
361
  use_logger=True,
362
  logger_save_dir=logger_save_dir,
363
- logger_name=f'{logger_name}_{split_type}_cv_model',
364
  enable_checkpointing=True,
365
- use_cells_one_hot=use_cells_one_hot,
366
- use_amino_acid_count=use_amino_acid_count,
367
  )
368
 
369
  # Retrain N models with the best hyperparameters (measure model uncertainty)
@@ -385,12 +378,13 @@ def hyperparameter_tuning_and_training(
385
  disabled_embeddings=[],
386
  use_logger=True,
387
  logger_save_dir=logger_save_dir,
388
- logger_name=f'{logger_name}_best_model_n{i}',
389
  enable_checkpointing=True,
390
  checkpoint_model_name=f'best_model_n{i}_{split_type}',
391
  return_predictions=True,
392
  batch_size=128,
393
  apply_scaling=True,
 
394
  **study.best_params,
395
  )
396
  # Rename the keys in the metrics dictionary
 
11
 
12
  import torch
13
  import optuna
14
+ from optuna.samplers import TPESampler, QMCSampler
15
  import joblib
16
  import pandas as pd
17
  from sklearn.model_selection import (
 
117
  logger_save_dir: str = 'logs',
118
  logger_name: str = 'cv_model',
119
  enable_checkpointing: bool = False,
 
 
120
  ) -> float:
121
  """ Objective function for hyperparameter optimization.
122
 
 
133
  active_label (str): The active label column.
134
  disabled_embeddings (List[str]): The list of disabled embeddings.
135
  """
136
+ # Set fixed hyperparameters
137
+ batch_size = 128
138
+ apply_scaling = True # It is dynamically disabled for binary data
139
+ use_batch_norm = True
140
+
141
  # Suggest hyperparameters to be used across the CV folds
142
+ hidden_dim = trial.suggest_int('hidden_dim', 32, 512, step=32)
143
+ smote_k_neighbors = trial.suggest_int('smote_k_neighbors', 0, 12)
144
+ # hidden_dim = trial.suggest_categorical('hidden_dim', hidden_dim_options)
145
+ # smote_k_neighbors = trial.suggest_categorical('smote_k_neighbors', smote_k_neighbors_options)
146
+ # dropout = trial.suggest_float('dropout', *dropout_options)
147
+ # use_batch_norm = trial.suggest_categorical('use_batch_norm', [True, False])
148
+
149
+ # Optimizer parameters
150
+ learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-1, log=True)
151
+ beta1 = trial.suggest_float('beta1', 0.1, 0.999)
152
+ beta2 = trial.suggest_float('beta2', 0.1, 0.999)
153
+ eps = trial.suggest_float('eps', 1e-9, 1.0, log=True)
154
 
155
  # Start the CV over the folds
156
  X = train_val_df.copy().drop(columns=active_label)
 
190
  hidden_dim=hidden_dim,
191
  batch_size=batch_size,
192
  learning_rate=learning_rate,
193
+ beta1=beta1,
194
+ beta2=beta2,
195
+ eps=eps,
196
  use_batch_norm=use_batch_norm,
197
  max_epochs=max_epochs,
198
  smote_k_neighbors=smote_k_neighbors,
199
  apply_scaling=apply_scaling,
 
200
  fast_dev_run=fast_dev_run,
201
  active_label=active_label,
202
  return_predictions=True,
 
230
 
231
  # Optuna aims to minimize the pytorch_model_objective
232
  return - val_roc_auc
233
 
234
 
235
  def hyperparameter_tuning_and_training(
 
250
  max_epochs: int = 100,
251
  study_filename: Optional[str] = None,
252
  force_study: bool = False,
 
 
253
  ) -> tuple:
254
  """ Hyperparameter tuning and training of a PROTAC model.
255
 
 
277
  """
278
  pl.seed_everything(42)
279
 
280
+ # TODO: Make the following code more modular, i.e., the ranges shall be put
281
+ # in dictionaries or config files or something like that.
282
+
283
  # Define the search space
284
+ hidden_dim_options = [8, 16, 32, 64, 128, 256] #, 512]
285
  batch_size_options = [128, 128] # [4, 8, 16, 32, 64, 128]
286
+ learning_rate_options = (1e-6, 1e-1) # min and max values for loguniform distribution
287
  smote_k_neighbors_options = list(range(3, 16))
288
  # NOTE: We want Optuna to explore the combination (very low dropout, very
289
  # small hidden_dim)
 
291
 
292
  # Set the verbosity of Optuna
293
  optuna.logging.set_verbosity(optuna.logging.WARNING)
294
+ # Set a quasi-random sampler, as suggested in: https://github.com/google-research/tuning_playbook?tab=readme-ov-file#faqs
295
+ # sampler = TPESampler(seed=42, multivariate=True)
296
+ sampler = QMCSampler(qmc_type='halton', scramble=True, seed=42)
297
  # Create an Optuna study object
 
298
  study = optuna.create_study(direction='minimize', sampler=sampler)
299
 
300
  study_loaded = False
 
325
  active_label=active_label,
326
  max_epochs=max_epochs,
327
  disabled_embeddings=[],
 
 
328
  ),
329
  n_trials=n_trials,
330
  )
 
355
  disabled_embeddings=[],
356
  use_logger=True,
357
  logger_save_dir=logger_save_dir,
358
+ logger_name=f'cv_model_{logger_name}',
359
  enable_checkpointing=True,
 
 
360
  )
361
 
362
  # Retrain N models with the best hyperparameters (measure model uncertainty)
 
378
  disabled_embeddings=[],
379
  use_logger=True,
380
  logger_save_dir=logger_save_dir,
381
+ logger_name=f'best_model_n{i}_{logger_name}',
382
  enable_checkpointing=True,
383
  checkpoint_model_name=f'best_model_n{i}_{split_type}',
384
  return_predictions=True,
385
  batch_size=128,
386
  apply_scaling=True,
387
+ use_batch_norm=True,
388
  **study.best_params,
389
  )
390
  # Rename the keys in the metrics dictionary
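
Taken together, the optuna_utils.py changes fix batch size, scaling, and batch norm, tune hidden_dim and smote_k_neighbors as integers, expose Adam's beta1/beta2/eps to the search, and swap the TPE sampler for a scrambled Halton QMCSampler. A minimal, self-contained sketch of that setup (the objective body is a placeholder, not the project's cross-validation loop):

```python
import optuna
from optuna.samplers import QMCSampler

def objective(trial: optuna.Trial) -> float:
    # Same search space as the updated pytorch_model_objective
    hidden_dim = trial.suggest_int('hidden_dim', 32, 512, step=32)
    smote_k_neighbors = trial.suggest_int('smote_k_neighbors', 0, 12)
    learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-1, log=True)
    beta1 = trial.suggest_float('beta1', 0.1, 0.999)
    beta2 = trial.suggest_float('beta2', 0.1, 0.999)
    eps = trial.suggest_float('eps', 1e-9, 1.0, log=True)
    # Placeholder for the CV training loop: return -ROC AUC so that
    # minimizing the objective maximizes validation ROC AUC.
    val_roc_auc = 0.5
    return -val_roc_auc

# Quasi-random sampler, as suggested by the Deep Learning Tuning Playbook
sampler = QMCSampler(qmc_type='halton', scramble=True, seed=42)
study = optuna.create_study(direction='minimize', sampler=sampler)
study.optimize(objective, n_trials=50)
```
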
protac_degradation_predictor/protac_dataset.py CHANGED
@@ -319,7 +319,6 @@ def get_datasets(
319
  protein2embedding: Dict = None,
320
  cell2embedding: Dict = None,
321
  smiles2fp: Dict = None,
322
- use_smote: bool = True,
323
  smote_k_neighbors: int = 5,
324
  active_label: str = 'Active',
325
  disabled_embeddings: List[Literal['smiles', 'poi', 'e3', 'cell']] = [],
@@ -345,14 +344,17 @@ def get_datasets(
345
  use_single_scaler (bool): Whether to use a single scaler for all features.
346
  apply_scaling (bool): Whether to apply scaling to the data now. Defaults to False (the Pytorch Lightning model does that).
347
  """
348
- oversampler = SMOTE(k_neighbors=smote_k_neighbors, random_state=42)
 
 
 
349
  train_ds = PROTAC_Dataset(
350
  train_df,
351
  protein2embedding,
352
  cell2embedding,
353
  smiles2fp,
354
- use_smote=use_smote,
355
- oversampler=oversampler if use_smote else None,
356
  active_label=active_label,
357
  disabled_embeddings=disabled_embeddings,
358
  scaler=scaler,
 
319
  protein2embedding: Dict = None,
320
  cell2embedding: Dict = None,
321
  smiles2fp: Dict = None,
 
322
  smote_k_neighbors: int = 5,
323
  active_label: str = 'Active',
324
  disabled_embeddings: List[Literal['smiles', 'poi', 'e3', 'cell']] = [],
 
344
  use_single_scaler (bool): Whether to use a single scaler for all features.
345
  apply_scaling (bool): Whether to apply scaling to the data now. Defaults to False (the Pytorch Lightning model does that).
346
  """
347
+ if smote_k_neighbors:
348
+ oversampler = SMOTE(k_neighbors=smote_k_neighbors, random_state=42)
349
+ else:
350
+ oversampler = None
351
  train_ds = PROTAC_Dataset(
352
  train_df,
353
  protein2embedding,
354
  cell2embedding,
355
  smiles2fp,
356
+ use_smote=True if smote_k_neighbors else False,
357
+ oversampler=oversampler,
358
  active_label=active_label,
359
  disabled_embeddings=disabled_embeddings,
360
  scaler=scaler,
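
The get_datasets change above folds the old use_smote flag into smote_k_neighbors: a value of 0 (or None) now disables oversampling. A short sketch of that convention, assuming imbalanced-learn's SMOTE as in the diff:

```python
from imblearn.over_sampling import SMOTE

def make_oversampler(smote_k_neighbors: int = 5):
    # k_neighbors == 0 (or None) means "no oversampling at all"
    if smote_k_neighbors:
        return SMOTE(k_neighbors=smote_k_neighbors, random_state=42)
    return None
```

This is also why the Optuna objective can suggest smote_k_neighbors down to 0: the same integer both sizes and toggles SMOTE.
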
protac_degradation_predictor/pytorch_models.py CHANGED
@@ -171,6 +171,7 @@ class PROTAC_Model(pl.LightningModule):
171
  test_dataset: PROTAC_Dataset = None,
172
  disabled_embeddings: List[Literal['smiles', 'poi', 'e3', 'cell']] = [],
173
  apply_scaling: bool = True,
 
174
  ):
175
  """ Initialize the PROTAC Pytorch Lightning model.
176
 
@@ -189,6 +190,7 @@ class PROTAC_Model(pl.LightningModule):
189
  test_dataset (PROTAC_Dataset): The test dataset
190
  disabled_embeddings (list): List of disabled embeddings. Can be 'poi', 'e3', 'cell', 'smiles'
191
  apply_scaling (bool): Whether to apply scaling to the embeddings
 
192
  """
193
  super().__init__()
194
  # Set our init args as class attributes
@@ -328,15 +330,31 @@ class PROTAC_Model(pl.LightningModule):
328
  return self.step(batch, batch_idx, 'test')
329
 
330
  def configure_optimizers(self):
331
- optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
332
  return {
333
  'optimizer': optimizer,
334
- 'lr_scheduler': optim.lr_scheduler.ReduceLROnPlateau(
335
- optimizer=optimizer,
336
- mode='min',
337
- factor=0.1,
338
- patience=0,
339
- ),
340
  'interval': 'step', # or 'epoch'
341
  'frequency': 1,
342
  'monitor': 'val_loss',
@@ -411,12 +429,14 @@ def train_model(
411
  hidden_dim: int = 768,
412
  batch_size: int = 128,
413
  learning_rate: float = 2e-5,
 
 
 
414
  dropout: float = 0.2,
415
  max_epochs: int = 50,
416
  use_batch_norm: bool = False,
417
  join_embeddings: Literal['beginning', 'concat', 'sum'] = 'sum',
418
- smote_k_neighbors:int = 5,
419
- use_smote: bool = True,
420
  apply_scaling: bool = True,
421
  active_label: str = 'Active',
422
  fast_dev_run: bool = False,
@@ -468,7 +488,6 @@ def train_model(
468
  protein2embedding,
469
  cell2embedding,
470
  smiles2fp,
471
- use_smote=use_smote,
472
  smote_k_neighbors=smote_k_neighbors,
473
  active_label=active_label,
474
  disabled_embeddings=disabled_embeddings,
@@ -540,6 +559,10 @@ def train_model(
540
  devices=1,
541
  num_nodes=1,
542
  )
 
 
 
 
543
  model = PROTAC_Model(
544
  hidden_dim=hidden_dim,
545
  smiles_emb_dim=smiles_emb_dim,
@@ -556,6 +579,7 @@ def train_model(
556
  val_dataset=val_ds,
557
  test_dataset=test_ds if test_df is not None else None,
558
  disabled_embeddings=disabled_embeddings,
 
559
  )
560
  with warnings.catch_warnings():
561
  warnings.simplefilter("ignore")
 
171
  test_dataset: PROTAC_Dataset = None,
172
  disabled_embeddings: List[Literal['smiles', 'poi', 'e3', 'cell']] = [],
173
  apply_scaling: bool = True,
174
+ extra_optim_params: Optional[dict] = None,
175
  ):
176
  """ Initialize the PROTAC Pytorch Lightning model.
177
 
 
190
  test_dataset (PROTAC_Dataset): The test dataset
191
  disabled_embeddings (list): List of disabled embeddings. Can be 'poi', 'e3', 'cell', 'smiles'
192
  apply_scaling (bool): Whether to apply scaling to the embeddings
193
+ extra_optim_params (dict): Extra parameters for the optimizer
194
  """
195
  super().__init__()
196
  # Set our init args as class attributes
 
330
  return self.step(batch, batch_idx, 'test')
331
 
332
  def configure_optimizers(self):
333
+ # Define optimizer
334
+ if self.extra_optim_params is not None:
335
+ optimizer = optim.Adam(self.parameters(), lr=self.learning_rate, **self.extra_optim_params)
336
+ else:
337
+ optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
338
+ # Define LR scheduler
339
+ if self.trainer.max_epochs:
340
+ total_iters = self.trainer.max_epochs
341
+ elif self.trainer.max_steps:
342
+ total_iters = self.trainer.max_steps
343
+ else:
344
+ total_iters = 20
345
+ lr_scheduler = optim.lr_scheduler.LinearLR(
346
+ optimizer=optimizer,
347
+ total_iters=total_iters,
348
+ )
349
+ # lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
350
+ # optimizer=optimizer,
351
+ # mode='min',
352
+ # factor=0.01,
353
+ # patience=0,
354
+ # )
355
  return {
356
  'optimizer': optimizer,
357
+ 'lr_scheduler': lr_scheduler,
 
 
 
 
 
358
  'interval': 'step', # or 'epoch'
359
  'frequency': 1,
360
  'monitor': 'val_loss',
 
429
  hidden_dim: int = 768,
430
  batch_size: int = 128,
431
  learning_rate: float = 2e-5,
432
+ beta1: float = 0.9,
433
+ beta2: float = 0.999,
434
+ eps: float = 1e-8,
435
  dropout: float = 0.2,
436
  max_epochs: int = 50,
437
  use_batch_norm: bool = False,
438
  join_embeddings: Literal['beginning', 'concat', 'sum'] = 'sum',
439
+ smote_k_neighbors: int = 5,
 
440
  apply_scaling: bool = True,
441
  active_label: str = 'Active',
442
  fast_dev_run: bool = False,
 
488
  protein2embedding,
489
  cell2embedding,
490
  smiles2fp,
 
491
  smote_k_neighbors=smote_k_neighbors,
492
  active_label=active_label,
493
  disabled_embeddings=disabled_embeddings,
 
559
  devices=1,
560
  num_nodes=1,
561
  )
562
+ extra_optim_params = {
563
+ 'betas': (beta1, beta2),
564
+ 'eps': eps,
565
+ }
566
  model = PROTAC_Model(
567
  hidden_dim=hidden_dim,
568
  smiles_emb_dim=smiles_emb_dim,
 
579
  val_dataset=val_ds,
580
  test_dataset=test_ds if test_df is not None else None,
581
  disabled_embeddings=disabled_embeddings,
582
+ extra_optim_params=extra_optim_params,
583
  )
584
  with warnings.catch_warnings():
585
  warnings.simplefilter("ignore")
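
In pytorch_models.py the optimizer now receives the tuned Adam betas/eps through extra_optim_params, and the ReduceLROnPlateau scheduler is replaced by a LinearLR ramp sized from the trainer's max_epochs (or max_steps, falling back to 20). A simplified sketch of that logic outside the LightningModule:

```python
import torch
from torch import optim

def build_optimizer_and_scheduler(params, learning_rate, extra_optim_params=None,
                                  max_epochs=None, max_steps=None):
    if extra_optim_params:
        optimizer = optim.Adam(params, lr=learning_rate, **extra_optim_params)
    else:
        optimizer = optim.Adam(params, lr=learning_rate)
    # LinearLR ramps the learning rate from its default start factor up to
    # the full value over total_iters steps.
    total_iters = max_epochs or max_steps or 20
    scheduler = optim.lr_scheduler.LinearLR(optimizer, total_iters=total_iters)
    return optimizer, scheduler

model = torch.nn.Linear(4, 1)
optimizer, scheduler = build_optimizer_and_scheduler(
    model.parameters(), learning_rate=2e-5,
    extra_optim_params={'betas': (0.9, 0.999), 'eps': 1e-8},
    max_epochs=100,
)
```
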
src/get_studies_datasets.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  import sys
 
3
 
4
  sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
5
  import protac_degradation_predictor as pdp
@@ -10,6 +11,7 @@ import logging
10
  from typing import Literal
11
 
12
  from sklearn.preprocessing import OrdinalEncoder
 
13
  from tqdm import tqdm
14
  import pandas as pd
15
  import numpy as np
@@ -109,7 +111,7 @@ def get_tanimoto_split_indices(
109
  active_df: pd.DataFrame,
110
  active_col: str,
111
  test_split: float,
112
- n_bins_tanimoto: int = 200,
113
  ) -> pd.Index:
114
  """ Get the indices of the test set using the Tanimoto-based split.
115
 
@@ -154,9 +156,11 @@ def get_tanimoto_split_indices(
154
  test_df.append(group_df)
155
  continue
156
  # Be more selective and make sure that the percentage of active and
157
- # inactive is balanced
158
- if (num_active_group + num_active_test) / (num_entries_test + num_entries) < 0.6:
159
- if (num_inactive_group + num_inactive_test) / (num_entries_test + num_entries) < 0.6:
 
 
160
  test_df.append(group_df)
161
  test_df = pd.concat(test_df)
162
  return test_df.index
@@ -212,10 +216,130 @@ def get_target_split_indices(active_df: pd.DataFrame, active_col: str, test_spli
212
  return test_df.index
213
 
214
 
215
  def main(
216
  active_col: str = 'Active (Dmax 0.6, pDC50 6.0)',
217
  test_split: float = 0.1,
218
  studies: str | Literal['all', 'standard', 'e3_ligase', 'similarity', 'target'] = 'all',
 
219
  ):
220
  """ Get and save the datasets for the different studies.
221
 
@@ -237,49 +361,103 @@ def main(
237
  protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
238
  # Map E3 Ligase Iap to IAP
239
  protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
240
  protac_df[active_col] = protac_df.apply(
241
  lambda x: pdp.is_active(x['DC50 (nM)'], x['Dmax (%)'], pDC50_threshold=pDC50_threshold, Dmax_threshold=Dmax_threshold), axis=1
242
  )
 
 
243
  _, protac_df = get_smiles2fp_and_avg_tanimoto(protac_df)
244
 
245
  ## Get the test sets
246
  test_indeces = {}
247
  active_df = protac_df[protac_df[active_col].notna()].copy()
248
 
249
- # Remove legacy column 'Active - OR' if it exists
250
- if 'Active - OR' in active_df.columns:
251
- active_df.drop(columns='Active - OR', inplace=True)
252
-
253
  if studies == 'standard' or studies == 'all':
254
  test_indeces['standard'] = get_random_split_indices(active_df, test_split)
255
  if studies == 'target' or studies == 'all':
256
  test_indeces['target'] = get_target_split_indices(active_df, active_col, test_split)
257
- if studies == 'e3_ligase' or studies == 'all':
258
- test_indeces['e3_ligase'] = get_e3_ligase_split_indices(active_df)
259
  if studies == 'similarity' or studies == 'all':
260
  test_indeces['similarity'] = get_tanimoto_split_indices(active_df, active_col, test_split)
 
 
261
 
262
  # Make directory for studies datasets if it does not exist
263
  data_dir = '../data/studies'
264
  if not os.path.exists(data_dir):
265
  os.makedirs(data_dir)
266
 
267
- # Cross-Validation Training
268
- for split_type, indeces in test_indeces.items():
269
- test_df = active_df.loc[indeces].copy()
270
- train_val_df = active_df[~active_df.index.isin(test_df.index)].copy()
271
 
272
- # Save the datasets
273
 
 
274
  train_val_perc = f'{int((1 - test_split) * 100)}'
275
  test_perc = f'{int(test_split * 100)}'
276
 
277
  train_val_filename = f'{data_dir}/{split_type}_train_val_{train_val_perc}split_{active_name}.csv'
278
  test_filename = f'{data_dir}/{split_type}_test_{test_perc}split_{active_name}.csv'
279
 
280
- print('')
281
- print(f'Saving train_val datasets as: {train_val_filename}')
282
- print(f'Saving test datasets as: {test_filename}')
283
 
284
  train_val_df.to_csv(train_val_filename, index=False)
285
  test_df.to_csv(test_filename, index=False)
 
1
  import os
2
  import sys
3
+ from typing import Dict
4
 
5
  sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
6
  import protac_degradation_predictor as pdp
 
11
  from typing import Literal
12
 
13
  from sklearn.preprocessing import OrdinalEncoder
14
+ from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold
15
  from tqdm import tqdm
16
  import pandas as pd
17
  import numpy as np
 
111
  active_df: pd.DataFrame,
112
  active_col: str,
113
  test_split: float,
114
+ n_bins_tanimoto: int = 100, # Original: 200
115
  ) -> pd.Index:
116
  """ Get the indices of the test set using the Tanimoto-based split.
117
 
 
156
  test_df.append(group_df)
157
  continue
158
  # Be more selective and make sure that the percentage of active and
159
+ # inactive does not exceed 60%
160
+ perc_active_group = (num_active_group + num_active_test) / (num_entries_test + num_entries)
161
+ perc_inactive_group = (num_inactive_group + num_inactive_test) / (num_entries_test + num_entries)
162
+ if perc_active_group < 0.6:
163
+ if perc_inactive_group < 0.6:
164
  test_df.append(group_df)
165
  test_df = pd.concat(test_df)
166
  return test_df.index
 
216
  return test_df.index
217
 
218
 
219
+ def get_dataframe_stats(
220
+ train_df = None,
221
+ val_df = None,
222
+ test_df = None,
223
+ active_label = 'Active',
224
+ ) -> Dict:
225
+ """ Get some statistics from the dataframes.
226
+
227
+ Args:
228
+ train_df (pd.DataFrame): The training set.
229
+ val_df (pd.DataFrame): The validation set.
230
+ test_df (pd.DataFrame): The test set.
231
+ """
232
+ stats = {}
233
+ if train_df is not None:
234
+ stats['train_len'] = len(train_df)
235
+ stats['train_active_perc'] = train_df[active_label].sum() / len(train_df)
236
+ stats['train_inactive_perc'] = (len(train_df) - train_df[active_label].sum()) / len(train_df)
237
+ stats['train_avg_tanimoto_dist'] = train_df['Avg Tanimoto'].mean()
238
+ if val_df is not None:
239
+ stats['val_len'] = len(val_df)
240
+ stats['val_active_perc'] = val_df[active_label].sum() / len(val_df)
241
+ stats['val_inactive_perc'] = (len(val_df) - val_df[active_label].sum()) / len(val_df)
242
+ stats['val_avg_tanimoto_dist'] = val_df['Avg Tanimoto'].mean()
243
+ if test_df is not None:
244
+ stats['test_len'] = len(test_df)
245
+ stats['test_active_perc'] = test_df[active_label].sum() / len(test_df)
246
+ stats['test_inactive_perc'] = (len(test_df) - test_df[active_label].sum()) / len(test_df)
247
+ stats['test_avg_tanimoto_dist'] = test_df['Avg Tanimoto'].mean()
248
+ if train_df is not None and val_df is not None:
249
+ leaking_uniprot = list(set(train_df['Uniprot']).intersection(set(val_df['Uniprot'])))
250
+ leaking_smiles = list(set(train_df['Smiles']).intersection(set(val_df['Smiles'])))
251
+ stats['num_leaking_uniprot_train_val'] = len(leaking_uniprot)
252
+ stats['num_leaking_smiles_train_val'] = len(leaking_smiles)
253
+ stats['perc_leaking_uniprot_train_val'] = len(train_df[train_df['Uniprot'].isin(leaking_uniprot)]) / len(train_df)
254
+ stats['perc_leaking_smiles_train_val'] = len(train_df[train_df['Smiles'].isin(leaking_smiles)]) / len(train_df)
255
+
256
+ key_cols = [
257
+ 'Smiles',
258
+ 'Uniprot',
259
+ 'E3 Ligase Uniprot',
260
+ 'Cell Line Identifier',
261
+ ]
262
+ class_cols = ['DC50 (nM)', 'Dmax (%)']
263
+ # Check if there are any entries that are in BOTH train and val sets
264
+ tmp_train_df = train_df[key_cols + class_cols].copy()
265
+ tmp_val_df = val_df[key_cols + class_cols].copy()
266
+ stats['leaking_train_val'] = len(tmp_train_df.merge(tmp_val_df, on=key_cols + class_cols, how='inner'))
267
+
268
+
269
+ if train_df is not None and test_df is not None:
270
+ leaking_uniprot = list(set(train_df['Uniprot']).intersection(set(test_df['Uniprot'])))
271
+ leaking_smiles = list(set(train_df['Smiles']).intersection(set(test_df['Smiles'])))
272
+ stats['num_leaking_uniprot_train_test'] = len(leaking_uniprot)
273
+ stats['num_leaking_smiles_train_test'] = len(leaking_smiles)
274
+ stats['perc_leaking_uniprot_train_test'] = len(train_df[train_df['Uniprot'].isin(leaking_uniprot)]) / len(train_df)
275
+ stats['perc_leaking_smiles_train_test'] = len(train_df[train_df['Smiles'].isin(leaking_smiles)]) / len(train_df)
276
+
277
+ key_cols = [
278
+ 'Smiles',
279
+ 'Uniprot',
280
+ 'E3 Ligase Uniprot',
281
+ 'Cell Line Identifier',
282
+ ]
283
+ class_cols = ['DC50 (nM)', 'Dmax (%)']
284
+ # Check if there are any entries that are in BOTH train and test sets
285
+ tmp_train_df = train_df[key_cols + class_cols].copy()
286
+ tmp_test_df = test_df[key_cols + class_cols].copy()
287
+ stats['leaking_train_test'] = len(tmp_train_df.merge(tmp_test_df, on=key_cols + class_cols, how='inner'))
288
+
289
+ return stats
290
+
291
+
292
+ def merge_numerical_cols(group):
293
+ key_cols = [
294
+ 'Smiles',
295
+ 'Uniprot',
296
+ 'E3 Ligase Uniprot',
297
+ 'Cell Line Identifier',
298
+ ]
299
+ class_cols = ['DC50 (nM)', 'Dmax (%)']
300
+ # Loop over all numerical columns
301
+ for col in group.select_dtypes(include=[np.number]).columns:
302
+ if col == 'Compound ID':
303
+ continue
304
+ # Compute the geometric mean for the column
305
+ values = group[col].dropna()
306
+ if not values.empty:
307
+ group[col] = np.prod(values) ** (1 / len(values))
308
+
309
+ row = group.drop_duplicates(subset=key_cols + class_cols).reset_index(drop=True)
310
+
311
+ assert len(row) == 1
312
+
313
+ return row
314
+
315
+
316
+ def remove_duplicates(df):
317
+ key_cols = [
318
+ 'Smiles',
319
+ 'Uniprot',
320
+ 'E3 Ligase Uniprot',
321
+ 'Cell Line Identifier',
322
+ ]
323
+ class_cols = ['DC50 (nM)', 'Dmax (%)']
324
+ # Check if there are any duplicated entries having the same key columns, if
325
+ # so, merge them by applying a geometric mean to their DC50 and Dmax columns
326
+ duplicated = df[df.duplicated(subset=key_cols, keep=False)]
327
+
328
+ # NOTE: Reset index to remove the multi-index
329
+ merged = duplicated.groupby(key_cols).apply(lambda x: merge_numerical_cols(x))
330
+ merged = merged.reset_index(drop=True)
331
+
332
+ # Remove the duplicated entries from the original dataframe df
333
+ df = df[~df.duplicated(subset=key_cols, keep=False)]
334
+ # Concatenate the merged dataframe with the original dataframe
335
+ return pd.concat([df, merged], ignore_index=True)
336
+
337
+
338
  def main(
339
  active_col: str = 'Active (Dmax 0.6, pDC50 6.0)',
340
  test_split: float = 0.1,
341
  studies: str | Literal['all', 'standard', 'e3_ligase', 'similarity', 'target'] = 'all',
342
+ cv_n_splits: int = 5,
343
  ):
344
  """ Get and save the datasets for the different studies.
345
 
 
361
  protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
362
  # Map E3 Ligase Iap to IAP
363
  protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
364
+
365
+ # Remove duplicates
366
+ protac_df = remove_duplicates(protac_df)
367
+
368
+ # Remove legacy columns if they exist
369
+ if 'Active - OR' in protac_df.columns:
370
+ protac_df.drop(columns='Active - OR', inplace=True)
371
+ if 'Active - AND' in protac_df.columns:
372
+ protac_df.drop(columns='Active - AND', inplace=True)
373
+ if 'Active' in protac_df.columns:
374
+ protac_df.drop(columns='Active', inplace=True)
375
+
376
+ # Calculate Activity and add it as a column
377
  protac_df[active_col] = protac_df.apply(
378
  lambda x: pdp.is_active(x['DC50 (nM)'], x['Dmax (%)'], pDC50_threshold=pDC50_threshold, Dmax_threshold=Dmax_threshold), axis=1
379
  )
380
+
381
+ # Precompute fingerprints and average Tanimoto similarity
382
  _, protac_df = get_smiles2fp_and_avg_tanimoto(protac_df)
383
 
384
  ## Get the test sets
385
  test_indeces = {}
386
  active_df = protac_df[protac_df[active_col].notna()].copy()
387
 
388
  if studies == 'standard' or studies == 'all':
389
  test_indeces['standard'] = get_random_split_indices(active_df, test_split)
390
  if studies == 'target' or studies == 'all':
391
  test_indeces['target'] = get_target_split_indices(active_df, active_col, test_split)
 
 
392
  if studies == 'similarity' or studies == 'all':
393
  test_indeces['similarity'] = get_tanimoto_split_indices(active_df, active_col, test_split)
394
+ # if studies == 'e3_ligase' or studies == 'all':
395
+ # test_indeces['e3_ligase'] = get_e3_ligase_split_indices(active_df)
396
 
397
  # Make directory for studies datasets if it does not exist
398
  data_dir = '../data/studies'
399
  if not os.path.exists(data_dir):
400
  os.makedirs(data_dir)
401
 
402
+ # Open file for reporting
403
+ with open(f'{data_dir}/report_datasets.md', 'w') as f:
404
+ # Cross-Validation Training
405
+ for split_type, indeces in test_indeces.items():
406
+ test_df = active_df.loc[indeces].copy()
407
+ train_val_df = active_df[~active_df.index.isin(test_df.index)].copy()
408
+
409
+ # Print statistics on active/inactive percentages
410
+ perc_active = train_val_df[active_col].sum() / len(train_val_df)
411
+ print('-' * 80)
412
+ print(f'{split_type.capitalize()} Split')
413
+ print(f'Len Train/Val:{len(train_val_df)}')
414
+ print(f'Len Test: {len(test_df)}')
415
+ print(f'Percentage Active in Train/Val: {perc_active:.2%}')
416
+ print(f'Percentage Inactive in Train/Val: {1 - perc_active:.2%}')
417
+
418
+ # Get the CV object
419
+ if split_type == 'standard':
420
+ kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
421
+ group = None
422
+ elif split_type == 'e3_ligase':
423
+ kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
424
+ group = train_val_df['E3 Group'].to_numpy()
425
+ elif split_type == 'similarity':
426
+ kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
427
+ group = train_val_df['Tanimoto Group'].to_numpy()
428
+ elif split_type == 'target':
429
+ kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
430
+ group = train_val_df['Uniprot Group'].to_numpy()
431
+
432
+ # Get the folds on the train_val_df, then collect statistics on active/inactive percentages
433
+ stats = []
434
+ for i, (train_index, val_index) in enumerate(kf.split(train_val_df, train_val_df[active_col].to_list(), group)):
435
+ train_df = train_val_df.iloc[train_index]
436
+ val_df = train_val_df.iloc[val_index]
437
+
438
+ s = get_dataframe_stats(train_df, val_df, test_df, active_col)
439
+ s['fold'] = i + 1
440
+ stats.append(s)
441
+
442
+ # Append the statistics as markdown to report file f
443
+ stats_df = pd.DataFrame(stats)
444
+ f.write(f'## {split_type.capitalize()} Split\n\n')
445
+ f.write(stats_df.to_markdown(index=False))
446
+ f.write('\n\n')
447
+ print('-' * 80)
448
+
449
 
 
450
 
451
+ # Save the datasets
452
  train_val_perc = f'{int((1 - test_split) * 100)}'
453
  test_perc = f'{int(test_split * 100)}'
454
 
455
  train_val_filename = f'{data_dir}/{split_type}_train_val_{train_val_perc}split_{active_name}.csv'
456
  test_filename = f'{data_dir}/{split_type}_test_{test_perc}split_{active_name}.csv'
457
 
458
+ # print('')
459
+ # print(f'Saving train_val datasets as: {train_val_filename}')
460
+ # print(f'Saving test datasets as: {test_filename}')
461
 
462
  train_val_df.to_csv(train_val_filename, index=False)
463
  test_df.to_csv(test_filename, index=False)
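
The duplicate fix announced in the commit message lives in remove_duplicates/merge_numerical_cols: rows sharing the same key columns (Smiles, Uniprot, E3 Ligase Uniprot, Cell Line Identifier) are collapsed into a single row whose numeric values are the geometric mean of the duplicates. A toy illustration of that rule on made-up data:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'Smiles': ['CCO', 'CCO', 'CCN'],
    'Uniprot': ['P12345', 'P12345', 'Q67890'],
    'DC50 (nM)': [10.0, 1000.0, 50.0],
})

def geometric_mean(values: pd.Series) -> float:
    values = values.dropna()
    # exp(mean(log(x))) is the same as prod(x) ** (1 / n)
    return float(np.exp(np.log(values).mean())) if not values.empty else np.nan

merged = df.groupby(['Smiles', 'Uniprot'], as_index=False)['DC50 (nM)'].agg(geometric_mean)
print(merged)  # the two duplicated CCO/P12345 rows collapse to DC50 ≈ 100
```

The repository version applies the same geometric mean column by column via np.prod(values) ** (1 / len(values)) and then drops the now-identical duplicate rows.
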
src/run_experiments.py CHANGED
@@ -233,7 +233,7 @@ def main(
233
  max_epochs: int = 100,
234
  run_sklearn: bool = False,
235
  force_study: bool = False,
236
- experiments: str | Literal['all', 'random', 'e3_ligase', 'tanimoto', 'uniprot'] = 'all',
237
  ):
238
  """ Train a PROTAC model using the given datasets and hyperparameters.
239
 
@@ -250,34 +250,39 @@ def main(
250
  """
251
  pl.seed_everything(42)
252
 
253
- # Set the Column to Predict
254
- active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
255
 
256
- # Get Dmax_threshold from the active_col
257
- Dmax_threshold = float(active_col.split('Dmax')[1].split(',')[0].strip('(').strip(')').strip())
258
- pDC50_threshold = float(active_col.split('pDC50')[1].strip('(').strip(')').strip())
259
-
260
- # Load the PROTAC dataset
261
- protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
262
- # Map E3 Ligase Iap to IAP
263
- protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
264
- protac_df[active_col] = protac_df.apply(
265
- lambda x: pdp.is_active(x['DC50 (nM)'], x['Dmax (%)'], pDC50_threshold=pDC50_threshold, Dmax_threshold=Dmax_threshold), axis=1
266
- )
267
- smiles2fp, protac_df = get_smiles2fp_and_avg_tanimoto(protac_df)
268
-
269
- ## Get the test sets
270
- test_indeces = {}
271
- active_df = protac_df[protac_df[active_col].notna()].copy()
272
 
273
- if experiments == 'random' or experiments == 'all':
274
- test_indeces['random'] = get_random_split_indices(active_df, test_split)
275
- if experiments == 'uniprot' or experiments == 'all':
276
- test_indeces['uniprot'] = get_target_split_indices(active_df, active_col, test_split)
277
- if experiments == 'e3_ligase' or experiments == 'all':
278
- test_indeces['e3_ligase'] = get_e3_ligase_split_indices(active_df)
279
- if experiments == 'tanimoto' or experiments == 'all':
280
- test_indeces['tanimoto'] = get_tanimoto_split_indices(active_df, active_col, test_split)
281
 
282
  # Make directory ../reports if it does not exist
283
  if not os.path.exists('../reports'):
@@ -287,28 +292,46 @@ def main(
287
  protein2embedding = pdp.load_protein2embedding('../data/uniprot2embedding.h5')
288
  cell2embedding = pdp.load_cell2embedding('../data/cell2embedding.pkl')
289
 
290
  # Cross-Validation Training
291
  reports = defaultdict(list)
292
- for split_type, indeces in test_indeces.items():
293
- test_df = active_df.loc[indeces].copy()
294
- train_val_df = active_df[~active_df.index.isin(test_df.index)].copy()
 
295
296
  # Get the CV object
297
- if split_type == 'random':
298
  kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
299
  group = None
300
  elif split_type == 'e3_ligase':
301
  kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
302
  group = train_val_df['E3 Group'].to_numpy()
303
- elif split_type == 'tanimoto':
304
  kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
305
  group = train_val_df['Tanimoto Group'].to_numpy()
306
- elif split_type == 'uniprot':
307
  kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
308
  group = train_val_df['Uniprot Group'].to_numpy()
309
 
310
  # Start the experiment
311
- experiment_name = f'{active_name}_test_split_{test_split}_{split_type}'
312
  optuna_reports = pdp.hyperparameter_tuning_and_training(
313
  protein2embedding=protein2embedding,
314
  cell2embedding=cell2embedding,
@@ -323,7 +346,7 @@ def main(
323
  n_trials=n_trials,
324
  max_epochs=max_epochs,
325
  logger_save_dir='../logs',
326
- logger_name=f'logs_{experiment_name}',
327
  active_label=active_col,
328
  study_filename=f'../reports/study_{experiment_name}.pkl',
329
  force_study=force_study,
@@ -334,121 +357,6 @@ def main(
334
  report.to_csv(f'../reports/{report_name}_{experiment_name}.csv', index=False)
335
  reports[report_name].append(report.copy())
336
 
337
- # # Start the CV over the folds
338
- # X = train_val_df.drop(columns=active_col)
339
- # y = train_val_df[active_col].tolist()
340
- # for k, (train_index, val_index) in enumerate(kf.split(X, y, group)):
341
- # print('-' * 100)
342
- # print(f'Starting CV for group type: {split_type}, fold: {k}')
343
- # print('-' * 100)
344
- # train_df = train_val_df.iloc[train_index]
345
- # val_df = train_val_df.iloc[val_index]
346
-
347
- # leaking_uniprot = list(set(train_df['Uniprot']).intersection(set(val_df['Uniprot'])))
348
- # leaking_smiles = list(set(train_df['Smiles']).intersection(set(val_df['Smiles'])))
349
-
350
- # stats = {
351
- # 'fold': k,
352
- # 'split_type': split_type,
353
- # 'train_len': len(train_df),
354
- # 'val_len': len(val_df),
355
- # 'train_perc': len(train_df) / len(train_val_df),
356
- # 'val_perc': len(val_df) / len(train_val_df),
357
- # 'train_active_perc': train_df[active_col].sum() / len(train_df),
358
- # 'train_inactive_perc': (len(train_df) - train_df[active_col].sum()) / len(train_df),
359
- # 'val_active_perc': val_df[active_col].sum() / len(val_df),
360
- # 'val_inactive_perc': (len(val_df) - val_df[active_col].sum()) / len(val_df),
361
- # 'test_active_perc': test_df[active_col].sum() / len(test_df),
362
- # 'test_inactive_perc': (len(test_df) - test_df[active_col].sum()) / len(test_df),
363
- # 'num_leaking_uniprot': len(leaking_uniprot),
364
- # 'num_leaking_smiles': len(leaking_smiles),
365
- # 'train_leaking_uniprot_perc': len(train_df[train_df['Uniprot'].isin(leaking_uniprot)]) / len(train_df),
366
- # 'train_leaking_smiles_perc': len(train_df[train_df['Smiles'].isin(leaking_smiles)]) / len(train_df),
367
- # }
368
- # if split_type != 'random':
369
- # stats['train_unique_groups'] = len(np.unique(group[train_index]))
370
- # stats['val_unique_groups'] = len(np.unique(group[val_index]))
371
-
372
- # # At each fold, train and evaluate the Pytorch model
373
- # if split_type != 'tanimoto' or run_sklearn:
374
- # logging.info(f'Skipping Pytorch model training on fold {k} with split type {split_type} and test split {test_split}.')
375
- # continue
376
- # else:
377
- # logging.info(f'Starting Pytorch model training on fold {k} with split type {split_type} and test split {test_split}.')
378
- # # Train and evaluate the model
379
- # model, trainer, metrics = pdp.hyperparameter_tuning_and_training(
380
- # protein2embedding,
381
- # cell2embedding,
382
- # smiles2fp,
383
- # train_df,
384
- # val_df,
385
- # test_df,
386
- # fast_dev_run=fast_dev_run,
387
- # n_trials=n_trials,
388
- # logger_name=f'protac_{active_name}_{split_type}_fold_{k}_test_split_{test_split}',
389
- # active_label=active_col,
390
- # study_filename=f'../reports/study_{active_name}_{split_type}_fold_{k}_test_split_{test_split}.pkl',
391
- # )
392
- # hparams = {p.replace('hparam_', ''): v for p, v in stats.items() if p.startswith('hparam_')}
393
- # stats.update(metrics)
394
- # stats['model_type'] = 'Pytorch'
395
- # report.append(stats.copy())
396
- # del model
397
- # del trainer
398
-
399
- # # Ablation study: disable embeddings at a time
400
- # for disabled_embeddings in [['e3'], ['poi'], ['cell'], ['smiles'], ['e3', 'cell'], ['poi', 'e3', 'cell']]:
401
- # print('-' * 100)
402
- # print(f'Ablation study with disabled embeddings: {disabled_embeddings}')
403
- # print('-' * 100)
404
- # stats['disabled_embeddings'] = 'disabled ' + ' '.join(disabled_embeddings)
405
- # model, trainer, metrics = pdp.train_model(
406
- # protein2embedding,
407
- # cell2embedding,
408
- # smiles2fp,
409
- # train_df,
410
- # val_df,
411
- # test_df,
412
- # fast_dev_run=fast_dev_run,
413
- # logger_name=f'protac_{active_name}_{split_type}_fold_{k}_disabled-{"-".join(disabled_embeddings)}',
414
- # active_label=active_col,
415
- # disabled_embeddings=disabled_embeddings,
416
- # **hparams,
417
- # )
418
- # stats.update(metrics)
419
- # report.append(stats.copy())
420
- # del model
421
- # del trainer
422
-
423
- # # At each fold, train and evaluate sklearn models
424
- # if run_sklearn:
425
- # for model_type in ['RandomForest', 'SVC', 'LogisticRegression', 'GradientBoosting']:
426
- # logging.info(f'Starting sklearn model {model_type} training on fold {k} with split type {split_type} and test split {test_split}.')
427
- # # Train and evaluate sklearn models
428
- # model, metrics = pdp.hyperparameter_tuning_and_training_sklearn(
429
- # protein2embedding=protein2embedding,
430
- # cell2embedding=cell2embedding,
431
- # smiles2fp=smiles2fp,
432
- # train_df=train_df,
433
- # val_df=val_df,
434
- # test_df=test_df,
435
- # model_type=model_type,
436
- # active_label=active_col,
437
- # n_trials=n_trials,
438
- # study_filename=f'../reports/study_{active_name}_{split_type}_fold_{k}_test_split_{test_split}_{model_type.lower()}.pkl',
439
- # )
440
- # hparams = {p.replace('hparam_', ''): v for p, v in stats.items() if p.startswith('hparam_')}
441
- # stats['model_type'] = model_type
442
- # stats.update(metrics)
443
- # report.append(stats.copy())
444
-
445
- # # Save the report at the end of each split type
446
- # report_df = pd.DataFrame(report)
447
- # report_df.to_csv(
448
- # f'../reports/cv_report_hparam_search_{cv_n_splits}-splits_{active_name}_test_split_{test_split}{"_sklearn" if run_sklearn else ""}.csv',
449
- # index=False,
450
- # )
451
-
452
 
453
  if __name__ == '__main__':
454
  cli = CLI(main)
 
233
  max_epochs: int = 100,
234
  run_sklearn: bool = False,
235
  force_study: bool = False,
236
+ experiments: str | Literal['all', 'standard', 'e3_ligase', 'similarity', 'target'] = 'all',
237
  ):
238
  """ Train a PROTAC model using the given datasets and hyperparameters.
239
 
 
250
  """
251
  pl.seed_everything(42)
252
 
253
+ # # Set the Column to Predict
254
+ # active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
255
+
256
+ # # Get Dmax_threshold from the active_col
257
+ # Dmax_threshold = float(active_col.split('Dmax')[1].split(',')[0].strip('(').strip(')').strip())
258
+ # pDC50_threshold = float(active_col.split('pDC50')[1].strip('(').strip(')').strip())
259
+
260
+ # # Load the PROTAC dataset
261
+ # protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
262
 
263
+ # # Map E3 Ligase Iap to IAP
264
+ # protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
265
+ # protac_df[active_col] = protac_df.apply(
266
+ # lambda x: pdp.is_active(x['DC50 (nM)'], x['Dmax (%)'], pDC50_threshold=pDC50_threshold, Dmax_threshold=Dmax_threshold), axis=1
267
+ # )
268
+ # # Drop duplicates
269
+ # protac_df = protac_df.drop_duplicates(subset=['Smiles', 'Uniprot', 'E3 Ligase Uniprot', 'Cell Line Identifier', active_col])
270
+
271
+ # # Precompute fingerprints and average Tanimoto similarity
272
+ # smiles2fp, protac_df = get_smiles2fp_and_avg_tanimoto(protac_df)
273
+
274
+ # ## Get the test sets
275
+ # test_indeces = {}
276
+ # active_df = protac_df[protac_df[active_col].notna()].copy()
 
 
277
 
278
+ # if experiments == 'standard' or experiments == 'all':
279
+ # test_indeces['standard'] = get_random_split_indices(active_df, test_split)
280
+ # if experiments == 'target' or experiments == 'all':
281
+ # test_indeces['target'] = get_target_split_indices(active_df, active_col, test_split)
282
+ # if experiments == 'similarity' or experiments == 'all':
283
+ # test_indeces['similarity'] = get_tanimoto_split_indices(active_df, active_col, test_split, n_bins_tanimoto=100)
284
+ # if experiments == 'e3_ligase' or experiments == 'all':
285
+ # test_indeces['e3_ligase'] = get_e3_ligase_split_indices(active_df)
286
 
287
  # Make directory ../reports if it does not exist
288
  if not os.path.exists('../reports'):
 
292
  protein2embedding = pdp.load_protein2embedding('../data/uniprot2embedding.h5')
293
  cell2embedding = pdp.load_cell2embedding('../data/cell2embedding.pkl')
294
 
295
+ studies_dir = '../data/studies'
296
+ train_val_perc = f'{int((1 - test_split) * 100)}'
297
+ test_perc = f'{int(test_split * 100)}'
298
+ active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
299
+
300
+ if experiments == 'all':
301
+ experiments = ['standard', 'similarity', 'target']
302
+ else:
303
+ experiments = [experiments]
304
+
305
  # Cross-Validation Training
306
  reports = defaultdict(list)
307
+ for split_type in experiments:
308
+
309
+ train_val_filename = f'{split_type}_train_val_{train_val_perc}split_{active_name}.csv'
310
+ test_filename = f'{split_type}_test_{test_perc}split_{active_name}.csv'
311
 
312
+ train_val_df = pd.read_csv(os.path.join(studies_dir, train_val_filename))
313
+ test_df = pd.read_csv(os.path.join(studies_dir, test_filename))
314
+
315
+ # Get SMILES and precompute fingerprints dictionary
316
+ unique_smiles = pd.concat([train_val_df, test_df])['Smiles'].unique().tolist()
317
+ smiles2fp = {s: np.array(pdp.get_fingerprint(s)) for s in unique_smiles}
318
+
319
  # Get the CV object
320
+ if split_type == 'standard':
321
  kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
322
  group = None
323
  elif split_type == 'e3_ligase':
324
  kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
325
  group = train_val_df['E3 Group'].to_numpy()
326
+ elif split_type == 'similarity':
327
  kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
328
  group = train_val_df['Tanimoto Group'].to_numpy()
329
+ elif split_type == 'target':
330
  kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
331
  group = train_val_df['Uniprot Group'].to_numpy()
332
 
333
  # Start the experiment
334
+ experiment_name = f'{split_type}_{active_name}_test_split_{test_split}'
335
  optuna_reports = pdp.hyperparameter_tuning_and_training(
336
  protein2embedding=protein2embedding,
337
  cell2embedding=cell2embedding,
 
346
  n_trials=n_trials,
347
  max_epochs=max_epochs,
348
  logger_save_dir='../logs',
349
+ logger_name=f'{experiment_name}',
350
  active_label=active_col,
351
  study_filename=f'../reports/study_{experiment_name}.pkl',
352
  force_study=force_study,
 
357
  report.to_csv(f'../reports/{report_name}_{experiment_name}.csv', index=False)
358
  reports[report_name].append(report.copy())
359
 
360
 
361
  if __name__ == '__main__':
362
  cli = CLI(main)
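
run_experiments.py now reads the train/validation and test CSVs written by get_studies_datasets.py and picks the cross-validation object per split type: a plain StratifiedKFold for the standard split, and StratifiedGroupKFold for the similarity and target splits so that compounds or targets from the same group never sit on both sides of a fold. A small self-contained illustration of that group-aware behaviour:

```python
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

y = np.array([0, 1, 0, 1, 0, 1, 0, 1])
groups = np.array(['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'])
X = np.zeros((len(y), 1))  # features are irrelevant for the split itself

kf = StratifiedGroupKFold(n_splits=2, shuffle=True, random_state=42)
for train_idx, val_idx in kf.split(X, y, groups):
    # No group ever appears on both sides of a fold.
    assert not set(groups[train_idx]) & set(groups[val_idx])
```
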
src/run_experiments_aminoacid_counts.py CHANGED
@@ -59,6 +59,7 @@ def main(
59
  force_study (bool): Whether to force the creation of a new study.
60
  experiments (str): Type of experiments to run. Options are 'all', 'standard', 'e3_ligase', 'similarity', 'target'.
61
  """
 
62
 
63
  # Make directory ../reports if it does not exist
64
  if not os.path.exists('../reports'):
@@ -81,7 +82,7 @@ def main(
81
  # NOTE: Check that the protein2embedding is a dictionary of strings
82
  if not all(isinstance(k, str) for k in protein2embedding.keys()):
83
  raise ValueError("All keys in `protein2embedding` must be strings.")
84
- countvec = CountVectorizer(ngram_range=(1,1), analyzer='char')
85
  protein_embeddings = countvec.fit_transform(
86
  list(protein2embedding.keys())
87
  ).toarray()
@@ -126,7 +127,7 @@ def main(
126
  group = train_val_df['Uniprot Group'].to_numpy()
127
 
128
  # Start the experiment
129
- experiment_name = f'{active_name}_test_split_{test_split}_{split_type}'
130
  optuna_reports = pdp.hyperparameter_tuning_and_training(
131
  protein2embedding=protein2embedding,
132
  cell2embedding=cell2embedding,
@@ -141,11 +142,10 @@ def main(
141
  n_trials=n_trials,
142
  max_epochs=max_epochs,
143
  logger_save_dir='../logs',
144
- logger_name=f'logs_{experiment_name}',
145
  active_label=active_col,
146
  study_filename=f'../reports/study_aminoacidcnt_{experiment_name}.pkl',
147
  force_study=force_study,
148
- use_amino_acid_count=True,
149
  )
150
 
151
  # Save the reports to file
 
59
  force_study (bool): Whether to force the creation of a new study.
60
  experiments (str): Type of experiments to run. Options are 'all', 'standard', 'e3_ligase', 'similarity', 'target'.
61
  """
62
+ pl.seed_everything(42)
63
 
64
  # Make directory ../reports if it does not exist
65
  if not os.path.exists('../reports'):
 
82
  # NOTE: Check that the protein2embedding is a dictionary of strings
83
  if not all(isinstance(k, str) for k in protein2embedding.keys()):
84
  raise ValueError("All keys in `protein2embedding` must be strings.")
85
+ countvec = CountVectorizer(ngram_range=(1, 1), analyzer='char')
86
  protein_embeddings = countvec.fit_transform(
87
  list(protein2embedding.keys())
88
  ).toarray()
 
127
  group = train_val_df['Uniprot Group'].to_numpy()
128
 
129
  # Start the experiment
130
+ experiment_name = f'{split_type}_{active_name}_test_split_{test_split}'
131
  optuna_reports = pdp.hyperparameter_tuning_and_training(
132
  protein2embedding=protein2embedding,
133
  cell2embedding=cell2embedding,
 
142
  n_trials=n_trials,
143
  max_epochs=max_epochs,
144
  logger_save_dir='../logs',
145
+ logger_name=f'aminoacidcnt_{experiment_name}',
146
  active_label=active_col,
147
  study_filename=f'../reports/study_aminoacidcnt_{experiment_name}.pkl',
148
  force_study=force_study,
 
149
  )
150
 
151
  # Save the reports to file
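
The amino-acid-count experiment swaps the learned protein embeddings for simple character counts: a char-level CountVectorizer fitted on the protein strings yields one count vector per protein. A toy sketch with made-up sequences:

```python
from sklearn.feature_extraction.text import CountVectorizer

sequences = ['MKTAYIAKQR', 'MKKLLPTAAA']  # hypothetical protein strings
countvec = CountVectorizer(ngram_range=(1, 1), analyzer='char')
counts = countvec.fit_transform(sequences).toarray()  # note: lowercasing is on by default
print(countvec.get_feature_names_out())  # the characters seen across inputs
print(counts)                            # one per-character count vector per protein
```
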
src/run_experiments_cells_onehot.py CHANGED
@@ -61,6 +61,7 @@ def main(
61
  force_study (bool): Whether to force the creation of a new study.
62
  experiments (str): Type of experiments to run. Options are 'all', 'standard', 'e3_ligase', 'similarity', 'target'.
63
  """
 
64
 
65
  # Make directory ../reports if it does not exist
66
  if not os.path.exists('../reports'):
@@ -116,7 +117,7 @@ def main(
116
  group = train_val_df['Uniprot Group'].to_numpy()
117
 
118
  # Start the experiment
119
- experiment_name = f'{active_name}_test_split_{test_split}_{split_type}'
120
  optuna_reports = pdp.hyperparameter_tuning_and_training(
121
  protein2embedding=protein2embedding,
122
  cell2embedding=cell2embedding,
@@ -131,11 +132,10 @@ def main(
131
  n_trials=n_trials,
132
  max_epochs=max_epochs,
133
  logger_save_dir='../logs',
134
- logger_name=f'logs_{experiment_name}',
135
  active_label=active_col,
136
  study_filename=f'../reports/study_cellsonehot_{experiment_name}.pkl',
137
  force_study=force_study,
138
- use_cells_one_hot=True,
139
  )
140
 
141
  # Save the reports to file
 
61
  force_study (bool): Whether to force the creation of a new study.
62
  experiments (str): Type of experiments to run. Options are 'all', 'standard', 'e3_ligase', 'similarity', 'target'.
63
  """
64
+ pl.seed_everything(42)
65
 
66
  # Make directory ../reports if it does not exist
67
  if not os.path.exists('../reports'):
 
117
  group = train_val_df['Uniprot Group'].to_numpy()
118
 
119
  # Start the experiment
120
+ experiment_name = f'{split_type}_{active_name}_test_split_{test_split}'
121
  optuna_reports = pdp.hyperparameter_tuning_and_training(
122
  protein2embedding=protein2embedding,
123
  cell2embedding=cell2embedding,
 
132
  n_trials=n_trials,
133
  max_epochs=max_epochs,
134
  logger_save_dir='../logs',
135
+ logger_name=f'cellsonehot_{experiment_name}',
136
  active_label=active_col,
137
  study_filename=f'../reports/study_cellsonehot_{experiment_name}.pkl',
138
  force_study=force_study,
 
139
  )
140
 
141
  # Save the reports to file
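
The diff for this script only touches seeding and experiment naming; the one-hot cell representation itself is built elsewhere. For context only, a one-hot encoding of cell-line identifiers of the kind the file name suggests can be produced with scikit-learn (illustrative, not the repository's code):

```python
import numpy as np
from sklearn.preprocessing import OneHotEncoder

cell_lines = np.array([['HeLa'], ['HEK293'], ['HeLa'], ['MCF7']])  # made-up identifiers
encoder = OneHotEncoder(handle_unknown='ignore')
onehot = encoder.fit_transform(cell_lines).toarray()  # shape (4, 3): one column per cell line
```
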
src/run_experiments_xgboost.py CHANGED
@@ -232,7 +232,7 @@ def main(
232
  cv_n_splits: int = 5,
233
  num_boost_round: int = 100,
234
  force_study: bool = False,
235
- experiments: str | Literal['all', 'random', 'e3_ligase', 'tanimoto', 'uniprot'] = 'all',
236
  ):
237
  """ Train a PROTAC model using the given datasets and hyperparameters.
238
 
@@ -244,34 +244,38 @@ def main(
244
  """
245
  pl.seed_everything(42)
246
 
247
- # Set the Column to Predict
248
- active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
249
-
250
- # Get Dmax_threshold from the active_col
251
- Dmax_threshold = float(active_col.split('Dmax')[1].split(',')[0].strip('(').strip(')').strip())
252
- pDC50_threshold = float(active_col.split('pDC50')[1].strip('(').strip(')').strip())
253
-
254
- # Load the PROTAC dataset
255
- protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
256
- # Map E3 Ligase Iap to IAP
257
- protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
258
- protac_df[active_col] = protac_df.apply(
259
- lambda x: pdp.is_active(x['DC50 (nM)'], x['Dmax (%)'], pDC50_threshold=pDC50_threshold, Dmax_threshold=Dmax_threshold), axis=1
260
- )
261
- smiles2fp, protac_df = get_smiles2fp_and_avg_tanimoto(protac_df)
262
-
263
- ## Get the test sets
264
- test_indeces = {}
265
- active_df = protac_df[protac_df[active_col].notna()].copy()
266
 
267
- if experiments == 'random' or experiments == 'all':
268
- test_indeces['random'] = get_random_split_indices(active_df, test_split)
269
- if experiments == 'uniprot' or experiments == 'all':
270
- test_indeces['uniprot'] = get_target_split_indices(active_df, active_col, test_split)
271
- if experiments == 'e3_ligase' or experiments == 'all':
272
- test_indeces['e3_ligase'] = get_e3_ligase_split_indices(active_df)
273
- if experiments == 'tanimoto' or experiments == 'all':
274
- test_indeces['tanimoto'] = get_tanimoto_split_indices(active_df, active_col, test_split)
275
 
276
  # Make directory ../reports if it does not exist
277
  if not os.path.exists('../reports'):
@@ -281,23 +285,41 @@ def main(
281
  protein2embedding = pdp.load_protein2embedding('../data/uniprot2embedding.h5')
282
  cell2embedding = pdp.load_cell2embedding('../data/cell2embedding.pkl')
283
 
284
  # Cross-Validation Training
285
  reports = defaultdict(list)
286
- for split_type, indeces in test_indeces.items():
287
- test_df = active_df.loc[indeces].copy()
288
- train_val_df = active_df[~active_df.index.isin(test_df.index)].copy()
289
 
290
  # Get the CV object
291
- if split_type == 'random':
292
  kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
293
  group = None
294
  elif split_type == 'e3_ligase':
295
  kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
296
  group = train_val_df['E3 Group'].to_numpy()
297
- elif split_type == 'tanimoto':
298
  kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
299
  group = train_val_df['Tanimoto Group'].to_numpy()
300
- elif split_type == 'uniprot':
301
  kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
302
  group = train_val_df['Uniprot Group'].to_numpy()
303
 
@@ -326,5 +348,6 @@ def main(
326
  report.to_csv(f'../reports/xgboost_{report_name}_{experiment_name}.csv', index=False)
327
  reports[report_name].append(report.copy())
328
 
 
329
  if __name__ == '__main__':
330
  cli = CLI(main)
 
232
  cv_n_splits: int = 5,
233
  num_boost_round: int = 100,
234
  force_study: bool = False,
235
+ experiments: str | Literal['all', 'standard', 'e3_ligase', 'similarity', 'target'] = 'all',
236
  ):
237
  """ Train a PROTAC model using the given datasets and hyperparameters.
238
 
 
244
  """
245
  pl.seed_everything(42)
246
 
247
+ # # Set the Column to Predict
248
+ # active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
249
+
250
+ # # Get Dmax_threshold from the active_col
251
+ # Dmax_threshold = float(active_col.split('Dmax')[1].split(',')[0].strip('(').strip(')').strip())
252
+ # pDC50_threshold = float(active_col.split('pDC50')[1].strip('(').strip(')').strip())
253
+
254
+ # # Load the PROTAC dataset
255
+ # protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
256
+ # # Map E3 Ligase Iap to IAP
257
+ # protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
258
+ # protac_df[active_col] = protac_df.apply(
259
+ # lambda x: pdp.is_active(x['DC50 (nM)'], x['Dmax (%)'], pDC50_threshold=pDC50_threshold, Dmax_threshold=Dmax_threshold), axis=1
260
+ # )
261
+ # # Drop duplicates
262
+ # protac_df = protac_df.drop_duplicates(subset=['Smiles', 'Uniprot', 'E3 Ligase Uniprot', 'Cell Line Identifier', active_col])
263
+
264
+ # # Precompute fingerprint dictionary and the average Tanimoto similarity
265
+ # smiles2fp, protac_df = get_smiles2fp_and_avg_tanimoto(protac_df)
266
+
267
+ # ## Get the test sets
268
+ # test_indeces = {}
269
+ # active_df = protac_df[protac_df[active_col].notna()].copy()
270
 
271
+ # if experiments == 'standard' or experiments == 'all':
272
+ # test_indeces['standard'] = get_random_split_indices(active_df, test_split)
273
+ # if experiments == 'target' or experiments == 'all':
274
+ # test_indeces['target'] = get_target_split_indices(active_df, active_col, test_split)
275
+ # if experiments == 'similarity' or experiments == 'all':
276
+ # test_indeces['similarity'] = get_tanimoto_split_indices(active_df, active_col, test_split, n_bins_tanimoto=100)
277
+ # if experiments == 'e3_ligase' or experiments == 'all':
278
+ # test_indeces['e3_ligase'] = get_e3_ligase_split_indices(active_df)
279
 
280
  # Make directory ../reports if it does not exist
281
  if not os.path.exists('../reports'):
 
285
  protein2embedding = pdp.load_protein2embedding('../data/uniprot2embedding.h5')
286
  cell2embedding = pdp.load_cell2embedding('../data/cell2embedding.pkl')
287
 
288
+ studies_dir = '../data/studies'
289
+ train_val_perc = f'{int((1 - test_split) * 100)}'
290
+ test_perc = f'{int(test_split * 100)}'
291
+ active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
292
+
293
+ if experiments == 'all':
294
+ experiments = ['standard', 'similarity', 'target']
295
+ else:
296
+ experiments = [experiments]
297
+
298
  # Cross-Validation Training
299
  reports = defaultdict(list)
300
+ for split_type in experiments:
301
+
302
+ train_val_filename = f'{split_type}_train_val_{train_val_perc}split_{active_name}.csv'
303
+ test_filename = f'{split_type}_test_{test_perc}split_{active_name}.csv'
304
+
305
+ train_val_df = pd.read_csv(os.path.join(studies_dir, train_val_filename))
306
+ test_df = pd.read_csv(os.path.join(studies_dir, test_filename))
307
+
308
+ # Get SMILES and precompute fingerprints dictionary
309
+ unique_smiles = pd.concat([train_val_df, test_df])['Smiles'].unique().tolist()
310
+ smiles2fp = {s: np.array(pdp.get_fingerprint(s)) for s in unique_smiles}
311
 
312
  # Get the CV object
313
+ if split_type == 'standard':
314
  kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
315
  group = None
316
  elif split_type == 'e3_ligase':
317
  kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
318
  group = train_val_df['E3 Group'].to_numpy()
319
+ elif split_type == 'similarity':
320
  kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
321
  group = train_val_df['Tanimoto Group'].to_numpy()
322
+ elif split_type == 'target':
323
  kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
324
  group = train_val_df['Uniprot Group'].to_numpy()
325
 
 
348
  report.to_csv(f'../reports/xgboost_{report_name}_{experiment_name}.csv', index=False)
349
  reports[report_name].append(report.copy())
350
 
351
+
352
  if __name__ == '__main__':
353
  cli = CLI(main)
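
Like run_experiments.py, the XGBoost runner now consumes the predefined study CSVs and chooses the CV splitter by split type; its num_boost_round flag presumably maps onto the number of boosting rounds handed to xgboost's training API. A minimal sketch of that last piece on random data (not the repository's tuning code):

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(42)
X = rng.random((64, 8))
y = (rng.random(64) > 0.5).astype(int)

dtrain = xgb.DMatrix(X, label=y)
params = {'objective': 'binary:logistic', 'eval_metric': 'auc'}
booster = xgb.train(params, dtrain, num_boost_round=100)
```
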