Spaces:

ailab-bio
/

PROTAC-Degradation-Predictor

Sleeping

App Files Files Community

ribesstefano committed on Apr 29, 2024

Commit

bda3015

1 Parent(s): fda7af7

Added softmax in model + Fixed and updated some hparams + Fixed bug in Tanimoto distance (the similarity value was being used where a distance was intended)

Browse files

Files changed (4) hide show

protac_degradation_predictor/optuna_utils.py +48 -7
protac_degradation_predictor/pytorch_models.py +24 -10
src/plot_experiment_results.py +33 -12
src/run_experiments.py +30 -12

protac_degradation_predictor/optuna_utils.py CHANGED Viewed

@@ -55,14 +55,17 @@ def get_dataframe_stats(
         stats['train_len'] = len(train_df)
         stats['train_active_perc'] = train_df[active_label].sum() / len(train_df)
         stats['train_inactive_perc'] = (len(train_df) - train_df[active_label].sum()) / len(train_df)
     if val_df is not None:
         stats['val_len'] = len(val_df)
         stats['val_active_perc'] = val_df[active_label].sum() / len(val_df)
         stats['val_inactive_perc'] = (len(val_df) - val_df[active_label].sum()) / len(val_df)
     if test_df is not None:
         stats['test_len'] = len(test_df)
         stats['test_active_perc'] = test_df[active_label].sum() / len(test_df)
         stats['test_inactive_perc'] = (len(test_df) - test_df[active_label].sum()) / len(test_df)
     if train_df is not None and val_df is not None:
         leaking_uniprot = list(set(train_df['Uniprot']).intersection(set(val_df['Uniprot'])))
         leaking_smiles = list(set(train_df['Smiles']).intersection(set(val_df['Smiles'])))
@@ -98,6 +101,10 @@ def pytorch_model_objective(
         active_label: str = 'Active',
         disabled_embeddings: List[str] = [],
         max_epochs: int = 100,
 ) -> float:
     """ Objective function for hyperparameter optimization.
@@ -116,11 +123,11 @@ def pytorch_model_objective(
     """
     # Suggest hyperparameters to be used across the CV folds
     hidden_dim = trial.suggest_categorical('hidden_dim', hidden_dim_options)
-    batch_size = trial.suggest_categorical('batch_size', batch_size_options)
     learning_rate = trial.suggest_float('learning_rate', *learning_rate_options, log=True)
     smote_k_neighbors = trial.suggest_categorical('smote_k_neighbors', smote_k_neighbors_options)
     use_smote = trial.suggest_categorical('use_smote', [True, False])
-    apply_scaling = trial.suggest_categorical('apply_scaling', [True, False])
     dropout = trial.suggest_float('dropout', *dropout_options)
     # Start the CV over the folds
@@ -166,11 +173,14 @@ def pytorch_model_objective(
             smote_k_neighbors=smote_k_neighbors,
             apply_scaling=apply_scaling,
             use_smote=use_smote,
-            use_logger=False,
             fast_dev_run=fast_dev_run,
             active_label=active_label,
             return_predictions=True,
             disabled_embeddings=disabled_embeddings,
         )
         if test_df is not None:
             _, _, metrics, val_pred, test_pred = ret
@@ -246,11 +256,13 @@ def hyperparameter_tuning_and_training(
     pl.seed_everything(42)
     # Define the search space
-    hidden_dim_options = [32, 64, 128, 256, 512, 768]
-    batch_size_options = [4, 8, 16, 32, 64, 128]
-    learning_rate_options = (1e-5, 1e-3) # min and max values for loguniform distribution
     smote_k_neighbors_options = list(range(3, 16))
-    dropout_options = (0.2, 0.9)
     # Set the verbosity of Optuna
     optuna.logging.set_verbosity(optuna.logging.WARNING)
@@ -293,6 +305,31 @@ def hyperparameter_tuning_and_training(
     cv_report = pd.DataFrame(study.best_trial.user_attrs['report'])
     hparam_report = pd.DataFrame([study.best_params])
     # Retrain N models with the best hyperparameters (measure model uncertainty)
     test_report = []
     test_preds = []
@@ -315,6 +352,8 @@ def hyperparameter_tuning_and_training(
             enable_checkpointing=True,
             checkpoint_model_name=f'best_model_n{i}_{split_type}',
             return_predictions=True,
             **study.best_params,
         )
         # Rename the keys in the metrics dictionary
@@ -371,6 +410,8 @@ def hyperparameter_tuning_and_training(
             logger_save_dir=logger_save_dir,
             logger_name=f'{logger_name}_disabled-{"-".join(disabled_embeddings)}',
             disabled_embeddings=disabled_embeddings,
             **study.best_params,
         )
         # Rename the keys in the metrics dictionary

         stats['train_len'] = len(train_df)
         stats['train_active_perc'] = train_df[active_label].sum() / len(train_df)
         stats['train_inactive_perc'] = (len(train_df) - train_df[active_label].sum()) / len(train_df)
+        stats['train_avg_tanimoto_dist'] = train_df['Avg Tanimoto'].mean()
     if val_df is not None:
         stats['val_len'] = len(val_df)
         stats['val_active_perc'] = val_df[active_label].sum() / len(val_df)
         stats['val_inactive_perc'] = (len(val_df) - val_df[active_label].sum()) / len(val_df)
+        stats['val_avg_tanimoto_dist'] = val_df['Avg Tanimoto'].mean()
     if test_df is not None:
         stats['test_len'] = len(test_df)
         stats['test_active_perc'] = test_df[active_label].sum() / len(test_df)
         stats['test_inactive_perc'] = (len(test_df) - test_df[active_label].sum()) / len(test_df)
+        stats['test_avg_tanimoto_dist'] = test_df['Avg Tanimoto'].mean()
     if train_df is not None and val_df is not None:
         leaking_uniprot = list(set(train_df['Uniprot']).intersection(set(val_df['Uniprot'])))
         leaking_smiles = list(set(train_df['Smiles']).intersection(set(val_df['Smiles'])))
         active_label: str = 'Active',
         disabled_embeddings: List[str] = [],
         max_epochs: int = 100,
+        use_logger: bool = False,
+        logger_save_dir: str = 'logs',
+        logger_name: str = 'cv_model',
+        enable_checkpointing: bool = False,
 ) -> float:
     """ Objective function for hyperparameter optimization.
     """
     # Suggest hyperparameters to be used across the CV folds
     hidden_dim = trial.suggest_categorical('hidden_dim', hidden_dim_options)
+    batch_size = 128 # trial.suggest_categorical('batch_size', batch_size_options)
     learning_rate = trial.suggest_float('learning_rate', *learning_rate_options, log=True)
     smote_k_neighbors = trial.suggest_categorical('smote_k_neighbors', smote_k_neighbors_options)
     use_smote = trial.suggest_categorical('use_smote', [True, False])
+    apply_scaling = True # trial.suggest_categorical('apply_scaling', [True, False])
     dropout = trial.suggest_float('dropout', *dropout_options)
     # Start the CV over the folds
             smote_k_neighbors=smote_k_neighbors,
             apply_scaling=apply_scaling,
             use_smote=use_smote,
             fast_dev_run=fast_dev_run,
             active_label=active_label,
             return_predictions=True,
             disabled_embeddings=disabled_embeddings,
+            use_logger=use_logger,
+            logger_save_dir=logger_save_dir,
+            logger_name=f'{logger_name}_fold{k}',
+            enable_checkpointing=enable_checkpointing,
         )
         if test_df is not None:
             _, _, metrics, val_pred, test_pred = ret
     pl.seed_everything(42)
     # Define the search space
+    hidden_dim_options = [32, 64, 128, 256, 512]
+    batch_size_options = [128, 128] # [4, 8, 16, 32, 64, 128]
+    learning_rate_options = (1e-6, 1e-3) # min and max values for loguniform distribution
     smote_k_neighbors_options = list(range(3, 16))
+    # NOTE: We want Optuna to explore the combination (very low dropout, very
+    # small hidden_dim)
+    dropout_options = (0, 0.5)
     # Set the verbosity of Optuna
     optuna.logging.set_verbosity(optuna.logging.WARNING)
     cv_report = pd.DataFrame(study.best_trial.user_attrs['report'])
     hparam_report = pd.DataFrame([study.best_params])
+    # Train the best CV models and store their checkpoints by running the objective
+    pytorch_model_objective(
+        trial=study.best_trial,
+        protein2embedding=protein2embedding,
+        cell2embedding=cell2embedding,
+        smiles2fp=smiles2fp,
+        train_val_df=train_val_df,
+        kf=kf,
+        groups=groups,
+        test_df=test_df,
+        hidden_dim_options=hidden_dim_options,
+        batch_size_options=batch_size_options,
+        learning_rate_options=learning_rate_options,
+        smote_k_neighbors_options=smote_k_neighbors_options,
+        dropout_options=dropout_options,
+        fast_dev_run=fast_dev_run,
+        active_label=active_label,
+        max_epochs=max_epochs,
+        disabled_embeddings=[],
+        use_logger=True,
+        logger_save_dir=logger_save_dir,
+        logger_name=f'{logger_name}_{split_type}_cv_model',
+        enable_checkpointing=True,
+    )
     # Retrain N models with the best hyperparameters (measure model uncertainty)
     test_report = []
     test_preds = []
             enable_checkpointing=True,
             checkpoint_model_name=f'best_model_n{i}_{split_type}',
             return_predictions=True,
+            batch_size=128,
+            apply_scaling=True,
             **study.best_params,
         )
         # Rename the keys in the metrics dictionary
             logger_save_dir=logger_save_dir,
             logger_name=f'{logger_name}_disabled-{"-".join(disabled_embeddings)}',
             disabled_embeddings=disabled_embeddings,
+            batch_size=128,
+            apply_scaling=True,
             **study.best_params,
         )
         # Rename the keys in the metrics dictionary

protac_degradation_predictor/pytorch_models.py CHANGED Viewed

@@ -63,15 +63,29 @@ class PROTAC_Predictor(nn.Module):
         self.__dict__.update(locals())
         # Define "surrogate models" branches
         if self.join_embeddings != 'beginning':
             if 'poi' not in self.disabled_embeddings:
-                self.poi_emb = nn.Linear(poi_emb_dim, hidden_dim)
             if 'e3' not in self.disabled_embeddings:
-                self.e3_emb = nn.Linear(e3_emb_dim, hidden_dim)
             if 'cell' not in self.disabled_embeddings:
-                self.cell_emb = nn.Linear(cell_emb_dim, hidden_dim)
             if 'smiles' not in self.disabled_embeddings:
-                self.smiles_emb = nn.Linear(smiles_emb_dim, hidden_dim)
         # Define hidden dimension for joining layer
         if self.join_embeddings == 'beginning':
@@ -95,6 +109,7 @@ class PROTAC_Predictor(nn.Module):
     def forward(self, poi_emb, e3_emb, cell_emb, smiles_emb):
         embeddings = []
         if self.join_embeddings == 'beginning':
             if 'poi' not in self.disabled_embeddings:
                 embeddings.append(poi_emb)
             if 'e3' not in self.disabled_embeddings:
@@ -123,7 +138,6 @@ class PROTAC_Predictor(nn.Module):
                 else:
                     x = embeddings[0]
         x = self.dropout(F.relu(self.fc1(x)))
-        x = self.dropout(F.relu(self.fc2(x)))
         x = self.fc3(x)
         return x
@@ -137,7 +151,7 @@ class PROTAC_Model(pl.LightningModule):
         poi_emb_dim: int = config.protein_embedding_size,
         e3_emb_dim: int = config.protein_embedding_size,
         cell_emb_dim: int = config.cell_embedding_size,
-        batch_size: int = 32,
         learning_rate: float = 1e-3,
         dropout: float = 0.2,
         join_embeddings: Literal['beginning', 'concat', 'sum'] = 'sum',
@@ -145,7 +159,7 @@ class PROTAC_Model(pl.LightningModule):
         val_dataset: PROTAC_Dataset = None,
         test_dataset: PROTAC_Dataset = None,
         disabled_embeddings: list = [],
-        apply_scaling: bool = False,
     ):
         """ Initialize the PROTAC Pytorch Lightning model.
@@ -388,7 +402,7 @@ def train_model(
         val_df: pd.DataFrame,
         test_df: Optional[pd.DataFrame] = None,
         hidden_dim: int = 768,
-        batch_size: int = 8,
         learning_rate: float = 2e-5,
         dropout: float = 0.2,
         max_epochs: int = 50,
@@ -399,7 +413,7 @@ def train_model(
         join_embeddings: Literal['beginning', 'concat', 'sum'] = 'sum',
         smote_k_neighbors:int = 5,
         use_smote: bool = True,
-        apply_scaling: bool = False,
         active_label: str = 'Active',
         fast_dev_run: bool = False,
         use_logger: bool = True,
@@ -508,7 +522,7 @@ def train_model(
         logger=loggers if use_logger else False,
         callbacks=callbacks,
         max_epochs=max_epochs,
-        val_check_interval=0.5,
         fast_dev_run=fast_dev_run,
         enable_model_summary=False,
         enable_checkpointing=enable_checkpointing,

         self.__dict__.update(locals())
         # Define "surrogate models" branches
+        # NOTE: The softmax is used to ensure that the embeddings are normalized
+        # and can be summed on a "similar scale".
         if self.join_embeddings != 'beginning':
             if 'poi' not in self.disabled_embeddings:
+                self.poi_emb = nn.Sequential(
+                    nn.Linear(poi_emb_dim, hidden_dim),
+                    nn.Softmax(dim=1),
+                )
             if 'e3' not in self.disabled_embeddings:
+                self.e3_emb = nn.Sequential(
+                    nn.Linear(e3_emb_dim, hidden_dim),
+                    nn.Softmax(dim=1),
+                )
             if 'cell' not in self.disabled_embeddings:
+                self.cell_emb = nn.Sequential(
+                    nn.Linear(cell_emb_dim, hidden_dim),
+                    nn.Softmax(dim=1),
+                )
             if 'smiles' not in self.disabled_embeddings:
+                self.smiles_emb = nn.Sequential(
+                    nn.Linear(smiles_emb_dim, hidden_dim),
+                    nn.Softmax(dim=1),
+                )
         # Define hidden dimension for joining layer
         if self.join_embeddings == 'beginning':
     def forward(self, poi_emb, e3_emb, cell_emb, smiles_emb):
         embeddings = []
         if self.join_embeddings == 'beginning':
+            # TODO: Remove this if-branch
             if 'poi' not in self.disabled_embeddings:
                 embeddings.append(poi_emb)
             if 'e3' not in self.disabled_embeddings:
                 else:
                     x = embeddings[0]
         x = self.dropout(F.relu(self.fc1(x)))
         x = self.fc3(x)
         return x
         poi_emb_dim: int = config.protein_embedding_size,
         e3_emb_dim: int = config.protein_embedding_size,
         cell_emb_dim: int = config.cell_embedding_size,
+        batch_size: int = 128,
         learning_rate: float = 1e-3,
         dropout: float = 0.2,
         join_embeddings: Literal['beginning', 'concat', 'sum'] = 'sum',
         val_dataset: PROTAC_Dataset = None,
         test_dataset: PROTAC_Dataset = None,
         disabled_embeddings: list = [],
+        apply_scaling: bool = True,
     ):
         """ Initialize the PROTAC Pytorch Lightning model.
         val_df: pd.DataFrame,
         test_df: Optional[pd.DataFrame] = None,
         hidden_dim: int = 768,
+        batch_size: int = 128,
         learning_rate: float = 2e-5,
         dropout: float = 0.2,
         max_epochs: int = 50,
         join_embeddings: Literal['beginning', 'concat', 'sum'] = 'sum',
         smote_k_neighbors:int = 5,
         use_smote: bool = True,
+        apply_scaling: bool = True,
         active_label: str = 'Active',
         fast_dev_run: bool = False,
         use_logger: bool = True,
         logger=loggers if use_logger else False,
         callbacks=callbacks,
         max_epochs=max_epochs,
+        # val_check_interval=0.5,
         fast_dev_run=fast_dev_run,
         enable_model_summary=False,
         enable_checkpointing=enable_checkpointing,

src/plot_experiment_results.py CHANGED Viewed

@@ -12,7 +12,9 @@ import numpy as np
 palette = ['#83B8FE', '#FFA54C', '#94ED67', '#FF7FFF']
-def plot_training_curves(df, split_type):
     # Clean the data
     df = df.dropna(how='all', axis=1)
@@ -26,14 +28,14 @@ def plot_training_curves(df, split_type):
     # Plot training loss
     ax[0].plot(epoch_data.index, epoch_data['train_loss_epoch'], label='Training Loss')
-    ax[0].plot(epoch_data.index, epoch_data['test_loss'], label='Test Loss', linestyle='--')
     ax[0].set_ylabel('Loss')
     ax[0].legend(loc='lower right')
     ax[0].grid(axis='both', alpha=0.5)
     # Plot training accuracy
     ax[1].plot(epoch_data.index, epoch_data['train_acc_epoch'], label='Training Accuracy')
-    ax[1].plot(epoch_data.index, epoch_data['test_acc'], label='Test Accuracy', linestyle='--')
     ax[1].set_ylabel('Accuracy')
     ax[1].legend(loc='lower right')
     ax[1].grid(axis='both', alpha=0.5)
@@ -44,7 +46,7 @@ def plot_training_curves(df, split_type):
     # Plot training ROC-AUC
     ax[2].plot(epoch_data.index, epoch_data['train_roc_auc_epoch'], label='Training ROC-AUC')
-    ax[2].plot(epoch_data.index, epoch_data['test_roc_auc'], label='Test ROC-AUC', linestyle='--')
     ax[2].set_ylabel('ROC-AUC')
     ax[2].legend(loc='lower right')
     ax[2].grid(axis='both', alpha=0.5)
@@ -270,10 +272,18 @@ def plot_ablation_study(report):
         plt.savefig(f'plots/ablation_study_{group}.pdf', bbox_inches='tight')
 def main():
     active_col = 'Active (Dmax 0.6, pDC50 6.0)'
     test_split = 0.1
     n_models_for_test = 3
     active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
     report_base_name = f'{active_name}_test_split_{test_split}'
@@ -300,25 +310,36 @@ def main():
             pd.read_csv(f'reports/hparam_report_{report_base_name}_uniprot.csv'),
             pd.read_csv(f'reports/hparam_report_{report_base_name}_tanimoto.csv'),
         ]),
     }
-    metrics = {}
-    for i in range(n_models_for_test):
-        for split_type in ['random', 'tanimoto', 'uniprot', 'e3_ligase']:
             logs_dir = f'logs_{report_base_name}_{split_type}_best_model_n{i}'
-            metrics[f'{split_type}_{i}'] = pd.read_csv(f'logs/{logs_dir}/{logs_dir}/metrics.csv')
-            metrics[f'{split_type}_{i}']['model_id'] = i
             # Rename 'val_' columns to 'test_' columns
-            metrics[f'{split_type}_{i}'] = metrics[f'{split_type}_{i}'].rename(columns={'val_loss': 'test_loss', 'val_acc': 'test_acc', 'val_roc_auc': 'test_roc_auc'})
-            plot_training_curves(metrics[f'{split_type}_{i}'], f'{split_type}_{i}')
     df_val = reports['cv_train']
     df_test = reports['test']
     plot_performance_metrics(df_val, df_test, title=f'{active_name}_metrics')
     reports['test']['disabled_embeddings'] = pd.NA
     plot_ablation_study(pd.concat([
         reports['ablation'],

 palette = ['#83B8FE', '#FFA54C', '#94ED67', '#FF7FFF']
+def plot_training_curves(df, split_type, stage='test'):
+    Stage = 'Test' if stage == 'test' else 'Validation'
     # Clean the data
     df = df.dropna(how='all', axis=1)
     # Plot training loss
     ax[0].plot(epoch_data.index, epoch_data['train_loss_epoch'], label='Training Loss')
+    ax[0].plot(epoch_data.index, epoch_data[f'{stage}_loss'], label=f'{Stage} Loss', linestyle='--')
     ax[0].set_ylabel('Loss')
     ax[0].legend(loc='lower right')
     ax[0].grid(axis='both', alpha=0.5)
     # Plot training accuracy
     ax[1].plot(epoch_data.index, epoch_data['train_acc_epoch'], label='Training Accuracy')
+    ax[1].plot(epoch_data.index, epoch_data[f'{stage}_acc'], label=f'{Stage} Accuracy', linestyle='--')
     ax[1].set_ylabel('Accuracy')
     ax[1].legend(loc='lower right')
     ax[1].grid(axis='both', alpha=0.5)
     # Plot training ROC-AUC
     ax[2].plot(epoch_data.index, epoch_data['train_roc_auc_epoch'], label='Training ROC-AUC')
+    ax[2].plot(epoch_data.index, epoch_data[f'{stage}_roc_auc'], label=f'{Stage} ROC-AUC', linestyle='--')
     ax[2].set_ylabel('ROC-AUC')
     ax[2].legend(loc='lower right')
     ax[2].grid(axis='both', alpha=0.5)
         plt.savefig(f'plots/ablation_study_{group}.pdf', bbox_inches='tight')
+def plot_majority_voting_performance(df):
+    # Expected report columns: cv_models, test_acc, test_roc_auc, split_type
+    # Melt the dataframe
+    df = df.melt(id_vars=['cv_models', 'test_acc', 'test_roc_auc', 'split_type'], var_name='Metric', value_name='Score')
+    print(df)
 def main():
     active_col = 'Active (Dmax 0.6, pDC50 6.0)'
     test_split = 0.1
     n_models_for_test = 3
+    cv_n_folds = 5
     active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
     report_base_name = f'{active_name}_test_split_{test_split}'
             pd.read_csv(f'reports/hparam_report_{report_base_name}_uniprot.csv'),
             pd.read_csv(f'reports/hparam_report_{report_base_name}_tanimoto.csv'),
         ]),
+        'majority_vote': pd.concat([
+            pd.read_csv(f'reports/majority_vote_report_{report_base_name}_random.csv'),
+            pd.read_csv(f'reports/majority_vote_report_{report_base_name}_uniprot.csv'),
+            pd.read_csv(f'reports/majority_vote_report_{report_base_name}_tanimoto.csv'),
+        ]),
     }
+    for split_type in ['random', 'tanimoto', 'uniprot']:
+        for i in range(n_models_for_test):
             logs_dir = f'logs_{report_base_name}_{split_type}_best_model_n{i}'
+            metrics = pd.read_csv(f'logs/{logs_dir}/{logs_dir}/metrics.csv')
+            metrics['model_id'] = i
             # Rename 'val_' columns to 'test_' columns
+            metrics = metrics.rename(columns={'val_loss': 'test_loss', 'val_acc': 'test_acc', 'val_roc_auc': 'test_roc_auc'})
+            plot_training_curves(metrics, f'{split_type}_best_model_n{i}')
+        for i in range(cv_n_folds):
+            # logs_dir = f'logs_{report_base_name}_{split_type}_best_model_n{i}'
+            logs_dir = f'{split_type}_cv_model_fold{i}'
+            metrics = pd.read_csv(f'logs/{logs_dir}/{logs_dir}/metrics.csv')
+            metrics['fold'] = i
+            plot_training_curves(metrics, f'{split_type}_cv_model_fold{i}', stage='val')
     df_val = reports['cv_train']
     df_test = reports['test']
     plot_performance_metrics(df_val, df_test, title=f'{active_name}_metrics')
+    plot_majority_voting_performance(reports['majority_vote'])
     reports['test']['disabled_embeddings'] = pd.NA
     plot_ablation_study(pd.concat([
         reports['ablation'],

src/run_experiments.py CHANGED Viewed

@@ -8,6 +8,7 @@ from typing import Literal
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 import protac_degradation_predictor as pdp
 import pytorch_lightning as pl
 from rdkit import Chem
@@ -77,21 +78,38 @@ def get_smiles2fp_and_avg_tanimoto(protac_df: pd.DataFrame) -> tuple:
     Returns:
         tuple: The SMILES to fingerprint dictionary and the average Tanimoto similarity.
     """
     smiles2fp = {}
-    for smiles in tqdm(protac_df['Smiles'].unique().tolist(), desc='Precomputing fingerprints'):
         smiles2fp[smiles] = pdp.get_fingerprint(smiles)
-    # Get the pair-wise tanimoto similarity between the PROTAC fingerprints
     tanimoto_matrix = defaultdict(list)
-    for i, smiles1 in enumerate(tqdm(protac_df['Smiles'].unique(), desc='Computing Tanimoto similarity')):
-        fp1 = smiles2fp[smiles1]
-        # TODO: Use BulkTanimotoSimilarity for better performance
-        for j, smiles2 in enumerate(protac_df['Smiles'].unique()):
-            if j < i:
-                continue
-            fp2 = smiles2fp[smiles2]
-            tanimoto_dist = DataStructs.TanimotoSimilarity(fp1, fp2)
-            tanimoto_matrix[smiles1].append(tanimoto_dist)
     avg_tanimoto = {k: np.mean(v) for k, v in tanimoto_matrix.items()}
     protac_df['Avg Tanimoto'] = protac_df['Smiles'].map(avg_tanimoto)
@@ -256,7 +274,7 @@ def main(
         test_indeces['e3_ligase'] = get_e3_ligase_split_indices(active_df)
     if experiments == 'tanimoto' or experiments == 'all':
         test_indeces['tanimoto'] = get_tanimoto_split_indices(active_df, active_col, test_split)
     # Make directory ../reports if it does not exist
     if not os.path.exists('../reports'):
         os.makedirs('../reports')

 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 import protac_degradation_predictor as pdp
+from protac_degradation_predictor.optuna_utils import get_dataframe_stats
 import pytorch_lightning as pl
 from rdkit import Chem
     Returns:
         tuple: The SMILES to fingerprint dictionary and the average Tanimoto distance.
     """
+    unique_smiles = protac_df['Smiles'].unique().tolist()
     smiles2fp = {}
+    for smiles in tqdm(unique_smiles, desc='Precomputing fingerprints'):
         smiles2fp[smiles] = pdp.get_fingerprint(smiles)
+    # # Get the pair-wise tanimoto similarity between the PROTAC fingerprints
+    # tanimoto_matrix = defaultdict(list)
+    # for i, smiles1 in enumerate(tqdm(protac_df['Smiles'].unique(), desc='Computing Tanimoto similarity')):
+    #     fp1 = smiles2fp[smiles1]
+    #     # TODO: Use BulkTanimotoSimilarity for better performance
+    #     for j, smiles2 in enumerate(protac_df['Smiles'].unique()[i:]):
+    #         fp2 = smiles2fp[smiles2]
+    #         tanimoto_dist = 1 - DataStructs.TanimotoSimilarity(fp1, fp2)
+    #         tanimoto_matrix[smiles1].append(tanimoto_dist)
+    # avg_tanimoto = {k: np.mean(v) for k, v in tanimoto_matrix.items()}
+    # protac_df['Avg Tanimoto'] = protac_df['Smiles'].map(avg_tanimoto)
     tanimoto_matrix = defaultdict(list)
+    fps = list(smiles2fp.values())
+    # Compute all-against-all Tanimoto similarity using BulkTanimotoSimilarity
+    for i, (smiles1, fp1) in enumerate(tqdm(zip(unique_smiles, fps), desc='Computing Tanimoto similarity', total=len(fps))):
+        similarities = DataStructs.BulkTanimotoSimilarity(fp1, fps[i:])  # Only compute for i to end, avoiding duplicates
+        for j, similarity in enumerate(similarities):
+            distance = 1 - similarity
+            tanimoto_matrix[smiles1].append(distance)  # Store as distance
+            if i != i + j:
+                tanimoto_matrix[unique_smiles[i + j]].append(distance)  # Symmetric filling
+    # Calculate average Tanimoto distance for each unique SMILES
     avg_tanimoto = {k: np.mean(v) for k, v in tanimoto_matrix.items()}
     protac_df['Avg Tanimoto'] = protac_df['Smiles'].map(avg_tanimoto)
         test_indeces['e3_ligase'] = get_e3_ligase_split_indices(active_df)
     if experiments == 'tanimoto' or experiments == 'all':
         test_indeces['tanimoto'] = get_tanimoto_split_indices(active_df, active_col, test_split)
     # Make directory ../reports if it does not exist
     if not os.path.exists('../reports'):
         os.makedirs('../reports')