ribesstefano committed on
Commit 251060c · 1 Parent(s): 1e811f2

Fixed issue with duplicates + Experiments now rely on predefined datasets + Added experiments on simple embeddings

protac_degradation_predictor/optuna_utils.py CHANGED
@@ -11,7 +11,7 @@ from .protac_dataset import get_datasets
11
 
12
  import torch
13
  import optuna
14
- from optuna.samplers import TPESampler
15
  import joblib
16
  import pandas as pd
17
  from sklearn.model_selection import (
@@ -117,8 +117,6 @@ def pytorch_model_objective(
117
  logger_save_dir: str = 'logs',
118
  logger_name: str = 'cv_model',
119
  enable_checkpointing: bool = False,
120
- use_cells_one_hot: bool = False,
121
- use_amino_acid_count: bool = False,
122
  ) -> float:
123
  """ Objective function for hyperparameter optimization.
124
 
@@ -135,17 +133,24 @@ def pytorch_model_objective(
135
  active_label (str): The active label column.
136
  disabled_embeddings (List[str]): The list of disabled embeddings.
137
  """
138
  # Suggest hyperparameters to be used across the CV folds
139
- hidden_dim = trial.suggest_categorical('hidden_dim', hidden_dim_options)
140
- batch_size = 128 # trial.suggest_categorical('batch_size', batch_size_options)
141
- learning_rate = trial.suggest_float('learning_rate', *learning_rate_options, log=True)
142
- smote_k_neighbors = trial.suggest_categorical('smote_k_neighbors', smote_k_neighbors_options)
143
- use_smote = trial.suggest_categorical('use_smote', [True, False])
144
- # if use_cells_one_hot or use_amino_acid_count:
145
- # use_smote = False
146
- apply_scaling = True # trial.suggest_categorical('apply_scaling', [True, False])
147
- dropout = trial.suggest_float('dropout', *dropout_options)
148
- use_batch_norm = trial.suggest_categorical('use_batch_norm', [True, False])
 
 
149
 
150
  # Start the CV over the folds
151
  X = train_val_df.copy().drop(columns=active_label)
@@ -185,12 +190,13 @@ def pytorch_model_objective(
185
  hidden_dim=hidden_dim,
186
  batch_size=batch_size,
187
  learning_rate=learning_rate,
188
- dropout=dropout,
 
 
189
  use_batch_norm=use_batch_norm,
190
  max_epochs=max_epochs,
191
  smote_k_neighbors=smote_k_neighbors,
192
  apply_scaling=apply_scaling,
193
- use_smote=use_smote,
194
  fast_dev_run=fast_dev_run,
195
  active_label=active_label,
196
  return_predictions=True,
@@ -224,18 +230,6 @@ def pytorch_model_objective(
224
 
225
  # Optuna aims to minimize the pytorch_model_objective
226
  return - val_roc_auc
227
- # # Get the majority vote for the test predictions
228
- # if test_df is not None and not fast_dev_run:
229
- # majority_vote_metrics = get_majority_vote_metrics(test_preds, test_df, active_label)
230
- # majority_vote_metrics.update(get_dataframe_stats(train_df, val_df, test_df, active_label))
231
- # trial.set_user_attr('majority_vote_metrics', majority_vote_metrics)
232
- # logging.info(f'Majority vote metrics: {majority_vote_metrics}')
233
-
234
- # # Get the average validation accuracy and ROC AUC accross the folds
235
- # val_roc_auc = np.mean([r['val_roc_auc'] for r in report])
236
-
237
- # # Optuna aims to minimize the pytorch_model_objective
238
- # return - val_roc_auc
239
 
240
 
241
  def hyperparameter_tuning_and_training(
@@ -256,8 +250,6 @@ def hyperparameter_tuning_and_training(
256
  max_epochs: int = 100,
257
  study_filename: Optional[str] = None,
258
  force_study: bool = False,
259
- use_cells_one_hot: bool = False,
260
- use_amino_acid_count: bool = False,
261
  ) -> tuple:
262
  """ Hyperparameter tuning and training of a PROTAC model.
263
 
@@ -285,10 +277,13 @@ def hyperparameter_tuning_and_training(
285
  """
286
  pl.seed_everything(42)
287
288
  # Define the search space
289
- hidden_dim_options = [16, 32, 64, 128, 256] #, 512]
290
  batch_size_options = [128, 128] # [4, 8, 16, 32, 64, 128]
291
- learning_rate_options = (1e-6, 1e-3) # min and max values for loguniform distribution
292
  smote_k_neighbors_options = list(range(3, 16))
293
  # NOTE: We want Optuna to explore the combination (very low dropout, very
294
  # small hidden_dim)
@@ -296,8 +291,10 @@ def hyperparameter_tuning_and_training(
296
 
297
  # Set the verbosity of Optuna
298
  optuna.logging.set_verbosity(optuna.logging.WARNING)
 
 
 
299
  # Create an Optuna study object
300
- sampler = TPESampler(seed=42, multivariate=True)
301
  study = optuna.create_study(direction='minimize', sampler=sampler)
302
 
303
  study_loaded = False
@@ -328,8 +325,6 @@ def hyperparameter_tuning_and_training(
328
  active_label=active_label,
329
  max_epochs=max_epochs,
330
  disabled_embeddings=[],
331
- use_cells_one_hot=use_cells_one_hot,
332
- use_amino_acid_count=use_amino_acid_count,
333
  ),
334
  n_trials=n_trials,
335
  )
@@ -360,10 +355,8 @@ def hyperparameter_tuning_and_training(
360
  disabled_embeddings=[],
361
  use_logger=True,
362
  logger_save_dir=logger_save_dir,
363
- logger_name=f'{logger_name}_{split_type}_cv_model',
364
  enable_checkpointing=True,
365
- use_cells_one_hot=use_cells_one_hot,
366
- use_amino_acid_count=use_amino_acid_count,
367
  )
368
 
369
  # Retrain N models with the best hyperparameters (measure model uncertainty)
@@ -385,12 +378,13 @@ def hyperparameter_tuning_and_training(
385
  disabled_embeddings=[],
386
  use_logger=True,
387
  logger_save_dir=logger_save_dir,
388
- logger_name=f'{logger_name}_best_model_n{i}',
389
  enable_checkpointing=True,
390
  checkpoint_model_name=f'best_model_n{i}_{split_type}',
391
  return_predictions=True,
392
  batch_size=128,
393
  apply_scaling=True,
 
394
  **study.best_params,
395
  )
396
  # Rename the keys in the metrics dictionary
 
11
 
12
  import torch
13
  import optuna
14
+ from optuna.samplers import TPESampler, QMCSampler
15
  import joblib
16
  import pandas as pd
17
  from sklearn.model_selection import (
 
117
  logger_save_dir: str = 'logs',
118
  logger_name: str = 'cv_model',
119
  enable_checkpointing: bool = False,
 
 
120
  ) -> float:
121
  """ Objective function for hyperparameter optimization.
122
 
 
133
  active_label (str): The active label column.
134
  disabled_embeddings (List[str]): The list of disabled embeddings.
135
  """
136
+ # Set fixed hyperparameters
137
+ batch_size = 128
138
+ apply_scaling = True # It is dynamically disabled for binary data
139
+ use_batch_norm = True
140
+
141
  # Suggest hyperparameters to be used across the CV folds
142
+ hidden_dim = trial.suggest_int('hidden_dim', 32, 512, step=32)
143
+ smote_k_neighbors = trial.suggest_int('smote_k_neighbors', 0, 12)
144
+ # hidden_dim = trial.suggest_categorical('hidden_dim', hidden_dim_options)
145
+ # smote_k_neighbors = trial.suggest_categorical('smote_k_neighbors', smote_k_neighbors_options)
146
+ # dropout = trial.suggest_float('dropout', *dropout_options)
147
+ # use_batch_norm = trial.suggest_categorical('use_batch_norm', [True, False])
148
+
149
+ # Optimizer parameters
150
+ learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-1, log=True)
151
+ beta1 = trial.suggest_float('beta1', 0.1, 0.999)
152
+ beta2 = trial.suggest_float('beta2', 0.1, 0.999)
153
+ eps = trial.suggest_float('eps', 1e-9, 1.0, log=True)
154
 
155
  # Start the CV over the folds
156
  X = train_val_df.copy().drop(columns=active_label)
 
190
  hidden_dim=hidden_dim,
191
  batch_size=batch_size,
192
  learning_rate=learning_rate,
193
+ beta1=beta1,
194
+ beta2=beta2,
195
+ eps=eps,
196
  use_batch_norm=use_batch_norm,
197
  max_epochs=max_epochs,
198
  smote_k_neighbors=smote_k_neighbors,
199
  apply_scaling=apply_scaling,
 
200
  fast_dev_run=fast_dev_run,
201
  active_label=active_label,
202
  return_predictions=True,
 
230
 
231
  # Optuna aims to minimize the pytorch_model_objective
232
  return - val_roc_auc
233
 
234
 
235
  def hyperparameter_tuning_and_training(
 
250
  max_epochs: int = 100,
251
  study_filename: Optional[str] = None,
252
  force_study: bool = False,
 
 
253
  ) -> tuple:
254
  """ Hyperparameter tuning and training of a PROTAC model.
255
 
 
277
  """
278
  pl.seed_everything(42)
279
 
280
+ # TODO: Make the following code more modular, i.e., the ranges shall be put
281
+ # in dictionaries or config files or something like that.
282
+
283
  # Define the search space
284
+ hidden_dim_options = [8, 16, 32, 64, 128, 256] #, 512]
285
  batch_size_options = [128, 128] # [4, 8, 16, 32, 64, 128]
286
+ learning_rate_options = (1e-6, 1e-1) # min and max values for loguniform distribution
287
  smote_k_neighbors_options = list(range(3, 16))
288
  # NOTE: We want Optuna to explore the combination (very low dropout, very
289
  # small hidden_dim)
 
291
 
292
  # Set the verbosity of Optuna
293
  optuna.logging.set_verbosity(optuna.logging.WARNING)
294
+ # Set a quasi-random sampler, as suggested in: https://github.com/google-research/tuning_playbook?tab=readme-ov-file#faqs
295
+ # sampler = TPESampler(seed=42, multivariate=True)
296
+ sampler = QMCSampler(qmc_type='halton', scramble=True, seed=42)
297
  # Create an Optuna study object
 
298
  study = optuna.create_study(direction='minimize', sampler=sampler)
299
 
300
  study_loaded = False
 
325
  active_label=active_label,
326
  max_epochs=max_epochs,
327
  disabled_embeddings=[],
 
 
328
  ),
329
  n_trials=n_trials,
330
  )
 
355
  disabled_embeddings=[],
356
  use_logger=True,
357
  logger_save_dir=logger_save_dir,
358
+ logger_name=f'cv_model_{logger_name}',
359
  enable_checkpointing=True,
 
 
360
  )
361
 
362
  # Retrain N models with the best hyperparameters (measure model uncertainty)
 
378
  disabled_embeddings=[],
379
  use_logger=True,
380
  logger_save_dir=logger_save_dir,
381
+ logger_name=f'best_model_n{i}_{logger_name}',
382
  enable_checkpointing=True,
383
  checkpoint_model_name=f'best_model_n{i}_{split_type}',
384
  return_predictions=True,
385
  batch_size=128,
386
  apply_scaling=True,
387
+ use_batch_norm=True,
388
  **study.best_params,
389
  )
390
  # Rename the keys in the metrics dictionary
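
Taken together, the optuna_utils.py changes fix batch size, scaling, and batch norm, tune hidden_dim and smote_k_neighbors as integers, expose Adam's beta1/beta2/eps to the search, and swap the TPE sampler for a scrambled Halton QMCSampler. A minimal, self-contained sketch of that setup (the objective body is a placeholder, not the project's cross-validation loop):

```python
import optuna
from optuna.samplers import QMCSampler

def objective(trial: optuna.Trial) -> float:
    # Same search space as the updated pytorch_model_objective
    hidden_dim = trial.suggest_int('hidden_dim', 32, 512, step=32)
    smote_k_neighbors = trial.suggest_int('smote_k_neighbors', 0, 12)
    learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-1, log=True)
    beta1 = trial.suggest_float('beta1', 0.1, 0.999)
    beta2 = trial.suggest_float('beta2', 0.1, 0.999)
    eps = trial.suggest_float('eps', 1e-9, 1.0, log=True)
    # Placeholder for the CV training loop: return -ROC AUC so that
    # minimizing the objective maximizes validation ROC AUC.
    val_roc_auc = 0.5
    return -val_roc_auc

# Quasi-random sampler, as suggested by the Deep Learning Tuning Playbook
sampler = QMCSampler(qmc_type='halton', scramble=True, seed=42)
study = optuna.create_study(direction='minimize', sampler=sampler)
study.optimize(objective, n_trials=50)
```
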
protac_degradation_predictor/protac_dataset.py CHANGED
@@ -319,7 +319,6 @@ def get_datasets(
319
  protein2embedding: Dict = None,
320
  cell2embedding: Dict = None,
321
  smiles2fp: Dict = None,
322
- use_smote: bool = True,
323
  smote_k_neighbors: int = 5,
324
  active_label: str = 'Active',
325
  disabled_embeddings: List[Literal['smiles', 'poi', 'e3', 'cell']] = [],
@@ -345,14 +344,17 @@ def get_datasets(
345
  use_single_scaler (bool): Whether to use a single scaler for all features.
346
  apply_scaling (bool): Whether to apply scaling to the data now. Defaults to False (the Pytorch Lightning model does that).
347
  """
348
- oversampler = SMOTE(k_neighbors=smote_k_neighbors, random_state=42)
 
 
 
349
  train_ds = PROTAC_Dataset(
350
  train_df,
351
  protein2embedding,
352
  cell2embedding,
353
  smiles2fp,
354
- use_smote=use_smote,
355
- oversampler=oversampler if use_smote else None,
356
  active_label=active_label,
357
  disabled_embeddings=disabled_embeddings,
358
  scaler=scaler,
 
319
  protein2embedding: Dict = None,
320
  cell2embedding: Dict = None,
321
  smiles2fp: Dict = None,
 
322
  smote_k_neighbors: int = 5,
323
  active_label: str = 'Active',
324
  disabled_embeddings: List[Literal['smiles', 'poi', 'e3', 'cell']] = [],
 
344
  use_single_scaler (bool): Whether to use a single scaler for all features.
345
  apply_scaling (bool): Whether to apply scaling to the data now. Defaults to False (the Pytorch Lightning model does that).
346
  """
347
+ if smote_k_neighbors:
348
+ oversampler = SMOTE(k_neighbors=smote_k_neighbors, random_state=42)
349
+ else:
350
+ oversampler = None
351
  train_ds = PROTAC_Dataset(
352
  train_df,
353
  protein2embedding,
354
  cell2embedding,
355
  smiles2fp,
356
+ use_smote=True if smote_k_neighbors else False,
357
+ oversampler=oversampler,
358
  active_label=active_label,
359
  disabled_embeddings=disabled_embeddings,
360
  scaler=scaler,
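
The get_datasets change above folds the old use_smote flag into smote_k_neighbors: a value of 0 (or None) now disables oversampling. A short sketch of that convention, assuming imbalanced-learn's SMOTE as in the diff:

```python
from imblearn.over_sampling import SMOTE

def make_oversampler(smote_k_neighbors: int = 5):
    # k_neighbors == 0 (or None) means "no oversampling at all"
    if smote_k_neighbors:
        return SMOTE(k_neighbors=smote_k_neighbors, random_state=42)
    return None
```

This is also why the Optuna objective can suggest smote_k_neighbors down to 0: the same integer both sizes and toggles SMOTE.
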
protac_degradation_predictor/pytorch_models.py CHANGED
@@ -171,6 +171,7 @@ class PROTAC_Model(pl.LightningModule):
171
  test_dataset: PROTAC_Dataset = None,
172
  disabled_embeddings: List[Literal['smiles', 'poi', 'e3', 'cell']] = [],
173
  apply_scaling: bool = True,
 
174
  ):
175
  """ Initialize the PROTAC Pytorch Lightning model.
176
 
@@ -189,6 +190,7 @@ class PROTAC_Model(pl.LightningModule):
189
  test_dataset (PROTAC_Dataset): The test dataset
190
  disabled_embeddings (list): List of disabled embeddings. Can be 'poi', 'e3', 'cell', 'smiles'
191
  apply_scaling (bool): Whether to apply scaling to the embeddings
 
192
  """
193
  super().__init__()
194
  # Set our init args as class attributes
@@ -328,15 +330,31 @@ class PROTAC_Model(pl.LightningModule):
328
  return self.step(batch, batch_idx, 'test')
329
 
330
  def configure_optimizers(self):
331
- optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
332
  return {
333
  'optimizer': optimizer,
334
- 'lr_scheduler': optim.lr_scheduler.ReduceLROnPlateau(
335
- optimizer=optimizer,
336
- mode='min',
337
- factor=0.1,
338
- patience=0,
339
- ),
340
  'interval': 'step', # or 'epoch'
341
  'frequency': 1,
342
  'monitor': 'val_loss',
@@ -411,12 +429,14 @@ def train_model(
411
  hidden_dim: int = 768,
412
  batch_size: int = 128,
413
  learning_rate: float = 2e-5,
 
 
 
414
  dropout: float = 0.2,
415
  max_epochs: int = 50,
416
  use_batch_norm: bool = False,
417
  join_embeddings: Literal['beginning', 'concat', 'sum'] = 'sum',
418
- smote_k_neighbors:int = 5,
419
- use_smote: bool = True,
420
  apply_scaling: bool = True,
421
  active_label: str = 'Active',
422
  fast_dev_run: bool = False,
@@ -468,7 +488,6 @@ def train_model(
468
  protein2embedding,
469
  cell2embedding,
470
  smiles2fp,
471
- use_smote=use_smote,
472
  smote_k_neighbors=smote_k_neighbors,
473
  active_label=active_label,
474
  disabled_embeddings=disabled_embeddings,
@@ -540,6 +559,10 @@ def train_model(
540
  devices=1,
541
  num_nodes=1,
542
  )
 
 
 
 
543
  model = PROTAC_Model(
544
  hidden_dim=hidden_dim,
545
  smiles_emb_dim=smiles_emb_dim,
@@ -556,6 +579,7 @@ def train_model(
556
  val_dataset=val_ds,
557
  test_dataset=test_ds if test_df is not None else None,
558
  disabled_embeddings=disabled_embeddings,
 
559
  )
560
  with warnings.catch_warnings():
561
  warnings.simplefilter("ignore")
 
171
  test_dataset: PROTAC_Dataset = None,
172
  disabled_embeddings: List[Literal['smiles', 'poi', 'e3', 'cell']] = [],
173
  apply_scaling: bool = True,
174
+ extra_optim_params: Optional[dict] = None,
175
  ):
176
  """ Initialize the PROTAC Pytorch Lightning model.
177
 
 
190
  test_dataset (PROTAC_Dataset): The test dataset
191
  disabled_embeddings (list): List of disabled embeddings. Can be 'poi', 'e3', 'cell', 'smiles'
192
  apply_scaling (bool): Whether to apply scaling to the embeddings
193
+ extra_optim_params (dict): Extra parameters for the optimizer
194
  """
195
  super().__init__()
196
  # Set our init args as class attributes
 
330
  return self.step(batch, batch_idx, 'test')
331
 
332
  def configure_optimizers(self):
333
+ # Define optimizer
334
+ if self.extra_optim_params is not None:
335
+ optimizer = optim.Adam(self.parameters(), lr=self.learning_rate, **self.extra_optim_params)
336
+ else:
337
+ optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
338
+ # Define LR scheduler
339
+ if self.trainer.max_epochs:
340
+ total_iters = self.trainer.max_epochs
341
+ elif self.trainer.max_steps:
342
+ total_iters = self.trainer.max_steps
343
+ else:
344
+ total_iters = 20
345
+ lr_scheduler = optim.lr_scheduler.LinearLR(
346
+ optimizer=optimizer,
347
+ total_iters=total_iters,
348
+ )
349
+ # lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
350
+ # optimizer=optimizer,
351
+ # mode='min',
352
+ # factor=0.01,
353
+ # patience=0,
354
+ # )
355
  return {
356
  'optimizer': optimizer,
357
+ 'lr_scheduler': lr_scheduler,
 
 
 
 
 
358
  'interval': 'step', # or 'epoch'
359
  'frequency': 1,
360
  'monitor': 'val_loss',
 
429
  hidden_dim: int = 768,
430
  batch_size: int = 128,
431
  learning_rate: float = 2e-5,
432
+ beta1: float = 0.9,
433
+ beta2: float = 0.999,
434
+ eps: float = 1e-8,
435
  dropout: float = 0.2,
436
  max_epochs: int = 50,
437
  use_batch_norm: bool = False,
438
  join_embeddings: Literal['beginning', 'concat', 'sum'] = 'sum',
439
+ smote_k_neighbors: int = 5,
 
440
  apply_scaling: bool = True,
441
  active_label: str = 'Active',
442
  fast_dev_run: bool = False,
 
488
  protein2embedding,
489
  cell2embedding,
490
  smiles2fp,
 
491
  smote_k_neighbors=smote_k_neighbors,
492
  active_label=active_label,
493
  disabled_embeddings=disabled_embeddings,
 
559
  devices=1,
560
  num_nodes=1,
561
  )
562
+ extra_optim_params = {
563
+ 'betas': (beta1, beta2),
564
+ 'eps': eps,
565
+ }
566
  model = PROTAC_Model(
567
  hidden_dim=hidden_dim,
568
  smiles_emb_dim=smiles_emb_dim,
 
579
  val_dataset=val_ds,
580
  test_dataset=test_ds if test_df is not None else None,
581
  disabled_embeddings=disabled_embeddings,
582
+ extra_optim_params=extra_optim_params,
583
  )
584
  with warnings.catch_warnings():
585
  warnings.simplefilter("ignore")
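
In pytorch_models.py the optimizer now receives the tuned Adam betas/eps through extra_optim_params, and the ReduceLROnPlateau scheduler is replaced by a LinearLR ramp sized from the trainer's max_epochs (or max_steps, falling back to 20). A simplified sketch of that logic outside the LightningModule:

```python
import torch
from torch import optim

def build_optimizer_and_scheduler(params, learning_rate, extra_optim_params=None,
                                  max_epochs=None, max_steps=None):
    if extra_optim_params:
        optimizer = optim.Adam(params, lr=learning_rate, **extra_optim_params)
    else:
        optimizer = optim.Adam(params, lr=learning_rate)
    # LinearLR ramps the learning rate from its default start factor up to
    # the full value over total_iters steps.
    total_iters = max_epochs or max_steps or 20
    scheduler = optim.lr_scheduler.LinearLR(optimizer, total_iters=total_iters)
    return optimizer, scheduler

model = torch.nn.Linear(4, 1)
optimizer, scheduler = build_optimizer_and_scheduler(
    model.parameters(), learning_rate=2e-5,
    extra_optim_params={'betas': (0.9, 0.999), 'eps': 1e-8},
    max_epochs=100,
)
```
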
src/get_studies_datasets.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  import sys
 
3
 
4
  sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
5
  import protac_degradation_predictor as pdp
@@ -10,6 +11,7 @@ import logging
10
  from typing import Literal
11
 
12
  from sklearn.preprocessing import OrdinalEncoder
 
13
  from tqdm import tqdm
14
  import pandas as pd
15
  import numpy as np
@@ -109,7 +111,7 @@ def get_tanimoto_split_indices(
109
  active_df: pd.DataFrame,
110
  active_col: str,
111
  test_split: float,
112
- n_bins_tanimoto: int = 200,
113
  ) -> pd.Index:
114
  """ Get the indices of the test set using the Tanimoto-based split.
115
 
@@ -154,9 +156,11 @@ def get_tanimoto_split_indices(
154
  test_df.append(group_df)
155
  continue
156
  # Be more selective and make sure that the percentage of active and
157
- # inactive is balanced
158
- if (num_active_group + num_active_test) / (num_entries_test + num_entries) < 0.6:
159
- if (num_inactive_group + num_inactive_test) / (num_entries_test + num_entries) < 0.6:
 
 
160
  test_df.append(group_df)
161
  test_df = pd.concat(test_df)
162
  return test_df.index
@@ -212,10 +216,130 @@ def get_target_split_indices(active_df: pd.DataFrame, active_col: str, test_spli
212
  return test_df.index
213
 
214
 
215
  def main(
216
  active_col: str = 'Active (Dmax 0.6, pDC50 6.0)',
217
  test_split: float = 0.1,
218
  studies: str | Literal['all', 'standard', 'e3_ligase', 'similarity', 'target'] = 'all',
 
219
  ):
220
  """ Get and save the datasets for the different studies.
221
 
@@ -237,49 +361,103 @@ def main(
237
  protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
238
  # Map E3 Ligase Iap to IAP
239
  protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
240
  protac_df[active_col] = protac_df.apply(
241
  lambda x: pdp.is_active(x['DC50 (nM)'], x['Dmax (%)'], pDC50_threshold=pDC50_threshold, Dmax_threshold=Dmax_threshold), axis=1
242
  )
 
 
243
  _, protac_df = get_smiles2fp_and_avg_tanimoto(protac_df)
244
 
245
  ## Get the test sets
246
  test_indeces = {}
247
  active_df = protac_df[protac_df[active_col].notna()].copy()
248
 
249
- # Remove legacy column 'Active - OR' if it exists
250
- if 'Active - OR' in active_df.columns:
251
- active_df.drop(columns='Active - OR', inplace=True)
252
-
253
  if studies == 'standard' or studies == 'all':
254
  test_indeces['standard'] = get_random_split_indices(active_df, test_split)
255
  if studies == 'target' or studies == 'all':
256
  test_indeces['target'] = get_target_split_indices(active_df, active_col, test_split)
257
- if studies == 'e3_ligase' or studies == 'all':
258
- test_indeces['e3_ligase'] = get_e3_ligase_split_indices(active_df)
259
  if studies == 'similarity' or studies == 'all':
260
  test_indeces['similarity'] = get_tanimoto_split_indices(active_df, active_col, test_split)
 
 
261
 
262
  # Make directory for studies datasets if it does not exist
263
  data_dir = '../data/studies'
264
  if not os.path.exists(data_dir):
265
  os.makedirs(data_dir)
266
 
267
- # Cross-Validation Training
268
- for split_type, indeces in test_indeces.items():
269
- test_df = active_df.loc[indeces].copy()
270
- train_val_df = active_df[~active_df.index.isin(test_df.index)].copy()
271
 
272
- # Save the datasets
273
 
 
274
  train_val_perc = f'{int((1 - test_split) * 100)}'
275
  test_perc = f'{int(test_split * 100)}'
276
 
277
  train_val_filename = f'{data_dir}/{split_type}_train_val_{train_val_perc}split_{active_name}.csv'
278
  test_filename = f'{data_dir}/{split_type}_test_{test_perc}split_{active_name}.csv'
279
 
280
- print('')
281
- print(f'Saving train_val datasets as: {train_val_filename}')
282
- print(f'Saving test datasets as: {test_filename}')
283
 
284
  train_val_df.to_csv(train_val_filename, index=False)
285
  test_df.to_csv(test_filename, index=False)
 
1
  import os
2
  import sys
3
+ from typing import Dict
4
 
5
  sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
6
  import protac_degradation_predictor as pdp
 
11
  from typing import Literal
12
 
13
  from sklearn.preprocessing import OrdinalEncoder
14
+ from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold
15
  from tqdm import tqdm
16
  import pandas as pd
17
  import numpy as np
 
111
  active_df: pd.DataFrame,
112
  active_col: str,
113
  test_split: float,
114
+ n_bins_tanimoto: int = 100, # Original: 200
115
  ) -> pd.Index:
116
  """ Get the indices of the test set using the Tanimoto-based split.
117
 
 
156
  test_df.append(group_df)
157
  continue
158
  # Be more selective and make sure that the percentage of active and
159
+ # inactive does not exceed 60%
160
+ perc_active_group = (num_active_group + num_active_test) / (num_entries_test + num_entries)
161
+ perc_inactive_group = (num_inactive_group + num_inactive_test) / (num_entries_test + num_entries)
162
+ if perc_active_group < 0.6:
163
+ if perc_inactive_group < 0.6:
164
  test_df.append(group_df)
165
  test_df = pd.concat(test_df)
166
  return test_df.index
 
216
  return test_df.index
217
 
218
 
219
+ def get_dataframe_stats(
220
+ train_df = None,
221
+ val_df = None,
222
+ test_df = None,
223
+ active_label = 'Active',
224
+ ) -> Dict:
225
+ """ Get some statistics from the dataframes.
226
+
227
+ Args:
228
+ train_df (pd.DataFrame): The training set.
229
+ val_df (pd.DataFrame): The validation set.
230
+ test_df (pd.DataFrame): The test set.
231
+ """
232
+ stats = {}
233
+ if train_df is not None:
234
+ stats['train_len'] = len(train_df)
235
+ stats['train_active_perc'] = train_df[active_label].sum() / len(train_df)
236
+ stats['train_inactive_perc'] = (len(train_df) - train_df[active_label].sum()) / len(train_df)
237
+ stats['train_avg_tanimoto_dist'] = train_df['Avg Tanimoto'].mean()
238
+ if val_df is not None:
239
+ stats['val_len'] = len(val_df)
240
+ stats['val_active_perc'] = val_df[active_label].sum() / len(val_df)
241
+ stats['val_inactive_perc'] = (len(val_df) - val_df[active_label].sum()) / len(val_df)
242
+ stats['val_avg_tanimoto_dist'] = val_df['Avg Tanimoto'].mean()
243
+ if test_df is not None:
244
+ stats['test_len'] = len(test_df)
245
+ stats['test_active_perc'] = test_df[active_label].sum() / len(test_df)
246
+ stats['test_inactive_perc'] = (len(test_df) - test_df[active_label].sum()) / len(test_df)
247
+ stats['test_avg_tanimoto_dist'] = test_df['Avg Tanimoto'].mean()
248
+ if train_df is not None and val_df is not None:
249
+ leaking_uniprot = list(set(train_df['Uniprot']).intersection(set(val_df['Uniprot'])))
250
+ leaking_smiles = list(set(train_df['Smiles']).intersection(set(val_df['Smiles'])))
251
+ stats['num_leaking_uniprot_train_val'] = len(leaking_uniprot)
252
+ stats['num_leaking_smiles_train_val'] = len(leaking_smiles)
253
+ stats['perc_leaking_uniprot_train_val'] = len(train_df[train_df['Uniprot'].isin(leaking_uniprot)]) / len(train_df)
254
+ stats['perc_leaking_smiles_train_val'] = len(train_df[train_df['Smiles'].isin(leaking_smiles)]) / len(train_df)
255
+
256
+ key_cols = [
257
+ 'Smiles',
258
+ 'Uniprot',
259
+ 'E3 Ligase Uniprot',
260
+ 'Cell Line Identifier',
261
+ ]
262
+ class_cols = ['DC50 (nM)', 'Dmax (%)']
263
+ # Check if there are any entries that are in BOTH train and val sets
264
+ tmp_train_df = train_df[key_cols + class_cols].copy()
265
+ tmp_val_df = val_df[key_cols + class_cols].copy()
266
+ stats['leaking_train_val'] = len(tmp_train_df.merge(tmp_val_df, on=key_cols + class_cols, how='inner'))
267
+
268
+
269
+ if train_df is not None and test_df is not None:
270
+ leaking_uniprot = list(set(train_df['Uniprot']).intersection(set(test_df['Uniprot'])))
271
+ leaking_smiles = list(set(train_df['Smiles']).intersection(set(test_df['Smiles'])))
272
+ stats['num_leaking_uniprot_train_test'] = len(leaking_uniprot)
273
+ stats['num_leaking_smiles_train_test'] = len(leaking_smiles)
274
+ stats['perc_leaking_uniprot_train_test'] = len(train_df[train_df['Uniprot'].isin(leaking_uniprot)]) / len(train_df)
275
+ stats['perc_leaking_smiles_train_test'] = len(train_df[train_df['Smiles'].isin(leaking_smiles)]) / len(train_df)
276
+
277
+ key_cols = [
278
+ 'Smiles',
279
+ 'Uniprot',
280
+ 'E3 Ligase Uniprot',
281
+ 'Cell Line Identifier',
282
+ ]
283
+ class_cols = ['DC50 (nM)', 'Dmax (%)']
284
+ # Check if there are any entries that are in BOTH train and test sets
285
+ tmp_train_df = train_df[key_cols + class_cols].copy()
286
+ tmp_test_df = test_df[key_cols + class_cols].copy()
287
+ stats['leaking_train_test'] = len(tmp_train_df.merge(tmp_test_df, on=key_cols + class_cols, how='inner'))
288
+
289
+ return stats
290
+
291
+
292
+ def merge_numerical_cols(group):
293
+ key_cols = [
294
+ 'Smiles',
295
+ 'Uniprot',
296
+ 'E3 Ligase Uniprot',
297
+ 'Cell Line Identifier',
298
+ ]
299
+ class_cols = ['DC50 (nM)', 'Dmax (%)']
300
+ # Loop over all numerical columns
301
+ for col in group.select_dtypes(include=[np.number]).columns:
302
+ if col == 'Compound ID':
303
+ continue
304
+ # Compute the geometric mean for the column
305
+ values = group[col].dropna()
306
+ if not values.empty:
307
+ group[col] = np.prod(values) ** (1 / len(values))
308
+
309
+ row = group.drop_duplicates(subset=key_cols + class_cols).reset_index(drop=True)
310
+
311
+ assert len(row) == 1
312
+
313
+ return row
314
+
315
+
316
+ def remove_duplicates(df):
317
+ key_cols = [
318
+ 'Smiles',
319
+ 'Uniprot',
320
+ 'E3 Ligase Uniprot',
321
+ 'Cell Line Identifier',
322
+ ]
323
+ class_cols = ['DC50 (nM)', 'Dmax (%)']
324
+ # Check if there are any duplicated entries having the same key columns, if
325
+ # so, merge them by applying a geometric mean to their DC50 and Dmax columns
326
+ duplicated = df[df.duplicated(subset=key_cols, keep=False)]
327
+
328
+ # NOTE: Reset index to remove the multi-index
329
+ merged = duplicated.groupby(key_cols).apply(lambda x: merge_numerical_cols(x))
330
+ merged = merged.reset_index(drop=True)
331
+
332
+ # Remove the duplicated entries from the original dataframe df
333
+ df = df[~df.duplicated(subset=key_cols, keep=False)]
334
+ # Concatenate the merged dataframe with the original dataframe
335
+ return pd.concat([df, merged], ignore_index=True)
336
+
337
+
338
  def main(
339
  active_col: str = 'Active (Dmax 0.6, pDC50 6.0)',
340
  test_split: float = 0.1,
341
  studies: str | Literal['all', 'standard', 'e3_ligase', 'similarity', 'target'] = 'all',
342
+ cv_n_splits: int = 5,
343
  ):
344
  """ Get and save the datasets for the different studies.
345
 
 
361
  protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
362
  # Map E3 Ligase Iap to IAP
363
  protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
364
+
365
+ # Remove duplicates
366
+ protac_df = remove_duplicates(protac_df)
367
+
368
+ # Remove legacy columns if they exist
369
+ if 'Active - OR' in protac_df.columns:
370
+ protac_df.drop(columns='Active - OR', inplace=True)
371
+ if 'Active - AND' in protac_df.columns:
372
+ protac_df.drop(columns='Active - AND', inplace=True)
373
+ if 'Active' in protac_df.columns:
374
+ protac_df.drop(columns='Active', inplace=True)
375
+
376
+ # Calculate Activity and add it as a column
377
  protac_df[active_col] = protac_df.apply(
378
  lambda x: pdp.is_active(x['DC50 (nM)'], x['Dmax (%)'], pDC50_threshold=pDC50_threshold, Dmax_threshold=Dmax_threshold), axis=1
379
  )
380
+
381
+ # Precompute fingerprints and average Tanimoto similarity
382
  _, protac_df = get_smiles2fp_and_avg_tanimoto(protac_df)
383
 
384
  ## Get the test sets
385
  test_indeces = {}
386
  active_df = protac_df[protac_df[active_col].notna()].copy()
387
 
388
  if studies == 'standard' or studies == 'all':
389
  test_indeces['standard'] = get_random_split_indices(active_df, test_split)
390
  if studies == 'target' or studies == 'all':
391
  test_indeces['target'] = get_target_split_indices(active_df, active_col, test_split)
 
 
392
  if studies == 'similarity' or studies == 'all':
393
  test_indeces['similarity'] = get_tanimoto_split_indices(active_df, active_col, test_split)
394
+ # if studies == 'e3_ligase' or studies == 'all':
395
+ # test_indeces['e3_ligase'] = get_e3_ligase_split_indices(active_df)
396
 
397
  # Make directory for studies datasets if it does not exist
398
  data_dir = '../data/studies'
399
  if not os.path.exists(data_dir):
400
  os.makedirs(data_dir)
401
 
402
+ # Open file for reporting
403
+ with open(f'{data_dir}/report_datasets.md', 'w') as f:
404
+ # Cross-Validation Training
405
+ for split_type, indeces in test_indeces.items():
406
+ test_df = active_df.loc[indeces].copy()
407
+ train_val_df = active_df[~active_df.index.isin(test_df.index)].copy()
408
+
409
+ # Print statistics on active/inactive percentages
410
+ perc_active = train_val_df[active_col].sum() / len(train_val_df)
411
+ print('-' * 80)
412
+ print(f'{split_type.capitalize()} Split')
413
+ print(f'Len Train/Val:{len(train_val_df)}')
414
+ print(f'Len Test: {len(test_df)}')
415
+ print(f'Percentage Active in Train/Val: {perc_active:.2%}')
416
+ print(f'Percentage Inactive in Train/Val: {1 - perc_active:.2%}')
417
+
418
+ # Get the CV object
419
+ if split_type == 'standard':
420
+ kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
421
+ group = None
422
+ elif split_type == 'e3_ligase':
423
+ kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
424
+ group = train_val_df['E3 Group'].to_numpy()
425
+ elif split_type == 'similarity':
426
+ kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
427
+ group = train_val_df['Tanimoto Group'].to_numpy()
428
+ elif split_type == 'target':
429
+ kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
430
+ group = train_val_df['Uniprot Group'].to_numpy()
431
+
432
+ # Get the folds on the train_val_df, then collect statistics on active/inactive percentages
433
+ stats = []
434
+ for i, (train_index, val_index) in enumerate(kf.split(train_val_df, train_val_df[active_col].to_list(), group)):
435
+ train_df = train_val_df.iloc[train_index]
436
+ val_df = train_val_df.iloc[val_index]
437
+
438
+ s = get_dataframe_stats(train_df, val_df, test_df, active_col)
439
+ s['fold'] = i + 1
440
+ stats.append(s)
441
+
442
+ # Append the statistics as markdown to report file f
443
+ stats_df = pd.DataFrame(stats)
444
+ f.write(f'## {split_type.capitalize()} Split\n\n')
445
+ f.write(stats_df.to_markdown(index=False))
446
+ f.write('\n\n')
447
+ print('-' * 80)
448
+
449
 
 
450
 
451
+ # Save the datasets
452
  train_val_perc = f'{int((1 - test_split) * 100)}'
453
  test_perc = f'{int(test_split * 100)}'
454
 
455
  train_val_filename = f'{data_dir}/{split_type}_train_val_{train_val_perc}split_{active_name}.csv'
456
  test_filename = f'{data_dir}/{split_type}_test_{test_perc}split_{active_name}.csv'
457
 
458
+ # print('')
459
+ # print(f'Saving train_val datasets as: {train_val_filename}')
460
+ # print(f'Saving test datasets as: {test_filename}')
461
 
462
  train_val_df.to_csv(train_val_filename, index=False)
463
  test_df.to_csv(test_filename, index=False)
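
The duplicate fix announced in the commit message lives in remove_duplicates/merge_numerical_cols: rows sharing the same key columns (Smiles, Uniprot, E3 Ligase Uniprot, Cell Line Identifier) are collapsed into a single row whose numeric values are the geometric mean of the duplicates. A toy illustration of that rule on made-up data:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'Smiles': ['CCO', 'CCO', 'CCN'],
    'Uniprot': ['P12345', 'P12345', 'Q67890'],
    'DC50 (nM)': [10.0, 1000.0, 50.0],
})

def geometric_mean(values: pd.Series) -> float:
    values = values.dropna()
    # exp(mean(log(x))) is the same as prod(x) ** (1 / n)
    return float(np.exp(np.log(values).mean())) if not values.empty else np.nan

merged = df.groupby(['Smiles', 'Uniprot'], as_index=False)['DC50 (nM)'].agg(geometric_mean)
print(merged)  # the two duplicated CCO/P12345 rows collapse to DC50 ≈ 100
```

The repository version applies the same geometric mean column by column via np.prod(values) ** (1 / len(values)) and then drops the now-identical duplicate rows.
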
src/run_experiments.py CHANGED
@@ -233,7 +233,7 @@ def main(
233
  max_epochs: int = 100,
234
  run_sklearn: bool = False,
235
  force_study: bool = False,
236
- experiments: str | Literal['all', 'random', 'e3_ligase', 'tanimoto', 'uniprot'] = 'all',
237
  ):
238
  """ Train a PROTAC model using the given datasets and hyperparameters.
239
 
@@ -250,34 +250,39 @@ def main(
250
  """
251
  pl.seed_everything(42)
252
 
253
- # Set the Column to Predict
254
- active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
255
 
256
- # Get Dmax_threshold from the active_col
257
- Dmax_threshold = float(active_col.split('Dmax')[1].split(',')[0].strip('(').strip(')').strip())
258
- pDC50_threshold = float(active_col.split('pDC50')[1].strip('(').strip(')').strip())
259
-
260
- # Load the PROTAC dataset
261
- protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
262
- # Map E3 Ligase Iap to IAP
263
- protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
264
- protac_df[active_col] = protac_df.apply(
265
- lambda x: pdp.is_active(x['DC50 (nM)'], x['Dmax (%)'], pDC50_threshold=pDC50_threshold, Dmax_threshold=Dmax_threshold), axis=1
266
- )
267
- smiles2fp, protac_df = get_smiles2fp_and_avg_tanimoto(protac_df)
268
-
269
- ## Get the test sets
270
- test_indeces = {}
271
- active_df = protac_df[protac_df[active_col].notna()].copy()
272
 
273
- if experiments == 'random' or experiments == 'all':
274
- test_indeces['random'] = get_random_split_indices(active_df, test_split)
275
- if experiments == 'uniprot' or experiments == 'all':
276
- test_indeces['uniprot'] = get_target_split_indices(active_df, active_col, test_split)
277
- if experiments == 'e3_ligase' or experiments == 'all':
278
- test_indeces['e3_ligase'] = get_e3_ligase_split_indices(active_df)
279
- if experiments == 'tanimoto' or experiments == 'all':
280
- test_indeces['tanimoto'] = get_tanimoto_split_indices(active_df, active_col, test_split)
281
 
282
  # Make directory ../reports if it does not exist
283
  if not os.path.exists('../reports'):
@@ -287,28 +292,46 @@ def main(
287
  protein2embedding = pdp.load_protein2embedding('../data/uniprot2embedding.h5')
288
  cell2embedding = pdp.load_cell2embedding('../data/cell2embedding.pkl')
289
 
290
  # Cross-Validation Training
291
  reports = defaultdict(list)
292
- for split_type, indeces in test_indeces.items():
293
- test_df = active_df.loc[indeces].copy()
294
- train_val_df = active_df[~active_df.index.isin(test_df.index)].copy()
 
295
296
  # Get the CV object
297
- if split_type == 'random':
298
  kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
299
  group = None
300
  elif split_type == 'e3_ligase':
301
  kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
302
  group = train_val_df['E3 Group'].to_numpy()
303
- elif split_type == 'tanimoto':
304
  kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
305
  group = train_val_df['Tanimoto Group'].to_numpy()
306
- elif split_type == 'uniprot':
307
  kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
308
  group = train_val_df['Uniprot Group'].to_numpy()
309
 
310
  # Start the experiment
311
- experiment_name = f'{active_name}_test_split_{test_split}_{split_type}'
312
  optuna_reports = pdp.hyperparameter_tuning_and_training(
313
  protein2embedding=protein2embedding,
314
  cell2embedding=cell2embedding,
@@ -323,7 +346,7 @@ def main(
323
  n_trials=n_trials,
324
  max_epochs=max_epochs,
325
  logger_save_dir='../logs',
326
- logger_name=f'logs_{experiment_name}',
327
  active_label=active_col,
328
  study_filename=f'../reports/study_{experiment_name}.pkl',
329
  force_study=force_study,
@@ -334,121 +357,6 @@ def main(
334
  report.to_csv(f'../reports/{report_name}_{experiment_name}.csv', index=False)
335
  reports[report_name].append(report.copy())
336
 
337
- # # Start the CV over the folds
338
- # X = train_val_df.drop(columns=active_col)
339
- # y = train_val_df[active_col].tolist()
340
- # for k, (train_index, val_index) in enumerate(kf.split(X, y, group)):
341
- # print('-' * 100)
342
- # print(f'Starting CV for group type: {split_type}, fold: {k}')
343
- # print('-' * 100)
344
- # train_df = train_val_df.iloc[train_index]
345
- # val_df = train_val_df.iloc[val_index]
346
-
347
- # leaking_uniprot = list(set(train_df['Uniprot']).intersection(set(val_df['Uniprot'])))
348
- # leaking_smiles = list(set(train_df['Smiles']).intersection(set(val_df['Smiles'])))
349
-
350
- # stats = {
351
- # 'fold': k,
352
- # 'split_type': split_type,
353
- # 'train_len': len(train_df),
354
- # 'val_len': len(val_df),
355
- # 'train_perc': len(train_df) / len(train_val_df),
356
- # 'val_perc': len(val_df) / len(train_val_df),
357
- # 'train_active_perc': train_df[active_col].sum() / len(train_df),
358
- # 'train_inactive_perc': (len(train_df) - train_df[active_col].sum()) / len(train_df),
359
- # 'val_active_perc': val_df[active_col].sum() / len(val_df),
360
- # 'val_inactive_perc': (len(val_df) - val_df[active_col].sum()) / len(val_df),
361
- # 'test_active_perc': test_df[active_col].sum() / len(test_df),
362
- # 'test_inactive_perc': (len(test_df) - test_df[active_col].sum()) / len(test_df),
363
- # 'num_leaking_uniprot': len(leaking_uniprot),
364
- # 'num_leaking_smiles': len(leaking_smiles),
365
- # 'train_leaking_uniprot_perc': len(train_df[train_df['Uniprot'].isin(leaking_uniprot)]) / len(train_df),
366
- # 'train_leaking_smiles_perc': len(train_df[train_df['Smiles'].isin(leaking_smiles)]) / len(train_df),
367
- # }
368
- # if split_type != 'random':
369
- # stats['train_unique_groups'] = len(np.unique(group[train_index]))
370
- # stats['val_unique_groups'] = len(np.unique(group[val_index]))
371
-
372
- # # At each fold, train and evaluate the Pytorch model
373
- # if split_type != 'tanimoto' or run_sklearn:
374
- # logging.info(f'Skipping Pytorch model training on fold {k} with split type {split_type} and test split {test_split}.')
375
- # continue
376
- # else:
377
- # logging.info(f'Starting Pytorch model training on fold {k} with split type {split_type} and test split {test_split}.')
378
- # # Train and evaluate the model
379
- # model, trainer, metrics = pdp.hyperparameter_tuning_and_training(
380
- # protein2embedding,
381
- # cell2embedding,
382
- # smiles2fp,
383
- # train_df,
384
- # val_df,
385
- # test_df,
386
- # fast_dev_run=fast_dev_run,
387
- # n_trials=n_trials,
388
- # logger_name=f'protac_{active_name}_{split_type}_fold_{k}_test_split_{test_split}',
389
- # active_label=active_col,
390
- # study_filename=f'../reports/study_{active_name}_{split_type}_fold_{k}_test_split_{test_split}.pkl',
391
- # )
392
- # hparams = {p.replace('hparam_', ''): v for p, v in stats.items() if p.startswith('hparam_')}
393
- # stats.update(metrics)
394
- # stats['model_type'] = 'Pytorch'
395
- # report.append(stats.copy())
396
- # del model
397
- # del trainer
398
-
399
- # # Ablation study: disable embeddings at a time
400
- # for disabled_embeddings in [['e3'], ['poi'], ['cell'], ['smiles'], ['e3', 'cell'], ['poi', 'e3', 'cell']]:
401
- # print('-' * 100)
402
- # print(f'Ablation study with disabled embeddings: {disabled_embeddings}')
403
- # print('-' * 100)
404
- # stats['disabled_embeddings'] = 'disabled ' + ' '.join(disabled_embeddings)
405
- # model, trainer, metrics = pdp.train_model(
406
- # protein2embedding,
407
- # cell2embedding,
408
- # smiles2fp,
409
- # train_df,
410
- # val_df,
411
- # test_df,
412
- # fast_dev_run=fast_dev_run,
413
- # logger_name=f'protac_{active_name}_{split_type}_fold_{k}_disabled-{"-".join(disabled_embeddings)}',
414
- # active_label=active_col,
415
- # disabled_embeddings=disabled_embeddings,
416
- # **hparams,
417
- # )
418
- # stats.update(metrics)
419
- # report.append(stats.copy())
420
- # del model
421
- # del trainer
422
-
423
- # # At each fold, train and evaluate sklearn models
424
- # if run_sklearn:
425
- # for model_type in ['RandomForest', 'SVC', 'LogisticRegression', 'GradientBoosting']:
426
- # logging.info(f'Starting sklearn model {model_type} training on fold {k} with split type {split_type} and test split {test_split}.')
427
- # # Train and evaluate sklearn models
428
- # model, metrics = pdp.hyperparameter_tuning_and_training_sklearn(
429
- # protein2embedding=protein2embedding,
430
- # cell2embedding=cell2embedding,
431
- # smiles2fp=smiles2fp,
432
- # train_df=train_df,
433
- # val_df=val_df,
434
- # test_df=test_df,
435
- # model_type=model_type,
436
- # active_label=active_col,
437
- # n_trials=n_trials,
438
- # study_filename=f'../reports/study_{active_name}_{split_type}_fold_{k}_test_split_{test_split}_{model_type.lower()}.pkl',
439
- # )
440
- # hparams = {p.replace('hparam_', ''): v for p, v in stats.items() if p.startswith('hparam_')}
441
- # stats['model_type'] = model_type
442
- # stats.update(metrics)
443
- # report.append(stats.copy())
444
-
445
- # # Save the report at the end of each split type
446
- # report_df = pd.DataFrame(report)
447
- # report_df.to_csv(
448
- # f'../reports/cv_report_hparam_search_{cv_n_splits}-splits_{active_name}_test_split_{test_split}{"_sklearn" if run_sklearn else ""}.csv',
449
- # index=False,
450
- # )
451
-
452
 
453
  if __name__ == '__main__':
454
  cli = CLI(main)
 
233
  max_epochs: int = 100,
234
  run_sklearn: bool = False,
235
  force_study: bool = False,
236
+ experiments: str | Literal['all', 'standard', 'e3_ligase', 'similarity', 'target'] = 'all',
237
  ):
238
  """ Train a PROTAC model using the given datasets and hyperparameters.
239
 
 
250
  """
251
  pl.seed_everything(42)
252
 
253
+ # # Set the Column to Predict
254
+ # active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
255
+
256
+ # # Get Dmax_threshold from the active_col
257
+ # Dmax_threshold = float(active_col.split('Dmax')[1].split(',')[0].strip('(').strip(')').strip())
258
+ # pDC50_threshold = float(active_col.split('pDC50')[1].strip('(').strip(')').strip())
259
+
260
+ # # Load the PROTAC dataset
261
+ # protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
262
 
263
+ # # Map E3 Ligase Iap to IAP
264
+ # protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
265
+ # protac_df[active_col] = protac_df.apply(
266
+ # lambda x: pdp.is_active(x['DC50 (nM)'], x['Dmax (%)'], pDC50_threshold=pDC50_threshold, Dmax_threshold=Dmax_threshold), axis=1
267
+ # )
268
+ # # Drop duplicates
269
+ # protac_df = protac_df.drop_duplicates(subset=['Smiles', 'Uniprot', 'E3 Ligase Uniprot', 'Cell Line Identifier', active_col])
270
+
271
+ # # Precompute fingerprints and average Tanimoto similarity
272
+ # smiles2fp, protac_df = get_smiles2fp_and_avg_tanimoto(protac_df)
273
+
274
+ # ## Get the test sets
275
+ # test_indeces = {}
276
+ # active_df = protac_df[protac_df[active_col].notna()].copy()
 
 
277
 
278
+ # if experiments == 'standard' or experiments == 'all':
279
+ # test_indeces['standard'] = get_random_split_indices(active_df, test_split)
280
+ # if experiments == 'target' or experiments == 'all':
281
+ # test_indeces['target'] = get_target_split_indices(active_df, active_col, test_split)
282
+ # if experiments == 'similarity' or experiments == 'all':
283
+ # test_indeces['similarity'] = get_tanimoto_split_indices(active_df, active_col, test_split, n_bins_tanimoto=100)
284
+ # if experiments == 'e3_ligase' or experiments == 'all':
285
+ # test_indeces['e3_ligase'] = get_e3_ligase_split_indices(active_df)
286
 
287
  # Make directory ../reports if it does not exist
288
  if not os.path.exists('../reports'):
 
292
  protein2embedding = pdp.load_protein2embedding('../data/uniprot2embedding.h5')
293
  cell2embedding = pdp.load_cell2embedding('../data/cell2embedding.pkl')
294
 
295
+ studies_dir = '../data/studies'
296
+ train_val_perc = f'{int((1 - test_split) * 100)}'
297
+ test_perc = f'{int(test_split * 100)}'
298
+ active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
299
+
300
+ if experiments == 'all':
301
+ experiments = ['standard', 'similarity', 'target']
302
+ else:
303
+ experiments = [experiments]
304
+
305
  # Cross-Validation Training
306
  reports = defaultdict(list)
307
+ for split_type in experiments:
308
+
309
+ train_val_filename = f'{split_type}_train_val_{train_val_perc}split_{active_name}.csv'
310
+ test_filename = f'{split_type}_test_{test_perc}split_{active_name}.csv'
311
 
312
+ train_val_df = pd.read_csv(os.path.join(studies_dir, train_val_filename))
313
+ test_df = pd.read_csv(os.path.join(studies_dir, test_filename))
314
+
315
+ # Get SMILES and precompute fingerprints dictionary
316
+ unique_smiles = pd.concat([train_val_df, test_df])['Smiles'].unique().tolist()
317
+ smiles2fp = {s: np.array(pdp.get_fingerprint(s)) for s in unique_smiles}
318
+
319
  # Get the CV object
320
+ if split_type == 'standard':
321
  kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
322
  group = None
323
  elif split_type == 'e3_ligase':
324
  kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
325
  group = train_val_df['E3 Group'].to_numpy()
326
+ elif split_type == 'similarity':
327
  kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
328
  group = train_val_df['Tanimoto Group'].to_numpy()
329
+ elif split_type == 'target':
330
  kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
331
  group = train_val_df['Uniprot Group'].to_numpy()
332
 
333
  # Start the experiment
334
+ experiment_name = f'{split_type}_{active_name}_test_split_{test_split}'
335
  optuna_reports = pdp.hyperparameter_tuning_and_training(
336
  protein2embedding=protein2embedding,
337
  cell2embedding=cell2embedding,
 
346
  n_trials=n_trials,
347
  max_epochs=max_epochs,
348
  logger_save_dir='../logs',
349
+ logger_name=f'{experiment_name}',
350
  active_label=active_col,
351
  study_filename=f'../reports/study_{experiment_name}.pkl',
352
  force_study=force_study,
 
357
  report.to_csv(f'../reports/{report_name}_{experiment_name}.csv', index=False)
358
  reports[report_name].append(report.copy())
359
 
360
 
361
  if __name__ == '__main__':
362
  cli = CLI(main)
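
run_experiments.py now reads the train/validation and test CSVs written by get_studies_datasets.py and picks the cross-validation object per split type: a plain StratifiedKFold for the standard split, and StratifiedGroupKFold for the similarity and target splits so that compounds or targets from the same group never sit on both sides of a fold. A small self-contained illustration of that group-aware behaviour:

```python
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

y = np.array([0, 1, 0, 1, 0, 1, 0, 1])
groups = np.array(['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'])
X = np.zeros((len(y), 1))  # features are irrelevant for the split itself

kf = StratifiedGroupKFold(n_splits=2, shuffle=True, random_state=42)
for train_idx, val_idx in kf.split(X, y, groups):
    # No group ever appears on both sides of a fold.
    assert not set(groups[train_idx]) & set(groups[val_idx])
```
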
src/run_experiments_aminoacid_counts.py CHANGED
@@ -59,6 +59,7 @@ def main(
59
  force_study (bool): Whether to force the creation of a new study.
60
  experiments (str): Type of experiments to run. Options are 'all', 'standard', 'e3_ligase', 'similarity', 'target'.
61
  """
 
62
 
63
  # Make directory ../reports if it does not exist
64
  if not os.path.exists('../reports'):
@@ -81,7 +82,7 @@ def main(
81
  # NOTE: Check that the protein2embedding is a dictionary of strings
82
  if not all(isinstance(k, str) for k in protein2embedding.keys()):
83
  raise ValueError("All keys in `protein2embedding` must be strings.")
84
- countvec = CountVectorizer(ngram_range=(1,1), analyzer='char')
85
  protein_embeddings = countvec.fit_transform(
86
  list(protein2embedding.keys())
87
  ).toarray()
@@ -126,7 +127,7 @@ def main(
126
  group = train_val_df['Uniprot Group'].to_numpy()
127
 
128
  # Start the experiment
129
- experiment_name = f'{active_name}_test_split_{test_split}_{split_type}'
130
  optuna_reports = pdp.hyperparameter_tuning_and_training(
131
  protein2embedding=protein2embedding,
132
  cell2embedding=cell2embedding,
@@ -141,11 +142,10 @@ def main(
141
  n_trials=n_trials,
142
  max_epochs=max_epochs,
143
  logger_save_dir='../logs',
144
- logger_name=f'logs_{experiment_name}',
145
  active_label=active_col,
146
  study_filename=f'../reports/study_aminoacidcnt_{experiment_name}.pkl',
147
  force_study=force_study,
148
- use_amino_acid_count=True,
149
  )
150
 
151
  # Save the reports to file
 
59
  force_study (bool): Whether to force the creation of a new study.
60
  experiments (str): Type of experiments to run. Options are 'all', 'standard', 'e3_ligase', 'similarity', 'target'.
61
  """
62
+ pl.seed_everything(42)
63
 
64
  # Make directory ../reports if it does not exist
65
  if not os.path.exists('../reports'):
 
82
  # NOTE: Check that the protein2embedding is a dictionary of strings
83
  if not all(isinstance(k, str) for k in protein2embedding.keys()):
84
  raise ValueError("All keys in `protein2embedding` must be strings.")
85
+ countvec = CountVectorizer(ngram_range=(1, 1), analyzer='char')
86
  protein_embeddings = countvec.fit_transform(
87
  list(protein2embedding.keys())
88
  ).toarray()
 
127
  group = train_val_df['Uniprot Group'].to_numpy()
128
 
129
  # Start the experiment
130
+ experiment_name = f'{split_type}_{active_name}_test_split_{test_split}'
131
  optuna_reports = pdp.hyperparameter_tuning_and_training(
132
  protein2embedding=protein2embedding,
133
  cell2embedding=cell2embedding,
 
142
  n_trials=n_trials,
143
  max_epochs=max_epochs,
144
  logger_save_dir='../logs',
145
+ logger_name=f'aminoacidcnt_{experiment_name}',
146
  active_label=active_col,
147
  study_filename=f'../reports/study_aminoacidcnt_{experiment_name}.pkl',
148
  force_study=force_study,
 
149
  )
150
 
151
  # Save the reports to file
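
The amino-acid-count experiment swaps the learned protein embeddings for simple character counts: a char-level CountVectorizer fitted on the protein strings yields one count vector per protein. A toy sketch with made-up sequences:

```python
from sklearn.feature_extraction.text import CountVectorizer

sequences = ['MKTAYIAKQR', 'MKKLLPTAAA']  # hypothetical protein strings
countvec = CountVectorizer(ngram_range=(1, 1), analyzer='char')
counts = countvec.fit_transform(sequences).toarray()  # note: lowercasing is on by default
print(countvec.get_feature_names_out())  # the characters seen across inputs
print(counts)                            # one per-character count vector per protein
```
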
src/run_experiments_cells_onehot.py CHANGED
@@ -61,6 +61,7 @@ def main(
61
  force_study (bool): Whether to force the creation of a new study.
62
  experiments (str): Type of experiments to run. Options are 'all', 'standard', 'e3_ligase', 'similarity', 'target'.
63
  """
 
64
 
65
  # Make directory ../reports if it does not exist
66
  if not os.path.exists('../reports'):
@@ -116,7 +117,7 @@ def main(
116
  group = train_val_df['Uniprot Group'].to_numpy()
117
 
118
  # Start the experiment
119
- experiment_name = f'{active_name}_test_split_{test_split}_{split_type}'
120
  optuna_reports = pdp.hyperparameter_tuning_and_training(
121
  protein2embedding=protein2embedding,
122
  cell2embedding=cell2embedding,
@@ -131,11 +132,10 @@ def main(
131
  n_trials=n_trials,
132
  max_epochs=max_epochs,
133
  logger_save_dir='../logs',
134
- logger_name=f'logs_{experiment_name}',
135
  active_label=active_col,
136
  study_filename=f'../reports/study_cellsonehot_{experiment_name}.pkl',
137
  force_study=force_study,
138
- use_cells_one_hot=True,
139
  )
140
 
141
  # Save the reports to file
 
61
  force_study (bool): Whether to force the creation of a new study.
62
  experiments (str): Type of experiments to run. Options are 'all', 'standard', 'e3_ligase', 'similarity', 'target'.
63
  """
64
+ pl.seed_everything(42)
65
 
66
  # Make directory ../reports if it does not exist
67
  if not os.path.exists('../reports'):
 
117
  group = train_val_df['Uniprot Group'].to_numpy()
118
 
119
  # Start the experiment
120
+ experiment_name = f'{split_type}_{active_name}_test_split_{test_split}'
121
  optuna_reports = pdp.hyperparameter_tuning_and_training(
122
  protein2embedding=protein2embedding,
123
  cell2embedding=cell2embedding,
 
132
  n_trials=n_trials,
133
  max_epochs=max_epochs,
134
  logger_save_dir='../logs',
135
+ logger_name=f'cellsonehot_{experiment_name}',
136
  active_label=active_col,
137
  study_filename=f'../reports/study_cellsonehot_{experiment_name}.pkl',
138
  force_study=force_study,
 
139
  )
140
 
141
  # Save the reports to file
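
The diff for this script only touches seeding and experiment naming; the one-hot cell representation itself is built elsewhere. For context only, a one-hot encoding of cell-line identifiers of the kind the file name suggests can be produced with scikit-learn (illustrative, not the repository's code):

```python
import numpy as np
from sklearn.preprocessing import OneHotEncoder

cell_lines = np.array([['HeLa'], ['HEK293'], ['HeLa'], ['MCF7']])  # made-up identifiers
encoder = OneHotEncoder(handle_unknown='ignore')
onehot = encoder.fit_transform(cell_lines).toarray()  # shape (4, 3): one column per cell line
```
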
src/run_experiments_xgboost.py CHANGED
@@ -232,7 +232,7 @@ def main(
232
  cv_n_splits: int = 5,
233
  num_boost_round: int = 100,
234
  force_study: bool = False,
235
- experiments: str | Literal['all', 'random', 'e3_ligase', 'tanimoto', 'uniprot'] = 'all',
236
  ):
237
  """ Train a PROTAC model using the given datasets and hyperparameters.
238
 
@@ -244,34 +244,38 @@ def main(
244
  """
245
  pl.seed_everything(42)
246
 
247
- # Set the Column to Predict
248
- active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
249
-
250
- # Get Dmax_threshold from the active_col
251
- Dmax_threshold = float(active_col.split('Dmax')[1].split(',')[0].strip('(').strip(')').strip())
252
- pDC50_threshold = float(active_col.split('pDC50')[1].strip('(').strip(')').strip())
253
-
254
- # Load the PROTAC dataset
255
- protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
256
- # Map E3 Ligase Iap to IAP
257
- protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
258
- protac_df[active_col] = protac_df.apply(
259
- lambda x: pdp.is_active(x['DC50 (nM)'], x['Dmax (%)'], pDC50_threshold=pDC50_threshold, Dmax_threshold=Dmax_threshold), axis=1
260
- )
261
- smiles2fp, protac_df = get_smiles2fp_and_avg_tanimoto(protac_df)
262
-
263
- ## Get the test sets
264
- test_indeces = {}
265
- active_df = protac_df[protac_df[active_col].notna()].copy()
266
 
267
- if experiments == 'random' or experiments == 'all':
268
- test_indeces['random'] = get_random_split_indices(active_df, test_split)
269
- if experiments == 'uniprot' or experiments == 'all':
270
- test_indeces['uniprot'] = get_target_split_indices(active_df, active_col, test_split)
271
- if experiments == 'e3_ligase' or experiments == 'all':
272
- test_indeces['e3_ligase'] = get_e3_ligase_split_indices(active_df)
273
- if experiments == 'tanimoto' or experiments == 'all':
274
- test_indeces['tanimoto'] = get_tanimoto_split_indices(active_df, active_col, test_split)
275
 
276
  # Make directory ../reports if it does not exist
277
  if not os.path.exists('../reports'):
@@ -281,23 +285,41 @@ def main(
281
  protein2embedding = pdp.load_protein2embedding('../data/uniprot2embedding.h5')
282
  cell2embedding = pdp.load_cell2embedding('../data/cell2embedding.pkl')
283
 
284
  # Cross-Validation Training
285
  reports = defaultdict(list)
286
- for split_type, indeces in test_indeces.items():
287
- test_df = active_df.loc[indeces].copy()
288
- train_val_df = active_df[~active_df.index.isin(test_df.index)].copy()
289
 
290
  # Get the CV object
291
- if split_type == 'random':
292
  kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
293
  group = None
294
  elif split_type == 'e3_ligase':
295
  kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
296
  group = train_val_df['E3 Group'].to_numpy()
297
- elif split_type == 'tanimoto':
298
  kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
299
  group = train_val_df['Tanimoto Group'].to_numpy()
300
- elif split_type == 'uniprot':
301
  kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
302
  group = train_val_df['Uniprot Group'].to_numpy()
303
 
@@ -326,5 +348,6 @@ def main(
326
  report.to_csv(f'../reports/xgboost_{report_name}_{experiment_name}.csv', index=False)
327
  reports[report_name].append(report.copy())
328
 
 
329
  if __name__ == '__main__':
330
  cli = CLI(main)
 
232
  cv_n_splits: int = 5,
233
  num_boost_round: int = 100,
234
  force_study: bool = False,
235
+ experiments: str | Literal['all', 'standard', 'e3_ligase', 'similarity', 'target'] = 'all',
236
  ):
237
  """ Train a PROTAC model using the given datasets and hyperparameters.
238
 
 
244
  """
245
  pl.seed_everything(42)
246
 
247
+ # # Set the Column to Predict
248
+ # active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
249
+
250
+ # # Get Dmax_threshold from the active_col
251
+ # Dmax_threshold = float(active_col.split('Dmax')[1].split(',')[0].strip('(').strip(')').strip())
252
+ # pDC50_threshold = float(active_col.split('pDC50')[1].strip('(').strip(')').strip())
253
+
254
+ # # Load the PROTAC dataset
255
+ # protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
256
+ # # Map E3 Ligase Iap to IAP
257
+ # protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
258
+ # protac_df[active_col] = protac_df.apply(
259
+ # lambda x: pdp.is_active(x['DC50 (nM)'], x['Dmax (%)'], pDC50_threshold=pDC50_threshold, Dmax_threshold=Dmax_threshold), axis=1
260
+ # )
261
+ # # Drop duplicates
262
+ # protac_df = protac_df.drop_duplicates(subset=['Smiles', 'Uniprot', 'E3 Ligase Uniprot', 'Cell Line Identifier', active_col])
263
+
264
+ # # Precompute fingerprint dictionary and the average Tanimoto similarity
265
+ # smiles2fp, protac_df = get_smiles2fp_and_avg_tanimoto(protac_df)
266
+
267
+ # ## Get the test sets
268
+ # test_indeces = {}
269
+ # active_df = protac_df[protac_df[active_col].notna()].copy()
270
 
271
+ # if experiments == 'standard' or experiments == 'all':
272
+ # test_indeces['standard'] = get_random_split_indices(active_df, test_split)
273
+ # if experiments == 'target' or experiments == 'all':
274
+ # test_indeces['target'] = get_target_split_indices(active_df, active_col, test_split)
275
+ # if experiments == 'similarity' or experiments == 'all':
276
+ # test_indeces['similarity'] = get_tanimoto_split_indices(active_df, active_col, test_split, n_bins_tanimoto=100)
277
+ # if experiments == 'e3_ligase' or experiments == 'all':
278
+ # test_indeces['e3_ligase'] = get_e3_ligase_split_indices(active_df)
279
 
280
  # Make directory ../reports if it does not exist
281
  if not os.path.exists('../reports'):
 
285
  protein2embedding = pdp.load_protein2embedding('../data/uniprot2embedding.h5')
286
  cell2embedding = pdp.load_cell2embedding('../data/cell2embedding.pkl')
287
 
288
+ studies_dir = '../data/studies'
289
+ train_val_perc = f'{int((1 - test_split) * 100)}'
290
+ test_perc = f'{int(test_split * 100)}'
291
+ active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
292
+
293
+ if experiments == 'all':
294
+ experiments = ['standard', 'similarity', 'target']
295
+ else:
296
+ experiments = [experiments]
297
+
298
  # Cross-Validation Training
299
  reports = defaultdict(list)
300
+ for split_type in experiments:
301
+
302
+ train_val_filename = f'{split_type}_train_val_{train_val_perc}split_{active_name}.csv'
303
+ test_filename = f'{split_type}_test_{test_perc}split_{active_name}.csv'
304
+
305
+ train_val_df = pd.read_csv(os.path.join(studies_dir, train_val_filename))
306
+ test_df = pd.read_csv(os.path.join(studies_dir, test_filename))
307
+
308
+ # Get SMILES and precompute fingerprints dictionary
309
+ unique_smiles = pd.concat([train_val_df, test_df])['Smiles'].unique().tolist()
310
+ smiles2fp = {s: np.array(pdp.get_fingerprint(s)) for s in unique_smiles}
311
 
312
  # Get the CV object
313
+ if split_type == 'standard':
314
  kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
315
  group = None
316
  elif split_type == 'e3_ligase':
317
  kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
318
  group = train_val_df['E3 Group'].to_numpy()
319
+ elif split_type == 'similarity':
320
  kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
321
  group = train_val_df['Tanimoto Group'].to_numpy()
322
+ elif split_type == 'target':
323
  kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
324
  group = train_val_df['Uniprot Group'].to_numpy()
325
 
 
348
  report.to_csv(f'../reports/xgboost_{report_name}_{experiment_name}.csv', index=False)
349
  reports[report_name].append(report.copy())
350
 
351
+
352
  if __name__ == '__main__':
353
  cli = CLI(main)
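
Like run_experiments.py, the XGBoost runner now consumes the predefined study CSVs and chooses the CV splitter by split type; its num_boost_round flag presumably maps onto the number of boosting rounds handed to xgboost's training API. A minimal sketch of that last piece on random data (not the repository's tuning code):

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(42)
X = rng.random((64, 8))
y = (rng.random(64) > 0.5).astype(int)

dtrain = xgb.DMatrix(X, label=y)
params = {'objective': 'binary:logistic', 'eval_metric': 'auc'}
booster = xgb.train(params, dtrain, num_boost_round=100)
```
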