Commit 251060c
Parent(s): 1e811f2

Fixed issue with duplicates + Experiments now rely on predefined datasets + Added experiments on simple embeddings

Files changed:
- protac_degradation_predictor/optuna_utils.py (+32 -38)
- protac_degradation_predictor/protac_dataset.py (+6 -4)
- protac_degradation_predictor/pytorch_models.py (+34 -10)
- src/get_studies_datasets.py (+196 -18)
- src/run_experiments.py (+58 -150)
- src/run_experiments_aminoacid_counts.py (+4 -4)
- src/run_experiments_cells_onehot.py (+3 -3)
- src/run_experiments_xgboost.py (+57 -34)
protac_degradation_predictor/optuna_utils.py
CHANGED
@@ -11,7 +11,7 @@ from .protac_dataset import get_datasets
 import torch
 import optuna
-from optuna.samplers import TPESampler
+from optuna.samplers import TPESampler, QMCSampler
 import joblib
 import pandas as pd
 from sklearn.model_selection import (
@@ -117,8 +117,6 @@ def pytorch_model_objective(
         logger_save_dir: str = 'logs',
         logger_name: str = 'cv_model',
         enable_checkpointing: bool = False,
-        use_cells_one_hot: bool = False,
-        use_amino_acid_count: bool = False,
 ) -> float:
     """ Objective function for hyperparameter optimization.
 
@@ -135,17 +133,24 @@ def pytorch_model_objective(
         active_label (str): The active label column.
        disabled_embeddings (List[str]): The list of disabled embeddings.
     """
+    # Set fixed hyperparameters
+    batch_size = 128
+    apply_scaling = True # It is dynamically disabled for binary data
+    use_batch_norm = True
+
     # Suggest hyperparameters to be used accross the CV folds
-    hidden_dim = trial.suggest_categorical('hidden_dim', hidden_dim_options)
-    smote_k_neighbors = trial.suggest_categorical('smote_k_neighbors', smote_k_neighbors_options)
-    …
+    hidden_dim = trial.suggest_int('hidden_dim', 32, 512, step=32)
+    smote_k_neighbors = trial.suggest_int('smote_k_neighbors', 0, 12)
+    # hidden_dim = trial.suggest_categorical('hidden_dim', hidden_dim_options)
+    # smote_k_neighbors = trial.suggest_categorical('smote_k_neighbors', smote_k_neighbors_options)
+    # dropout = trial.suggest_float('dropout', *dropout_options)
+    # use_batch_norm = trial.suggest_categorical('use_batch_norm', [True, False])
+
+    # Optimizer parameters
+    learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-1, log=True)
+    beta1 = trial.suggest_float('beta1', 0.1, 0.999)
+    beta2 = trial.suggest_float('beta2', 0.1, 0.999)
+    eps = trial.suggest_float('eps', 1e-9, 1.0, log=True)
 
     # Start the CV over the folds
     X = train_val_df.copy().drop(columns=active_label)
@@ -185,12 +190,13 @@ def pytorch_model_objective(
             hidden_dim=hidden_dim,
             batch_size=batch_size,
             learning_rate=learning_rate,
-            …
+            beta1=beta1,
+            beta2=beta2,
+            eps=eps,
             use_batch_norm=use_batch_norm,
             max_epochs=max_epochs,
             smote_k_neighbors=smote_k_neighbors,
             apply_scaling=apply_scaling,
-            use_smote=use_smote,
             fast_dev_run=fast_dev_run,
             active_label=active_label,
             return_predictions=True,
@@ -224,18 +230,6 @@ def pytorch_model_objective(
 
     # Optuna aims to minimize the pytorch_model_objective
     return - val_roc_auc
-    # # Get the majority vote for the test predictions
-    # if test_df is not None and not fast_dev_run:
-    #     majority_vote_metrics = get_majority_vote_metrics(test_preds, test_df, active_label)
-    #     majority_vote_metrics.update(get_dataframe_stats(train_df, val_df, test_df, active_label))
-    #     trial.set_user_attr('majority_vote_metrics', majority_vote_metrics)
-    #     logging.info(f'Majority vote metrics: {majority_vote_metrics}')
-
-    # # Get the average validation accuracy and ROC AUC accross the folds
-    # val_roc_auc = np.mean([r['val_roc_auc'] for r in report])
-
-    # # Optuna aims to minimize the pytorch_model_objective
-    # return - val_roc_auc
 
 
 def hyperparameter_tuning_and_training(
@@ -256,8 +250,6 @@ def hyperparameter_tuning_and_training(
         max_epochs: int = 100,
         study_filename: Optional[str] = None,
         force_study: bool = False,
-        use_cells_one_hot: bool = False,
-        use_amino_acid_count: bool = False,
 ) -> tuple:
     """ Hyperparameter tuning and training of a PROTAC model.
 
@@ -285,10 +277,13 @@ def hyperparameter_tuning_and_training(
     """
     pl.seed_everything(42)
 
+    # TODO: Make the following code more modular, i.e., the ranges shall be put
+    # in dictionaries or config files or something like that.
+
     # Define the search space
-    hidden_dim_options = [16, 32, 64, 128, 256] #, 512]
+    hidden_dim_options = [8, 16, 32, 64, 128, 256] #, 512]
     batch_size_options = [128, 128] # [4, 8, 16, 32, 64, 128]
-    learning_rate_options = (1e-6, 1e-1)
+    learning_rate_options = (1e-6, 1e-1) # min and max values for loguniform distribution
     smote_k_neighbors_options = list(range(3, 16))
     # NOTE: We want Optuna to explore the combination (very low dropout, very
     # small hidden_dim)
@@ -296,8 +291,10 @@ def hyperparameter_tuning_and_training(
 
     # Set the verbosity of Optuna
     optuna.logging.set_verbosity(optuna.logging.WARNING)
+    # Set a quasi-random sampler, as suggested in: https://github.com/google-research/tuning_playbook?tab=readme-ov-file#faqs
+    # sampler = TPESampler(seed=42, multivariate=True)
+    sampler = QMCSampler(qmc_type='halton', scramble=True, seed=42)
     # Create an Optuna study object
-    sampler = TPESampler(seed=42, multivariate=True)
    study = optuna.create_study(direction='minimize', sampler=sampler)
 
     study_loaded = False
@@ -328,8 +325,6 @@ def hyperparameter_tuning_and_training(
             active_label=active_label,
             max_epochs=max_epochs,
             disabled_embeddings=[],
-            use_cells_one_hot=use_cells_one_hot,
-            use_amino_acid_count=use_amino_acid_count,
         ),
         n_trials=n_trials,
     )
@@ -360,10 +355,8 @@ def hyperparameter_tuning_and_training(
         disabled_embeddings=[],
         use_logger=True,
         logger_save_dir=logger_save_dir,
-        logger_name=f'{logger_name}…
+        logger_name=f'cv_model_{logger_name}',
         enable_checkpointing=True,
-        use_cells_one_hot=use_cells_one_hot,
-        use_amino_acid_count=use_amino_acid_count,
     )
 
     # Retrain N models with the best hyperparameters (measure model uncertainty)
@@ -385,12 +378,13 @@ def hyperparameter_tuning_and_training(
         disabled_embeddings=[],
         use_logger=True,
         logger_save_dir=logger_save_dir,
-        logger_name=f'{…
+        logger_name=f'best_model_n{i}_{logger_name}',
         enable_checkpointing=True,
         checkpoint_model_name=f'best_model_n{i}_{split_type}',
         return_predictions=True,
         batch_size=128,
         apply_scaling=True,
+        use_batch_norm=True,
        **study.best_params,
     )
     # Rename the keys in the metrics dictionary
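Note: the change above swaps Optuna's TPESampler for QMCSampler, following the linked tuning-playbook advice to prefer quasi-random search while exploring. A minimal self-contained sketch of the new sampling setup; the toy objective below is illustrative and merely stands in for pytorch_model_objective:

import optuna
from optuna.samplers import QMCSampler

def objective(trial: optuna.Trial) -> float:
    # Same style of search space as in pytorch_model_objective
    hidden_dim = trial.suggest_int('hidden_dim', 32, 512, step=32)
    learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-1, log=True)
    # Toy score standing in for the negated validation ROC AUC
    return (hidden_dim / 512) * learning_rate

# A scrambled Halton sequence gives low-discrepancy, reproducible coverage of the space
sampler = QMCSampler(qmc_type='halton', scramble=True, seed=42)
study = optuna.create_study(direction='minimize', sampler=sampler)
study.optimize(objective, n_trials=20)
print(study.best_params)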
protac_degradation_predictor/protac_dataset.py
CHANGED
@@ -319,7 +319,6 @@ def get_datasets(
     protein2embedding: Dict = None,
     cell2embedding: Dict = None,
     smiles2fp: Dict = None,
-    use_smote: bool = True,
     smote_k_neighbors: int = 5,
     active_label: str = 'Active',
     disabled_embeddings: List[Literal['smiles', 'poi', 'e3', 'cell']] = [],
@@ -345,14 +344,17 @@ def get_datasets(
         use_single_scaler (bool): Whether to use a single scaler for all features.
         apply_scaling (bool): Whether to apply scaling to the data now. Defaults to False (the Pytorch Lightning model does that).
     """
-    …
+    if smote_k_neighbors:
+        oversampler = SMOTE(k_neighbors=smote_k_neighbors, random_state=42)
+    else:
+        oversampler = None
     train_ds = PROTAC_Dataset(
         train_df,
         protein2embedding,
         cell2embedding,
         smiles2fp,
-        use_smote=use_smote,
-        oversampler=oversampler…
+        use_smote=True if smote_k_neighbors else False,
+        oversampler=oversampler,
         active_label=active_label,
         disabled_embeddings=disabled_embeddings,
         scaler=scaler,
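Note: get_datasets now derives the oversampler from smote_k_neighbors alone (0 disables SMOTE) rather than taking a separate use_smote flag. A sketch of the same gating on synthetic data, assuming SMOTE is imbalanced-learn's implementation, as the k_neighbors/random_state arguments suggest:

import numpy as np
from imblearn.over_sampling import SMOTE

rng = np.random.default_rng(42)
X = rng.normal(size=(100, 8))
y = np.array([1] * 20 + [0] * 80)  # imbalanced binary labels

smote_k_neighbors = 5  # a value of 0 would skip oversampling entirely
oversampler = SMOTE(k_neighbors=smote_k_neighbors, random_state=42) if smote_k_neighbors else None

if oversampler is not None:
    X_res, y_res = oversampler.fit_resample(X, y)
    print(X_res.shape, np.bincount(y_res))  # minority class synthetically upsampled to 80/80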
protac_degradation_predictor/pytorch_models.py
CHANGED
@@ -171,6 +171,7 @@ class PROTAC_Model(pl.LightningModule):
         test_dataset: PROTAC_Dataset = None,
         disabled_embeddings: List[Literal['smiles', 'poi', 'e3', 'cell']] = [],
         apply_scaling: bool = True,
+        extra_optim_params: Optional[dict] = None,
     ):
         """ Initialize the PROTAC Pytorch Lightning model.
 
@@ -189,6 +190,7 @@ class PROTAC_Model(pl.LightningModule):
             test_dataset (PROTAC_Dataset): The test dataset
             disabled_embeddings (list): List of disabled embeddings. Can be 'poi', 'e3', 'cell', 'smiles'
             apply_scaling (bool): Whether to apply scaling to the embeddings
+            extra_optim_params (dict): Extra parameters for the optimizer
         """
         super().__init__()
         # Set our init args as class attributes
@@ -328,15 +330,31 @@ class PROTAC_Model(pl.LightningModule):
         return self.step(batch, batch_idx, 'test')
 
     def configure_optimizers(self):
-        optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
+        # Define optimizer
+        if self.extra_optim_params is not None:
+            optimizer = optim.Adam(self.parameters(), lr=self.learning_rate, **self.extra_optim_params)
+        else:
+            optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
+        # Define LR scheduler
+        if self.trainer.max_epochs:
+            total_iters = self.trainer.max_epochs
+        elif self.trainer.max_steps:
+            total_iters = self.trainer.max_steps
+        else:
+            total_iters = 20
+        lr_scheduler = optim.lr_scheduler.LinearLR(
+            optimizer=optimizer,
+            total_iters=total_iters,
+        )
+        # lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
+        #     optimizer=optimizer,
+        #     mode='min',
+        #     factor=0.01,
+        #     patience=0,
+        # )
         return {
             'optimizer': optimizer,
-            'lr_scheduler': optim.lr_scheduler.ReduceLROnPlateau(
-                optimizer=optimizer,
-                mode='min',
-                factor=0.1,
-                patience=0,
-            ),
+            'lr_scheduler': lr_scheduler,
             'interval': 'step', # or 'epoch'
             'frequency': 1,
             'monitor': 'val_loss',
@@ -411,12 +429,14 @@ def train_model(
         hidden_dim: int = 768,
         batch_size: int = 128,
         learning_rate: float = 2e-5,
+        beta1: float = 0.9,
+        beta2: float = 0.999,
+        eps: float = 1e-8,
         dropout: float = 0.2,
         max_epochs: int = 50,
         use_batch_norm: bool = False,
         join_embeddings: Literal['beginning', 'concat', 'sum'] = 'sum',
-        smote_k_neighbors:int = 5,
-        use_smote: bool = True,
+        smote_k_neighbors: int = 5,
         apply_scaling: bool = True,
         active_label: str = 'Active',
         fast_dev_run: bool = False,
@@ -468,7 +488,6 @@ def train_model(
         protein2embedding,
         cell2embedding,
         smiles2fp,
-        use_smote=use_smote,
         smote_k_neighbors=smote_k_neighbors,
         active_label=active_label,
         disabled_embeddings=disabled_embeddings,
@@ -540,6 +559,10 @@ def train_model(
         devices=1,
         num_nodes=1,
     )
+    extra_optim_params = {
+        'betas': (beta1, beta2),
+        'eps': eps,
+    }
     model = PROTAC_Model(
         hidden_dim=hidden_dim,
         smiles_emb_dim=smiles_emb_dim,
@@ -556,6 +579,7 @@ def train_model(
         val_dataset=val_ds,
         test_dataset=test_ds if test_df is not None else None,
         disabled_embeddings=disabled_embeddings,
+        extra_optim_params=extra_optim_params,
     )
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
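Note: configure_optimizers now threads Adam's betas and eps through extra_optim_params and replaces ReduceLROnPlateau with a LinearLR schedule stepped once per training step. A standalone sketch of the same optimizer/scheduler pattern outside Lightning:

import torch
from torch import nn, optim

model = nn.Linear(16, 1)
extra_optim_params = {'betas': (0.9, 0.999), 'eps': 1e-8}  # what train_model now assembles

optimizer = optim.Adam(model.parameters(), lr=2e-5, **extra_optim_params)
# LinearLR ramps the rate from lr * start_factor (default 1/3) up to lr over total_iters steps
scheduler = optim.lr_scheduler.LinearLR(optimizer, total_iters=100)

for step in range(100):
    optimizer.zero_grad()
    loss = model(torch.randn(4, 16)).pow(2).mean()
    loss.backward()
    optimizer.step()
    scheduler.step()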
src/get_studies_datasets.py
CHANGED
@@ -1,5 +1,6 @@
 import os
 import sys
+from typing import Dict
 
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 import protac_degradation_predictor as pdp
@@ -10,6 +11,7 @@
 from typing import Literal
 
 from sklearn.preprocessing import OrdinalEncoder
+from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold
 from tqdm import tqdm
 import pandas as pd
 import numpy as np
@@ -109,7 +111,7 @@ def get_tanimoto_split_indices(
         active_df: pd.DataFrame,
         active_col: str,
         test_split: float,
-        n_bins_tanimoto: int = 200
+        n_bins_tanimoto: int = 100, # Original: 200
 ) -> pd.Index:
     """ Get the indices of the test set using the Tanimoto-based split.
 
@@ -154,9 +156,11 @@ def get_tanimoto_split_indices(
             test_df.append(group_df)
             continue
         # Be more selective and make sure that the percentage of active and
-        # inactive is …
-        …
+        # inactive is not over-exceeding 60%
+        perc_active_group = (num_active_group + num_active_test) / (num_entries_test + num_entries)
+        perc_inactive_group = (num_inactive_group + num_inactive_test) / (num_entries_test + num_entries)
+        if perc_active_group < 0.6:
+            if perc_inactive_group < 0.6:
                 test_df.append(group_df)
     test_df = pd.concat(test_df)
     return test_df.index
@@ -212,10 +216,130 @@ def get_target_split_indices(active_df: pd.DataFrame, active_col: str, test_split
     return test_df.index
 
 
+def get_dataframe_stats(
+        train_df = None,
+        val_df = None,
+        test_df = None,
+        active_label = 'Active',
+    ) -> Dict:
+    """ Get some statistics from the dataframes.
+
+    Args:
+        train_df (pd.DataFrame): The training set.
+        val_df (pd.DataFrame): The validation set.
+        test_df (pd.DataFrame): The test set.
+    """
+    stats = {}
+    if train_df is not None:
+        stats['train_len'] = len(train_df)
+        stats['train_active_perc'] = train_df[active_label].sum() / len(train_df)
+        stats['train_inactive_perc'] = (len(train_df) - train_df[active_label].sum()) / len(train_df)
+        stats['train_avg_tanimoto_dist'] = train_df['Avg Tanimoto'].mean()
+    if val_df is not None:
+        stats['val_len'] = len(val_df)
+        stats['val_active_perc'] = val_df[active_label].sum() / len(val_df)
+        stats['val_inactive_perc'] = (len(val_df) - val_df[active_label].sum()) / len(val_df)
+        stats['val_avg_tanimoto_dist'] = val_df['Avg Tanimoto'].mean()
+    if test_df is not None:
+        stats['test_len'] = len(test_df)
+        stats['test_active_perc'] = test_df[active_label].sum() / len(test_df)
+        stats['test_inactive_perc'] = (len(test_df) - test_df[active_label].sum()) / len(test_df)
+        stats['test_avg_tanimoto_dist'] = test_df['Avg Tanimoto'].mean()
+    if train_df is not None and val_df is not None:
+        leaking_uniprot = list(set(train_df['Uniprot']).intersection(set(val_df['Uniprot'])))
+        leaking_smiles = list(set(train_df['Smiles']).intersection(set(val_df['Smiles'])))
+        stats['num_leaking_uniprot_train_val'] = len(leaking_uniprot)
+        stats['num_leaking_smiles_train_val'] = len(leaking_smiles)
+        stats['perc_leaking_uniprot_train_val'] = len(train_df[train_df['Uniprot'].isin(leaking_uniprot)]) / len(train_df)
+        stats['perc_leaking_smiles_train_val'] = len(train_df[train_df['Smiles'].isin(leaking_smiles)]) / len(train_df)
+
+        key_cols = [
+            'Smiles',
+            'Uniprot',
+            'E3 Ligase Uniprot',
+            'Cell Line Identifier',
+        ]
+        class_cols = ['DC50 (nM)', 'Dmax (%)']
+        # Check if there are any entries that are in BOTH train and val sets
+        tmp_train_df = train_df[key_cols + class_cols].copy()
+        tmp_val_df = val_df[key_cols + class_cols].copy()
+        stats['leaking_train_val'] = len(tmp_train_df.merge(tmp_val_df, on=key_cols + class_cols, how='inner'))
+
+    if train_df is not None and test_df is not None:
+        leaking_uniprot = list(set(train_df['Uniprot']).intersection(set(test_df['Uniprot'])))
+        leaking_smiles = list(set(train_df['Smiles']).intersection(set(test_df['Smiles'])))
+        stats['num_leaking_uniprot_train_test'] = len(leaking_uniprot)
+        stats['num_leaking_smiles_train_test'] = len(leaking_smiles)
+        stats['perc_leaking_uniprot_train_test'] = len(train_df[train_df['Uniprot'].isin(leaking_uniprot)]) / len(train_df)
+        stats['perc_leaking_smiles_train_test'] = len(train_df[train_df['Smiles'].isin(leaking_smiles)]) / len(train_df)
+
+        key_cols = [
+            'Smiles',
+            'Uniprot',
+            'E3 Ligase Uniprot',
+            'Cell Line Identifier',
+        ]
+        class_cols = ['DC50 (nM)', 'Dmax (%)']
+        # Check if there are any entries that are in BOTH train and test sets
+        tmp_train_df = train_df[key_cols + class_cols].copy()
+        tmp_test_df = test_df[key_cols + class_cols].copy()
+        stats['leaking_train_test'] = len(tmp_train_df.merge(tmp_test_df, on=key_cols + class_cols, how='inner'))
+
+    return stats
+
+
+def merge_numerical_cols(group):
+    key_cols = [
+        'Smiles',
+        'Uniprot',
+        'E3 Ligase Uniprot',
+        'Cell Line Identifier',
+    ]
+    class_cols = ['DC50 (nM)', 'Dmax (%)']
+    # Loop over all numerical columns
+    for col in group.select_dtypes(include=[np.number]).columns:
+        if col == 'Compound ID':
+            continue
+        # Compute the geometric mean for the column
+        values = group[col].dropna()
+        if not values.empty:
+            group[col] = np.prod(values) ** (1 / len(values))
+
+    row = group.drop_duplicates(subset=key_cols + class_cols).reset_index(drop=True)
+
+    assert len(row) == 1
+
+    return row
+
+
+def remove_duplicates(df):
+    key_cols = [
+        'Smiles',
+        'Uniprot',
+        'E3 Ligase Uniprot',
+        'Cell Line Identifier',
+    ]
+    class_cols = ['DC50 (nM)', 'Dmax (%)']
+    # Check if there are any duplicated entries having the same key columns, if
+    # so, merge them by applying a geometric mean to their DC50 and Dmax columns
+    duplicated = df[df.duplicated(subset=key_cols, keep=False)]
+
+    # NOTE: Reset index to remove the multi-index
+    merged = duplicated.groupby(key_cols).apply(lambda x: merge_numerical_cols(x))
+    merged = merged.reset_index(drop=True)
+
+    # Remove the duplicated entries from the original dataframe df
+    df = df[~df.duplicated(subset=key_cols, keep=False)]
+    # Concatenate the merged dataframe with the original dataframe
+    return pd.concat([df, merged], ignore_index=True)
+
+
 def main(
     active_col: str = 'Active (Dmax 0.6, pDC50 6.0)',
     test_split: float = 0.1,
     studies: str | Literal['all', 'standard', 'e3_ligase', 'similarity', 'target'] = 'all',
+    cv_n_splits: int = 5,
 ):
     """ Get and save the datasets for the different studies.
 
@@ -237,49 +361,103 @@ def main(
     protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
     # Map E3 Ligase Iap to IAP
     protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
+
+    # Remove duplicates
+    protac_df = remove_duplicates(protac_df)
+
+    # Remove legacy columns if they exist
+    if 'Active - OR' in protac_df.columns:
+        protac_df.drop(columns='Active - OR', inplace=True)
+    if 'Active - AND' in protac_df.columns:
+        protac_df.drop(columns='Active - AND', inplace=True)
+    if 'Active' in protac_df.columns:
+        protac_df.drop(columns='Active', inplace=True)
+
+    # Calculate Activity and add it as a column
     protac_df[active_col] = protac_df.apply(
         lambda x: pdp.is_active(x['DC50 (nM)'], x['Dmax (%)'], pDC50_threshold=pDC50_threshold, Dmax_threshold=Dmax_threshold), axis=1
     )
+
+    # Precompute fingerprints and average Tanimoto similarity
     _, protac_df = get_smiles2fp_and_avg_tanimoto(protac_df)
 
     ## Get the test sets
     test_indeces = {}
     active_df = protac_df[protac_df[active_col].notna()].copy()
 
-    # Remove legacy column 'Active - OR' if it exists
-    if 'Active - OR' in active_df.columns:
-        active_df.drop(columns='Active - OR', inplace=True)
-
     if studies == 'standard' or studies == 'all':
         test_indeces['standard'] = get_random_split_indices(active_df, test_split)
     if studies == 'target' or studies == 'all':
         test_indeces['target'] = get_target_split_indices(active_df, active_col, test_split)
-    if studies == 'e3_ligase' or studies == 'all':
-        test_indeces['e3_ligase'] = get_e3_ligase_split_indices(active_df)
     if studies == 'similarity' or studies == 'all':
         test_indeces['similarity'] = get_tanimoto_split_indices(active_df, active_col, test_split)
+    # if studies == 'e3_ligase' or studies == 'all':
+    #     test_indeces['e3_ligase'] = get_e3_ligase_split_indices(active_df)
 
     # Make directory for studies datasets if it does not exist
     data_dir = '../data/studies'
     if not os.path.exists(data_dir):
         os.makedirs(data_dir)
 
-    # …
-    …
-    # Save the datasets
+    # Open file for reporting
+    with open(f'{data_dir}/report_datasets.md', 'w') as f:
+        # Cross-Validation Training
+        for split_type, indeces in test_indeces.items():
+            test_df = active_df.loc[indeces].copy()
+            train_val_df = active_df[~active_df.index.isin(test_df.index)].copy()
+
+            # Print statistics on active/inactive percentages
+            perc_active = train_val_df[active_col].sum() / len(train_val_df)
+            print('-' * 80)
+            print(f'{split_type.capitalize()} Split')
+            print(f'Len Train/Val:{len(train_val_df)}')
+            print(f'Len Test: {len(test_df)}')
+            print(f'Percentage Active in Train/Val: {perc_active:.2%}')
+            print(f'Percentage Inactive in Train/Val: {1 - perc_active:.2%}')
+
+            # Get the CV object
+            if split_type == 'standard':
+                kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
+                group = None
+            elif split_type == 'e3_ligase':
+                kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
+                group = train_val_df['E3 Group'].to_numpy()
+            elif split_type == 'similarity':
+                kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
+                group = train_val_df['Tanimoto Group'].to_numpy()
+            elif split_type == 'target':
+                kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
+                group = train_val_df['Uniprot Group'].to_numpy()
+
+            # Get the folds on the train_val_df, then collect statistics on active/inactive percentages
+            stats = []
+            for i, (train_index, val_index) in enumerate(kf.split(train_val_df, train_val_df[active_col].to_list(), group)):
+                train_df = train_val_df.iloc[train_index]
+                val_df = train_val_df.iloc[val_index]
+
+                s = get_dataframe_stats(train_df, val_df, test_df, active_col)
+                s['fold'] = i + 1
+                stats.append(s)
+
+            # Append the statistics as markdown to report file f
+            stats_df = pd.DataFrame(stats)
+            f.write(f'## {split_type.capitalize()} Split\n\n')
+            f.write(stats_df.to_markdown(index=False))
+            f.write('\n\n')
+            print('-' * 80)
 
+
+            # Save the datasets
             train_val_perc = f'{int((1 - test_split) * 100)}'
             test_perc = f'{int(test_split * 100)}'
 
             train_val_filename = f'{data_dir}/{split_type}_train_val_{train_val_perc}split_{active_name}.csv'
             test_filename = f'{data_dir}/{split_type}_test_{test_perc}split_{active_name}.csv'
 
-            print('')
-            print(f'Saving train_val datasets as: {train_val_filename}')
-            print(f'Saving test datasets as: {test_filename}')
+            # print('')
+            # print(f'Saving train_val datasets as: {train_val_filename}')
+            # print(f'Saving test datasets as: {test_filename}')
 
             train_val_df.to_csv(train_val_filename, index=False)
             test_df.to_csv(test_filename, index=False)
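Note: remove_duplicates collapses rows that share the (Smiles, Uniprot, E3 Ligase Uniprot, Cell Line Identifier) keys by taking the geometric mean of their numeric columns, the natural average for log-scale readouts such as DC50. A toy sketch of the same groupby pattern:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'Smiles': ['C1', 'C1', 'C2'],
    'Uniprot': ['P1', 'P1', 'P2'],
    'DC50 (nM)': [10.0, 1000.0, 50.0],
})
key_cols = ['Smiles', 'Uniprot']

def geometric_mean_merge(group: pd.DataFrame) -> pd.DataFrame:
    for col in group.select_dtypes(include=[np.number]).columns:
        values = group[col].dropna()
        if not values.empty:
            group[col] = np.prod(values) ** (1 / len(values))
    return group.drop_duplicates(subset=key_cols).reset_index(drop=True)

merged = df.groupby(key_cols, group_keys=False).apply(geometric_mean_merge)
print(merged)  # the two C1/P1 rows collapse into one with DC50 = sqrt(10 * 1000) = 100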
src/run_experiments.py
CHANGED
@@ -233,7 +233,7 @@ def main(
     max_epochs: int = 100,
     run_sklearn: bool = False,
     force_study: bool = False,
-    experiments: str | Literal['all', '…
+    experiments: str | Literal['all', 'standard', 'e3_ligase', 'similarity', 'target'] = 'all',
 ):
     """ Train a PROTAC model using the given datasets and hyperparameters.
 
@@ -250,34 +250,39 @@ def main(
     """
     pl.seed_everything(42)
 
-    # Set the Column to Predict
-    active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
-    …
-    test_indeces = {}
-    active_df = protac_df[protac_df[active_col].notna()].copy()
-    if experiments == '…
-    …
+    # # Set the Column to Predict
+    # active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
+
+    # # Get Dmax_threshold from the active_col
+    # Dmax_threshold = float(active_col.split('Dmax')[1].split(',')[0].strip('(').strip(')').strip())
+    # pDC50_threshold = float(active_col.split('pDC50')[1].strip('(').strip(')').strip())
+
+    # # Load the PROTAC dataset
+    # protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
+
+    # # Map E3 Ligase Iap to IAP
+    # protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
+    # protac_df[active_col] = protac_df.apply(
+    #     lambda x: pdp.is_active(x['DC50 (nM)'], x['Dmax (%)'], pDC50_threshold=pDC50_threshold, Dmax_threshold=Dmax_threshold), axis=1
+    # )
+    # # Drop duplicates
+    # protac_df = protac_df.drop_duplicates(subset=['Smiles', 'Uniprot', 'E3 Ligase Uniprot', 'Cell Line Identifier', active_col])
+
+    # # Precompute fingerprints and average Tanimoto similarity
+    # smiles2fp, protac_df = get_smiles2fp_and_avg_tanimoto(protac_df)
+
+    # ## Get the test sets
+    # test_indeces = {}
+    # active_df = protac_df[protac_df[active_col].notna()].copy()
 
+    # if experiments == 'standard' or experiments == 'all':
+    #     test_indeces['standard'] = get_random_split_indices(active_df, test_split)
+    # if experiments == 'target' or experiments == 'all':
+    #     test_indeces['target'] = get_target_split_indices(active_df, active_col, test_split)
+    # if experiments == 'similarity' or experiments == 'all':
+    #     test_indeces['similarity'] = get_tanimoto_split_indices(active_df, active_col, test_split, n_bins_tanimoto=100)
+    # if experiments == 'e3_ligase' or experiments == 'all':
+    #     test_indeces['e3_ligase'] = get_e3_ligase_split_indices(active_df)
 
     # Make directory ../reports if it does not exist
     if not os.path.exists('../reports'):
@@ -287,28 +292,46 @@ def main(
     protein2embedding = pdp.load_protein2embedding('../data/uniprot2embedding.h5')
     cell2embedding = pdp.load_cell2embedding('../data/cell2embedding.pkl')
 
+    studies_dir = '../data/studies'
+    train_val_perc = f'{int((1 - test_split) * 100)}'
+    test_perc = f'{int(test_split * 100)}'
+    active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
+
+    if experiments == 'all':
+        experiments = ['standard', 'similarity', 'target']
+    else:
+        experiments = [experiments]
+
     # Cross-Validation Training
     reports = defaultdict(list)
-    for split_type…
-    …
+    for split_type in experiments:
+
+        train_val_filename = f'{split_type}_train_val_{train_val_perc}split_{active_name}.csv'
+        test_filename = f'{split_type}_test_{test_perc}split_{active_name}.csv'
+
+        train_val_df = pd.read_csv(os.path.join(studies_dir, train_val_filename))
+        test_df = pd.read_csv(os.path.join(studies_dir, test_filename))
+
+        # Get SMILES and precompute fingerprints dictionary
+        unique_smiles = pd.concat([train_val_df, test_df])['Smiles'].unique().tolist()
+        smiles2fp = {s: np.array(pdp.get_fingerprint(s)) for s in unique_smiles}
+
         # Get the CV object
-        if split_type == '…
+        if split_type == 'standard':
             kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
             group = None
         elif split_type == 'e3_ligase':
             kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
             group = train_val_df['E3 Group'].to_numpy()
-        elif split_type == '…
+        elif split_type == 'similarity':
             kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
             group = train_val_df['Tanimoto Group'].to_numpy()
-        elif split_type == '…
+        elif split_type == 'target':
             kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
             group = train_val_df['Uniprot Group'].to_numpy()
 
         # Start the experiment
-        experiment_name = f'{active_name}_test_split_{test_split}…
+        experiment_name = f'{split_type}_{active_name}_test_split_{test_split}'
         optuna_reports = pdp.hyperparameter_tuning_and_training(
             protein2embedding=protein2embedding,
             cell2embedding=cell2embedding,
@@ -323,7 +346,7 @@ def main(
             n_trials=n_trials,
             max_epochs=max_epochs,
             logger_save_dir='../logs',
-            logger_name=f'…
+            logger_name=f'{experiment_name}',
             active_label=active_col,
             study_filename=f'../reports/study_{experiment_name}.pkl',
             force_study=force_study,
@@ -334,121 +357,6 @@ def main(
         report.to_csv(f'../reports/{report_name}_{experiment_name}.csv', index=False)
         reports[report_name].append(report.copy())
 
-        # # Start the CV over the folds
-        # X = train_val_df.drop(columns=active_col)
-        # y = train_val_df[active_col].tolist()
-        # for k, (train_index, val_index) in enumerate(kf.split(X, y, group)):
-        #     print('-' * 100)
-        #     print(f'Starting CV for group type: {split_type}, fold: {k}')
-        #     print('-' * 100)
-        #     train_df = train_val_df.iloc[train_index]
-        #     val_df = train_val_df.iloc[val_index]
-
-        #     leaking_uniprot = list(set(train_df['Uniprot']).intersection(set(val_df['Uniprot'])))
-        #     leaking_smiles = list(set(train_df['Smiles']).intersection(set(val_df['Smiles'])))
-
-        #     stats = {
-        #         'fold': k,
-        #         'split_type': split_type,
-        #         'train_len': len(train_df),
-        #         'val_len': len(val_df),
-        #         'train_perc': len(train_df) / len(train_val_df),
-        #         'val_perc': len(val_df) / len(train_val_df),
-        #         'train_active_perc': train_df[active_col].sum() / len(train_df),
-        #         'train_inactive_perc': (len(train_df) - train_df[active_col].sum()) / len(train_df),
-        #         'val_active_perc': val_df[active_col].sum() / len(val_df),
-        #         'val_inactive_perc': (len(val_df) - val_df[active_col].sum()) / len(val_df),
-        #         'test_active_perc': test_df[active_col].sum() / len(test_df),
-        #         'test_inactive_perc': (len(test_df) - test_df[active_col].sum()) / len(test_df),
-        #         'num_leaking_uniprot': len(leaking_uniprot),
-        #         'num_leaking_smiles': len(leaking_smiles),
-        #         'train_leaking_uniprot_perc': len(train_df[train_df['Uniprot'].isin(leaking_uniprot)]) / len(train_df),
-        #         'train_leaking_smiles_perc': len(train_df[train_df['Smiles'].isin(leaking_smiles)]) / len(train_df),
-        #     }
-        #     if split_type != 'random':
-        #         stats['train_unique_groups'] = len(np.unique(group[train_index]))
-        #         stats['val_unique_groups'] = len(np.unique(group[val_index]))
-
-        #     # At each fold, train and evaluate the Pytorch model
-        #     if split_type != 'tanimoto' or run_sklearn:
-        #         logging.info(f'Skipping Pytorch model training on fold {k} with split type {split_type} and test split {test_split}.')
-        #         continue
-        #     else:
-        #         logging.info(f'Starting Pytorch model training on fold {k} with split type {split_type} and test split {test_split}.')
-        #     # Train and evaluate the model
-        #     model, trainer, metrics = pdp.hyperparameter_tuning_and_training(
-        #         protein2embedding,
-        #         cell2embedding,
-        #         smiles2fp,
-        #         train_df,
-        #         val_df,
-        #         test_df,
-        #         fast_dev_run=fast_dev_run,
-        #         n_trials=n_trials,
-        #         logger_name=f'protac_{active_name}_{split_type}_fold_{k}_test_split_{test_split}',
-        #         active_label=active_col,
-        #         study_filename=f'../reports/study_{active_name}_{split_type}_fold_{k}_test_split_{test_split}.pkl',
-        #     )
-        #     hparams = {p.replace('hparam_', ''): v for p, v in stats.items() if p.startswith('hparam_')}
-        #     stats.update(metrics)
-        #     stats['model_type'] = 'Pytorch'
-        #     report.append(stats.copy())
-        #     del model
-        #     del trainer
-
-        #     # Ablation study: disable embeddings at a time
-        #     for disabled_embeddings in [['e3'], ['poi'], ['cell'], ['smiles'], ['e3', 'cell'], ['poi', 'e3', 'cell']]:
-        #         print('-' * 100)
-        #         print(f'Ablation study with disabled embeddings: {disabled_embeddings}')
-        #         print('-' * 100)
-        #         stats['disabled_embeddings'] = 'disabled ' + ' '.join(disabled_embeddings)
-        #         model, trainer, metrics = pdp.train_model(
-        #             protein2embedding,
-        #             cell2embedding,
-        #             smiles2fp,
-        #             train_df,
-        #             val_df,
-        #             test_df,
-        #             fast_dev_run=fast_dev_run,
-        #             logger_name=f'protac_{active_name}_{split_type}_fold_{k}_disabled-{"-".join(disabled_embeddings)}',
-        #             active_label=active_col,
-        #             disabled_embeddings=disabled_embeddings,
-        #             **hparams,
-        #         )
-        #         stats.update(metrics)
-        #         report.append(stats.copy())
-        #         del model
-        #         del trainer
-
-        #     # At each fold, train and evaluate sklearn models
-        #     if run_sklearn:
-        #         for model_type in ['RandomForest', 'SVC', 'LogisticRegression', 'GradientBoosting']:
-        #             logging.info(f'Starting sklearn model {model_type} training on fold {k} with split type {split_type} and test split {test_split}.')
-        #             # Train and evaluate sklearn models
-        #             model, metrics = pdp.hyperparameter_tuning_and_training_sklearn(
-        #                 protein2embedding=protein2embedding,
-        #                 cell2embedding=cell2embedding,
-        #                 smiles2fp=smiles2fp,
-        #                 train_df=train_df,
-        #                 val_df=val_df,
-        #                 test_df=test_df,
-        #                 model_type=model_type,
-        #                 active_label=active_col,
-        #                 n_trials=n_trials,
-        #                 study_filename=f'../reports/study_{active_name}_{split_type}_fold_{k}_test_split_{test_split}_{model_type.lower()}.pkl',
-        #             )
-        #             hparams = {p.replace('hparam_', ''): v for p, v in stats.items() if p.startswith('hparam_')}
-        #             stats['model_type'] = model_type
-        #             stats.update(metrics)
-        #             report.append(stats.copy())
-
-        #     # Save the report at the end of each split type
-        #     report_df = pd.DataFrame(report)
-        #     report_df.to_csv(
-        #         f'../reports/cv_report_hparam_search_{cv_n_splits}-splits_{active_name}_test_split_{test_split}{"_sklearn" if run_sklearn else ""}.csv',
-        #         index=False,
-        #     )
-
 
 if __name__ == '__main__':
     cli = CLI(main)
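Note: the script now reads the train/val and test CSVs produced by get_studies_datasets.py and only builds the CV splitter locally. For the similarity and target studies, StratifiedGroupKFold keeps each Tanimoto or Uniprot group inside a single fold while roughly preserving the class balance, which is what prevents leakage across folds. A minimal sketch of that guarantee:

import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

rng = np.random.default_rng(42)
y = rng.integers(0, 2, size=60)       # binary activity labels
groups = np.repeat(np.arange(12), 5)  # e.g. 12 Uniprot groups with 5 entries each

kf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(kf.split(np.zeros(len(y)), y, groups)):
    # No group ever appears on both sides of a fold
    assert not set(groups[train_idx]) & set(groups[val_idx])
    print(f'fold {fold}: {len(train_idx)} train / {len(val_idx)} val')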
src/run_experiments_aminoacid_counts.py
CHANGED
@@ -59,6 +59,7 @@ def main(
     force_study (bool): Whether to force the creation of a new study.
     experiments (str): Type of experiments to run. Options are 'all', 'standard', 'e3_ligase', 'similarity', 'target'.
     """
+    pl.seed_everything(42)
 
     # Make directory ../reports if it does not exist
     if not os.path.exists('../reports'):
@@ -81,7 +82,7 @@ def main(
     # NOTE: Check that the protein2embedding is a dictionary of strings
     if not all(isinstance(k, str) for k in protein2embedding.keys()):
         raise ValueError("All keys in `protein2embedding` must be strings.")
-    countvec = CountVectorizer(ngram_range=(1,1), analyzer='char')
+    countvec = CountVectorizer(ngram_range=(1, 1), analyzer='char')
     protein_embeddings = countvec.fit_transform(
         list(protein2embedding.keys())
     ).toarray()
@@ -126,7 +127,7 @@ def main(
             group = train_val_df['Uniprot Group'].to_numpy()
 
         # Start the experiment
-        experiment_name = f'{active_name}_test_split_{test_split}…
+        experiment_name = f'{split_type}_{active_name}_test_split_{test_split}'
         optuna_reports = pdp.hyperparameter_tuning_and_training(
             protein2embedding=protein2embedding,
             cell2embedding=cell2embedding,
@@ -141,11 +142,10 @@ def main(
             n_trials=n_trials,
             max_epochs=max_epochs,
             logger_save_dir='../logs',
-            logger_name=f'…
+            logger_name=f'aminoacidcnt_{experiment_name}',
             active_label=active_col,
             study_filename=f'../reports/study_aminoacidcnt_{experiment_name}.pkl',
             force_study=force_study,
-            use_amino_acid_count=True,
         )
 
         # Save the reports to file
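Note: this experiment replaces learned protein embeddings with simple per-residue counts; CountVectorizer(analyzer='char', ngram_range=(1, 1)) maps each sequence to a vector of character counts. A sketch on toy sequences:

from sklearn.feature_extraction.text import CountVectorizer

sequences = ['MKTAYIAKQR', 'MKKLLPTAAA']  # toy protein sequences
countvec = CountVectorizer(ngram_range=(1, 1), analyzer='char')
counts = countvec.fit_transform(sequences).toarray()

print(countvec.get_feature_names_out())  # the residue alphabet seen in the data (lowercased by default)
print(counts)                            # one row of per-character counts per sequence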
src/run_experiments_cells_onehot.py
CHANGED
@@ -61,6 +61,7 @@ def main(
     force_study (bool): Whether to force the creation of a new study.
     experiments (str): Type of experiments to run. Options are 'all', 'standard', 'e3_ligase', 'similarity', 'target'.
     """
+    pl.seed_everything(42)
 
     # Make directory ../reports if it does not exist
     if not os.path.exists('../reports'):
@@ -116,7 +117,7 @@ def main(
             group = train_val_df['Uniprot Group'].to_numpy()
 
         # Start the experiment
-        experiment_name = f'{active_name}_test_split_{test_split}…
+        experiment_name = f'{split_type}_{active_name}_test_split_{test_split}'
         optuna_reports = pdp.hyperparameter_tuning_and_training(
             protein2embedding=protein2embedding,
             cell2embedding=cell2embedding,
@@ -131,11 +132,10 @@ def main(
             n_trials=n_trials,
             max_epochs=max_epochs,
             logger_save_dir='../logs',
-            logger_name=f'…
+            logger_name=f'cellsonehot_{experiment_name}',
             active_label=active_col,
             study_filename=f'../reports/study_cellsonehot_{experiment_name}.pkl',
             force_study=force_study,
-            use_cells_one_hot=True,
         )
 
         # Save the reports to file
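Note: judging by its name and the removed use_cells_one_hot plumbing, this script substitutes a one-hot encoding for the learned cell-line embedding before tuning. A hypothetical sketch of that substitution; the dictionary contents below are illustrative only:

import numpy as np

# Hypothetical cell2embedding dict, shaped like the one the script loads
cell2embedding = {'HeLa': np.zeros(768), 'HEK293': np.zeros(768), 'MCF7': np.zeros(768)}

# Replace each learned embedding with a one-hot vector over the known cell lines
cell_lines = sorted(cell2embedding.keys())
onehot = np.eye(len(cell_lines))
cell2embedding = {name: onehot[i] for i, name in enumerate(cell_lines)}

print(cell2embedding['HEK293'])  # [1. 0. 0.]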
src/run_experiments_xgboost.py
CHANGED
@@ -232,7 +232,7 @@ def main(
     cv_n_splits: int = 5,
     num_boost_round: int = 100,
     force_study: bool = False,
-    experiments: str | Literal['all', '…
+    experiments: str | Literal['all', 'standard', 'e3_ligase', 'similarity', 'target'] = 'all',
 ):
     """ Train a PROTAC model using the given datasets and hyperparameters.
 
@@ -244,34 +244,38 @@ def main(
     """
     pl.seed_everything(42)
 
-    # Set the Column to Predict
-    active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
-
-    # Get Dmax_threshold from the active_col
-    Dmax_threshold = float(active_col.split('Dmax')[1].split(',')[0].strip('(').strip(')').strip())
-    pDC50_threshold = float(active_col.split('pDC50')[1].strip('(').strip(')').strip())
-
-    # Load the PROTAC dataset
-    protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
-    # Map E3 Ligase Iap to IAP
-    protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
-    protac_df[active_col] = protac_df.apply(
-        …
-    )
-    …
-    if experiments == '…
-    …
+    # # Set the Column to Predict
+    # active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
+
+    # # Get Dmax_threshold from the active_col
+    # Dmax_threshold = float(active_col.split('Dmax')[1].split(',')[0].strip('(').strip(')').strip())
+    # pDC50_threshold = float(active_col.split('pDC50')[1].strip('(').strip(')').strip())
+
+    # # Load the PROTAC dataset
+    # protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
+    # # Map E3 Ligase Iap to IAP
+    # protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
+    # protac_df[active_col] = protac_df.apply(
+    #     lambda x: pdp.is_active(x['DC50 (nM)'], x['Dmax (%)'], pDC50_threshold=pDC50_threshold, Dmax_threshold=Dmax_threshold), axis=1
+    # )
+    # # Drop duplicates
+    # protac_df = protac_df.drop_duplicates(subset=['Smiles', 'Uniprot', 'E3 Ligase Uniprot', 'Cell Line Identifier', active_col])
+
+    # # Precompute fingerprint dictionary and the average Tanimoto similarity
+    # smiles2fp, protac_df = get_smiles2fp_and_avg_tanimoto(protac_df)
+
+    # ## Get the test sets
+    # test_indeces = {}
+    # active_df = protac_df[protac_df[active_col].notna()].copy()
 
+    # if experiments == 'standard' or experiments == 'all':
+    #     test_indeces['standard'] = get_random_split_indices(active_df, test_split)
+    # if experiments == 'target' or experiments == 'all':
+    #     test_indeces['target'] = get_target_split_indices(active_df, active_col, test_split)
+    # if experiments == 'similarity' or experiments == 'all':
+    #     test_indeces['similarity'] = get_tanimoto_split_indices(active_df, active_col, test_split, n_bins_tanimoto=100)
+    # if experiments == 'e3_ligase' or experiments == 'all':
+    #     test_indeces['e3_ligase'] = get_e3_ligase_split_indices(active_df)
 
     # Make directory ../reports if it does not exist
     if not os.path.exists('../reports'):
@@ -281,23 +285,41 @@ def main(
     protein2embedding = pdp.load_protein2embedding('../data/uniprot2embedding.h5')
     cell2embedding = pdp.load_cell2embedding('../data/cell2embedding.pkl')
 
+    studies_dir = '../data/studies'
+    train_val_perc = f'{int((1 - test_split) * 100)}'
+    test_perc = f'{int(test_split * 100)}'
+    active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')
+
+    if experiments == 'all':
+        experiments = ['standard', 'similarity', 'target']
+    else:
+        experiments = [experiments]
+
     # Cross-Validation Training
     reports = defaultdict(list)
-    for split_type…
-    …
+    for split_type in experiments:
+
+        train_val_filename = f'{split_type}_train_val_{train_val_perc}split_{active_name}.csv'
+        test_filename = f'{split_type}_test_{test_perc}split_{active_name}.csv'
+
+        train_val_df = pd.read_csv(os.path.join(studies_dir, train_val_filename))
+        test_df = pd.read_csv(os.path.join(studies_dir, test_filename))
+
+        # Get SMILES and precompute fingerprints dictionary
+        unique_smiles = pd.concat([train_val_df, test_df])['Smiles'].unique().tolist()
+        smiles2fp = {s: np.array(pdp.get_fingerprint(s)) for s in unique_smiles}
 
         # Get the CV object
-        if split_type == '…
+        if split_type == 'standard':
             kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
             group = None
         elif split_type == 'e3_ligase':
             kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
             group = train_val_df['E3 Group'].to_numpy()
-        elif split_type == '…
+        elif split_type == 'similarity':
             kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
             group = train_val_df['Tanimoto Group'].to_numpy()
-        elif split_type == '…
+        elif split_type == 'target':
             kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
             group = train_val_df['Uniprot Group'].to_numpy()
 
@@ -326,5 +348,6 @@ def main(
         report.to_csv(f'../reports/xgboost_{report_name}_{experiment_name}.csv', index=False)
         reports[report_name].append(report.copy())
 
+
 if __name__ == '__main__':
     cli = CLI(main)
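Note: like run_experiments.py, the XGBoost script now precomputes smiles2fp over the unique SMILES of the predefined splits via pdp.get_fingerprint. A sketch of the same pattern with RDKit Morgan fingerprints standing in; the actual fingerprint type and size used by pdp.get_fingerprint are assumptions here:

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

def get_fingerprint(smiles: str, n_bits: int = 1024):
    # Stand-in for pdp.get_fingerprint; Morgan radius and bit width are assumed
    mol = Chem.MolFromSmiles(smiles)
    return AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=n_bits)

train_val_df = pd.DataFrame({'Smiles': ['CCO', 'c1ccccc1']})
test_df = pd.DataFrame({'Smiles': ['CCO', 'CCN']})

# Same pattern as in the script: one fingerprint per unique SMILES across both splits
unique_smiles = pd.concat([train_val_df, test_df])['Smiles'].unique().tolist()
smiles2fp = {s: np.array(get_fingerprint(s)) for s in unique_smiles}
print({s: int(fp.sum()) for s, fp in smiles2fp.items()})  # bits set per molecule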