Commit 33f1644 by ribesstefano
Parent(s): 4bf0ec2

Added XGBoost Optuna training + Added Ablation studies with zeroed input vectors

Changed files:
- protac_degradation_predictor/__init__.py +3 -1
- protac_degradation_predictor/optuna_utils.py +12 -0
- protac_degradation_predictor/optuna_utils_xgboost.py +323 -0
- protac_degradation_predictor/protac_dataset.py +31 -4
- reports/ablation_zero_vectors_report_Active_Dmax_0.6_pDC50_6.0_test_split_0.1_random.csv +29 -0
- reports/ablation_zero_vectors_report_Active_Dmax_0.6_pDC50_6.0_test_split_0.1_tanimoto.csv +29 -0
- reports/ablation_zero_vectors_report_Active_Dmax_0.6_pDC50_6.0_test_split_0.1_uniprot.csv +29 -0
- src/run_xgboost_experiments.py +329 -0
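The ablation reports below come from runs where one or more input branches (POI, E3 ligase, cell line, or compound SMILES) are disabled, i.e. replaced by a zero vector, as described in the `disabled_embeddings` docstring further down. As an illustrative sketch only (not code from this commit), the zeroing step on a plain dictionary of feature blocks could look like this; the block names and sizes are hypothetical:

import numpy as np

def zero_out_embeddings(features: dict, disabled: list) -> dict:
    """Return a copy of `features` where every disabled block is a zero vector
    of the same shape (illustrative helper, not part of this commit)."""
    return {
        name: np.zeros_like(vec) if name in disabled else vec
        for name, vec in features.items()
    }

# Example: ablate the E3 and cell-line embeddings, keep SMILES and POI.
sample = {
    'smiles': np.random.rand(1024),  # hypothetical fingerprint length
    'poi': np.random.rand(1024),
    'e3': np.random.rand(1024),
    'cell': np.random.rand(768),
}
ablated = zero_out_embeddings(sample, disabled=['e3', 'cell'])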
protac_degradation_predictor/__init__.py
CHANGED
@@ -17,7 +17,9 @@ from .sklearn_models import (
 )
 from .optuna_utils import (
     hyperparameter_tuning_and_training,
-
+)
+from .optuna_utils_xgboost import (
+    xgboost_hyperparameter_tuning_and_training,
 )
 from .protac_degradation_predictor import (
     get_protac_active_proba,
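With the export above, the new tuning entry point is reachable from the package namespace. A minimal sketch (the `pdp` alias mirrors the one used in src/run_xgboost_experiments.py); both spellings should resolve to the same callable after this commit:

import protac_degradation_predictor as pdp
from protac_degradation_predictor import xgboost_hyperparameter_tuning_and_training

# Package-level attribute and direct import point to the same function.
assert pdp.xgboost_hyperparameter_tuning_and_training is xgboost_hyperparameter_tuning_and_training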
protac_degradation_predictor/optuna_utils.py
CHANGED
@@ -234,6 +234,18 @@ def pytorch_model_objective(

     # Optuna aims to minimize the pytorch_model_objective
     return - val_roc_auc
+    # # Get the majority vote for the test predictions
+    # if test_df is not None and not fast_dev_run:
+    #     majority_vote_metrics = get_majority_vote_metrics(test_preds, test_df, active_label)
+    #     majority_vote_metrics.update(get_dataframe_stats(train_df, val_df, test_df, active_label))
+    #     trial.set_user_attr('majority_vote_metrics', majority_vote_metrics)
+    #     logging.info(f'Majority vote metrics: {majority_vote_metrics}')
+
+    # # Get the average validation accuracy and ROC AUC accross the folds
+    # val_roc_auc = np.mean([r['val_roc_auc'] for r in report])
+
+    # # Optuna aims to minimize the pytorch_model_objective
+    # return - val_roc_auc


 def hyperparameter_tuning_and_training(
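The commented-out block above refers to `get_majority_vote_metrics`, which scores an ensemble of per-fold (or per-model) test predictions. As a rough illustration of that idea only, and not the package's actual implementation, averaging the predicted probabilities and thresholding the consensus could look like this:

import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score

def majority_vote_metrics_sketch(preds_per_model, y_true, threshold=0.5):
    """Average predicted probabilities across models, then score the
    thresholded consensus (illustrative only, not the library function)."""
    mean_proba = np.mean(np.stack(preds_per_model), axis=0)
    consensus = (mean_proba > threshold).astype(int)
    return {
        'test_acc': accuracy_score(y_true, consensus),
        'test_roc_auc': roc_auc_score(y_true, mean_proba),
    }

# Three hypothetical models' probabilities for four test samples:
preds = [np.array([0.9, 0.2, 0.6, 0.4]),
         np.array([0.8, 0.3, 0.4, 0.6]),
         np.array([0.7, 0.1, 0.7, 0.2])]
print(majority_vote_metrics_sketch(preds, y_true=np.array([1, 0, 1, 0])))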
protac_degradation_predictor/optuna_utils_xgboost.py
ADDED
@@ -0,0 +1,323 @@
from typing import Optional, Dict
import logging
import os

from .optuna_utils import get_majority_vote_metrics, get_dataframe_stats
from .protac_dataset import get_datasets

import optuna
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
import joblib
from optuna.samplers import TPESampler
import torch


xgb.set_config(verbosity=0)


def train_and_evaluate_xgboost(
        protein2embedding: Dict,
        cell2embedding: Dict,
        smiles2fp: Dict,
        train_df: pd.DataFrame,
        val_df: pd.DataFrame,
        params: dict,
        test_df: Optional[pd.DataFrame] = None,
        active_label: str = 'Active',
        num_boost_round: int = 100,
        shuffle_train_data: bool = False,
) -> tuple:
    """
    Train and evaluate an XGBoost model with the given parameters.

    Args:
        train_df (pd.DataFrame): The training and validation data.
        test_df (pd.DataFrame): The test data.
        params (dict): Hyperparameters for the XGBoost model.
        active_label (str): The active label column.
        num_boost_round (int): Maximum number of epochs.

    Returns:
        tuple: The trained model, test predictions, and metrics.
    """
    # Get datasets and their numpy arrays
    train_ds, val_ds, test_ds = get_datasets(
        protein2embedding=protein2embedding,
        cell2embedding=cell2embedding,
        smiles2fp=smiles2fp,
        train_df=train_df,
        val_df=val_df,
        test_df=test_df,
        disabled_embeddings=[],
        active_label=active_label,
        apply_scaling=False,
    )
    X_train, y_train = train_ds.get_numpy_arrays()
    X_val, y_val = val_ds.get_numpy_arrays()

    # Shuffle the training data
    if shuffle_train_data:
        idx = np.random.permutation(len(X_train))
        X_train, y_train = X_train[idx], y_train[idx]

    # Setup training and validation data in XGBoost data format
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    evallist = [(dval, 'eval'), (dtrain, 'train')]

    # Setup test data
    if test_df is not None:
        X_test, y_test = test_ds.get_numpy_arrays()
        dtest = xgb.DMatrix(X_test, label=y_test)
        evallist.append((dtest, 'test'))

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=evallist,
        early_stopping_rounds=10,
        verbose_eval=False,
    )

    # Evaluate model
    val_pred = model.predict(dval)
    val_pred_binary = (val_pred > 0.5).astype(int)
    metrics = {
        'val_accuracy': accuracy_score(y_val, val_pred_binary),
        'val_roc_auc': roc_auc_score(y_val, val_pred),
        'val_precision': precision_score(y_val, val_pred_binary),
        'val_recall': recall_score(y_val, val_pred_binary),
        'val_f1_score': f1_score(y_val, val_pred_binary),
    }
    preds = {'val_pred': val_pred}

    if test_df is not None:
        test_pred = model.predict(dtest)
        test_pred_binary = (test_pred > 0.5).astype(int)
        metrics.update({
            'test_accuracy': accuracy_score(y_test, test_pred_binary),
            'test_roc_auc': roc_auc_score(y_test, test_pred),
            'test_precision': precision_score(y_test, test_pred_binary),
            'test_recall': recall_score(y_test, test_pred_binary),
            'test_f1_score': f1_score(y_test, test_pred_binary),
        })
        preds.update({'test_pred': test_pred})

    return model, preds, metrics


def xgboost_model_objective(
        trial: optuna.Trial,
        protein2embedding: Dict,
        cell2embedding: Dict,
        smiles2fp: Dict,
        train_val_df: pd.DataFrame,
        kf: StratifiedKFold,
        groups: Optional[np.array] = None,
        active_label: str = 'Active',
        num_boost_round: int = 100,
) -> float:
    """ Objective function for hyperparameter optimization with XGBoost.

    Args:
        trial (optuna.Trial): The Optuna trial object.
        train_val_df (pd.DataFrame): The training and validation data.
        kf (StratifiedKFold): Stratified K-Folds cross-validator.
        test_df (Optional[pd.DataFrame]): The test data.
        active_label (str): The active label column.
        num_boost_round (int): Maximum number of epochs.
        use_logger (bool): Whether to use logging.
    """
    # Suggest hyperparameters to be used across the CV folds
    params = {
        'booster': 'gbtree',
        'tree_method': 'hist',  # if torch.cuda.is_available() else 'hist',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'eta': trial.suggest_float('eta', 1e-4, 1e-1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 10.0, log=True),
        'gamma': trial.suggest_float('gamma', 1e-4, 1e-1, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    X = train_val_df.copy().drop(columns=active_label)
    y = train_val_df[active_label].tolist()
    report = []
    val_preds = []

    for k, (train_index, val_index) in enumerate(kf.split(X, y, groups)):
        logging.info(f'Fold {k + 1}/{kf.get_n_splits()}')
        train_df = train_val_df.iloc[train_index]
        val_df = train_val_df.iloc[val_index]

        # Get some statistics from the dataframes
        stats = {
            'model_type': 'XGBoost',
            'fold': k,
            'train_len': len(train_df),
            'val_len': len(val_df),
            'train_perc': len(train_df) / len(train_val_df),
            'val_perc': len(val_df) / len(train_val_df),
        }
        stats.update(get_dataframe_stats(train_df, val_df, active_label=active_label))
        if groups is not None:
            stats['train_unique_groups'] = len(np.unique(groups[train_index]))
            stats['val_unique_groups'] = len(np.unique(groups[val_index]))

        _, preds, metrics = train_and_evaluate_xgboost(
            protein2embedding=protein2embedding,
            cell2embedding=cell2embedding,
            smiles2fp=smiles2fp,
            train_df=train_df,
            val_df=val_df,
            params=params,
            active_label=active_label,
            num_boost_round=num_boost_round,
        )
        stats.update(metrics)
        report.append(stats.copy())
        val_preds.append(preds['val_pred'])

    # Save the report in the trial
    trial.set_user_attr('report', report)
    trial.set_user_attr('val_preds', val_preds)
    trial.set_user_attr('params', params)

    # Get the average validation metrics across the folds
    mean_val_roc_auc = np.mean([r['val_roc_auc'] for r in report])
    logging.info(f'\tMean val ROC AUC: {mean_val_roc_auc:.4f}')

    # Optuna aims to minimize the objective, so return the negative ROC AUC
    return -mean_val_roc_auc


def xgboost_hyperparameter_tuning_and_training(
        protein2embedding: Dict,
        cell2embedding: Dict,
        smiles2fp: Dict,
        train_val_df: pd.DataFrame,
        test_df: pd.DataFrame,
        kf: StratifiedKFold,
        groups: Optional[np.array] = None,
        split_type: str = 'random',
        n_models_for_test: int = 3,
        n_trials: int = 50,
        active_label: str = 'Active',
        num_boost_round: int = 100,
        study_filename: Optional[str] = None,
        force_study: bool = False,
) -> dict:
    """ Hyperparameter tuning and training of an XGBoost model.

    Args:
        train_val_df (pd.DataFrame): The training and validation data.
        test_df (pd.DataFrame): The test data.
        kf (StratifiedKFold): Stratified K-Folds cross-validator.
        groups (Optional[np.array]): Group labels for the samples used while splitting the dataset into train/test set.
        split_type (str): Type of the data split.
        n_models_for_test (int): Number of models to train for testing.
        fast_dev_run (bool): Whether to run a fast development run.
        n_trials (int): Number of trials for hyperparameter optimization.
        logger_save_dir (str): Directory to save logs.
        logger_name (str): Name of the logger.
        active_label (str): The active label column.
        num_boost_round (int): Maximum number of epochs.
        study_filename (Optional[str]): File name to save/load the Optuna study.
        force_study (bool): Whether to force the study optimization even if the study file exists.

    Returns:
        dict: A dictionary containing reports from the CV and test.
    """
    # Set the verbosity of Optuna
    optuna.logging.set_verbosity(optuna.logging.WARNING)

    # Create an Optuna study object
    sampler = TPESampler(seed=42)
    study = optuna.create_study(direction='minimize', sampler=sampler)

    study_loaded = False
    if study_filename and not force_study:
        if os.path.exists(study_filename):
            study = joblib.load(study_filename)
            study_loaded = True
            logging.info(f'Loaded study from {study_filename}')

    if not study_loaded or force_study:
        study.optimize(
            lambda trial: xgboost_model_objective(
                trial=trial,
                protein2embedding=protein2embedding,
                cell2embedding=cell2embedding,
                smiles2fp=smiles2fp,
                train_val_df=train_val_df,
                kf=kf,
                groups=groups,
                active_label=active_label,
                num_boost_round=num_boost_round,
            ),
            n_trials=n_trials,
        )
        if study_filename:
            joblib.dump(study, study_filename)

    cv_report = pd.DataFrame(study.best_trial.user_attrs['report'])
    hparam_report = pd.DataFrame([study.best_params])

    # Retrain N models with the best hyperparameters (measure model uncertainty)
    best_models = []
    test_report = []
    test_preds = []
    for i in range(n_models_for_test):
        logging.info(f'Training best model {i + 1}/{n_models_for_test}')
        model, preds, metrics = train_and_evaluate_xgboost(
            protein2embedding=protein2embedding,
            cell2embedding=cell2embedding,
            smiles2fp=smiles2fp,
            train_df=train_val_df,
            val_df=test_df,
            params=study.best_trial.user_attrs['params'],
            active_label=active_label,
            num_boost_round=num_boost_round,
            shuffle_train_data=True,
        )
        metrics = {k.replace('val_', 'test_'): v for k, v in metrics.items()}
        metrics['model_type'] = 'XGBoost'
        metrics['test_model_id'] = i
        metrics.update(get_dataframe_stats(
            train_val_df,
            test_df=test_df,
            active_label=active_label,
        ))
        test_report.append(metrics.copy())
        test_preds.append(torch.tensor(preds['val_pred']))
        best_models.append(model)
    test_report = pd.DataFrame(test_report)

    # Get the majority vote for the test predictions
    majority_vote_metrics = get_majority_vote_metrics(test_preds, test_df, active_label)
    majority_vote_report = pd.DataFrame([majority_vote_metrics])
    majority_vote_report['model_type'] = 'XGBoost'

    # Add a column with the split_type to all reports
    for report in [cv_report, hparam_report, test_report, majority_vote_report]:
        report['split_type'] = split_type

    # Return the reports
    return {
        'cv_report': cv_report,
        'hparam_report': hparam_report,
        'test_report': test_report,
        'majority_vote_report': majority_vote_report,
    }
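A hedged usage sketch for the entry point defined above: the feature dictionaries and DataFrames are placeholders for whatever the calling script prepares (src/run_xgboost_experiments.py below does exactly that), and the study filename is hypothetical.

from sklearn.model_selection import StratifiedKFold
import protac_degradation_predictor as pdp

def run_xgboost_tuning(protein2embedding, cell2embedding, smiles2fp,
                       train_val_df, test_df, active_label='Active'):
    """Sketch: hand precomputed features and data frames to the new entry point."""
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    reports = pdp.xgboost_hyperparameter_tuning_and_training(
        protein2embedding=protein2embedding,
        cell2embedding=cell2embedding,
        smiles2fp=smiles2fp,
        train_val_df=train_val_df,
        test_df=test_df,
        kf=kf,
        split_type='random',
        n_models_for_test=3,
        n_trials=50,
        active_label=active_label,
        num_boost_round=100,
        study_filename='xgboost_study_random.pkl',  # hypothetical cache path
    )
    # Each entry is a pandas DataFrame, ready to be written out as a report.
    return reports['cv_report'], reports['test_report'], reports['majority_vote_report']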
protac_degradation_predictor/protac_dataset.py
CHANGED
@@ -42,7 +42,11 @@ class PROTAC_Dataset(Dataset):
             cell2embedding (dict): Dictionary of cell line embeddings
             smiles2fp (dict): Dictionary of SMILES to fingerprint
             use_smote (bool): Whether to use SMOTE for oversampling
-
+            oversampler (SMOTE | ADASYN): The oversampler to use
+            active_label (str): The column containing the active/inactive information
+            disabled_embeddings (list): The list of embeddings to disable, i.e., return a zero vector
+            scaler (StandardScaler | dict): The scaler to use for the embeddings
+            use_single_scaler (bool): Whether to use a single scaler for all features
         """
         # Filter out examples with NaN in active_label column
         self.data = protac_df # [~protac_df[active_label].isna()]
@@ -124,7 +128,7 @@ class PROTAC_Dataset(Dataset):
         self.data = df_smote
 
     def fit_scaling(self, use_single_scaler: bool = False, **scaler_kwargs) -> dict:
-        """ Fit the scalers for the data.
+        """ Fit the scalers for the data and save them in the dataset class.
 
         Args:
             use_single_scaler (bool): Whether to use a single scaler for all features.
@@ -288,8 +292,25 @@ def get_datasets(
     disabled_embeddings: List[Literal['smiles', 'poi', 'e3', 'cell']] = [],
     scaler: Optional[StandardScaler | Dict[str, StandardScaler]] = None,
     use_single_scaler: Optional[bool] = None,
+    apply_scaling: bool = False,
 ) -> Tuple[PROTAC_Dataset, PROTAC_Dataset, Optional[PROTAC_Dataset]]:
-    """ Get the datasets for training the PROTAC model.
+    """ Get the datasets for training the PROTAC model.
+
+    Args:
+        train_df (pd.DataFrame): The training data.
+        val_df (pd.DataFrame): The validation data.
+        test_df (pd.DataFrame): The test data.
+        protein2embedding (dict): Dictionary of protein embeddings.
+        cell2embedding (dict): Dictionary of cell line embeddings.
+        smiles2fp (dict): Dictionary of SMILES to fingerprint.
+        use_smote (bool): Whether to use SMOTE for oversampling.
+        smote_k_neighbors (int): The number of neighbors to use for SMOTE.
+        active_label (str): The active label column.
+        disabled_embeddings (list): The list of embeddings to disable.
+        scaler (StandardScaler | dict): The scaler to use for the embeddings.
+        use_single_scaler (bool): Whether to use a single scaler for all features.
+        apply_scaling (bool): Whether to apply scaling to the data now. Defaults to False (the Pytorch Lightning model does that).
+    """
     oversampler = SMOTE(k_neighbors=smote_k_neighbors, random_state=42)
     train_ds = PROTAC_Dataset(
         train_df,
@@ -313,6 +334,10 @@
         scaler=train_ds.scaler if train_ds.scaler is not None else scaler,
         use_single_scaler=train_ds.use_single_scaler if train_ds.use_single_scaler is not None else use_single_scaler,
     )
+    train_scalers = None
+    if apply_scaling:
+        train_scalers = train_ds.fit_scaling(use_single_scaler=use_single_scaler)
+        val_ds.apply_scaling(train_scalers, use_single_scaler=use_single_scaler)
     if test_df is not None:
         test_ds = PROTAC_Dataset(
             test_df,
@@ -321,9 +346,11 @@
             smiles2fp,
             active_label=active_label,
             disabled_embeddings=disabled_embeddings,
-            scaler=
+            scaler=train_scalers if apply_scaling else scaler,
             use_single_scaler=train_ds.use_single_scaler if train_ds.use_single_scaler is not None else use_single_scaler,
         )
+        if apply_scaling:
+            test_ds.apply_scaling(train_ds.scaler, use_single_scaler=use_single_scaler)
     else:
         test_ds = None
     return train_ds, val_ds, test_ds
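The new `apply_scaling` flag lets non-Lightning consumers such as the XGBoost path obtain scaled NumPy arrays straight from the datasets: scalers are fit on the training split and reused for the validation and test splits. A sketch of the intended call, again with the embedding dictionaries and DataFrames as placeholders:

from protac_degradation_predictor.protac_dataset import get_datasets

def make_scaled_arrays(protein2embedding, cell2embedding, smiles2fp,
                       train_df, val_df, test_df=None):
    """Sketch: fit scalers on the training split and reuse them for val/test."""
    train_ds, val_ds, test_ds = get_datasets(
        protein2embedding=protein2embedding,
        cell2embedding=cell2embedding,
        smiles2fp=smiles2fp,
        train_df=train_df,
        val_df=val_df,
        test_df=test_df,
        active_label='Active',
        disabled_embeddings=[],
        apply_scaling=True,  # new in this commit: scale now instead of in the Lightning model
    )
    X_train, y_train = train_ds.get_numpy_arrays()
    X_val, y_val = val_ds.get_numpy_arrays()
    return (X_train, y_train), (X_val, y_val), test_ds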
reports/ablation_zero_vectors_report_Active_Dmax_0.6_pDC50_6.0_test_split_0.1_random.csv
ADDED
@@ -0,0 +1,29 @@
test_loss,test_acc,test_f1_score,test_precision,test_recall,test_roc_auc,train_len,train_active_perc,train_inactive_perc,train_avg_tanimoto_dist,test_len,test_active_perc,test_inactive_perc,test_avg_tanimoto_dist,num_leaking_uniprot_train_test,num_leaking_smiles_train_test,perc_leaking_uniprot_train_test,perc_leaking_smiles_train_test,majority_vote,model_type,disabled_embeddings,test_f1,split_type
0.7269228100776672,0.604651153087616,0.6730769276618958,0.546875,0.875,0.7173913717269897,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,False,Pytorch,disabled e3,,random
0.6971672177314758,0.6162790656089783,0.5352112650871277,0.6129032373428345,0.4749999940395355,0.6717391014099121,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,False,Pytorch,disabled e3,,random
0.6542536020278931,0.6395348906517029,0.6436781883239746,0.5957446694374084,0.699999988079071,0.7141305208206177,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,False,Pytorch,disabled e3,,random
,0.6162790656089783,,0.6296296119689941,0.42500001192092896,0.689673900604248,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,True,Pytorch,disabled e3,0.5074626803398132,random
0.7447491884231567,0.5930232405662537,0.6534653306007385,0.5409836173057556,0.824999988079071,0.70923912525177,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,False,Pytorch,disabled poi,,random
0.7114118933677673,0.604651153087616,0.5405405163764954,0.5882353186607361,0.5,0.6630434989929199,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,False,Pytorch,disabled poi,,random
0.6734361052513123,0.6162790656089783,0.6373626589775085,0.5686274766921997,0.7250000238418579,0.6940217614173889,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,False,Pytorch,disabled poi,,random
,0.5930232405662537,,0.5806451439857483,0.44999998807907104,0.6809782981872559,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,True,Pytorch,disabled poi,0.5070422291755676,random
0.7288045883178711,0.6162790656089783,0.6796116232872009,0.5555555820465088,0.875,0.717663049697876,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,False,Pytorch,disabled cell,,random
0.6981603503227234,0.6395348906517029,0.5866666436195374,0.6285714507102966,0.550000011920929,0.6709238886833191,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,False,Pytorch,disabled cell,,random
0.6586534380912781,0.6395348906517029,0.6436781883239746,0.5957446694374084,0.699999988079071,0.7122282385826111,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,False,Pytorch,disabled cell,,random
,0.6279069781303406,,0.6333333253860474,0.4749999940395355,0.688858687877655,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,True,Pytorch,disabled cell,0.5428571701049805,random
0.7676423788070679,0.4651162922382355,0.6349206566810608,0.4651162922382355,1.0,0.7361413240432739,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,False,Pytorch,disabled smiles,,random
0.7521520256996155,0.5348837375640869,0.0,0.0,0.0,0.7638586759567261,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,False,Pytorch,disabled smiles,,random
0.7137073278427124,0.5930232405662537,0.2857142984867096,0.7777777910232544,0.17499999701976776,0.727989137172699,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,False,Pytorch,disabled smiles,,random
,0.5348837375640869,,0.0,0.0,0.7638587951660156,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,True,Pytorch,disabled smiles,0.0,random
0.7207046151161194,0.6162790656089783,0.6796116232872009,0.5555555820465088,0.875,0.7160326242446899,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,False,Pytorch,disabled e3 cell,,random
0.6998258829116821,0.6162790656089783,0.5352112650871277,0.6129032373428345,0.4749999940395355,0.6720108985900879,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,False,Pytorch,disabled e3 cell,,random
0.6533703207969666,0.6395348906517029,0.6436781883239746,0.5957446694374084,0.699999988079071,0.7122282385826111,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,False,Pytorch,disabled e3 cell,,random
,0.6162790656089783,,0.6296296119689941,0.42500001192092896,0.688858687877655,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,True,Pytorch,disabled e3 cell,0.5074626803398132,random
0.7362547516822815,0.5930232405662537,0.6534653306007385,0.5409836173057556,0.824999988079071,0.710326075553894,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,False,Pytorch,disabled poi e3,,random
0.7125736474990845,0.6162790656089783,0.5479452013969421,0.6060606241226196,0.5,0.6619565486907959,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,False,Pytorch,disabled poi e3,,random
0.6676729321479797,0.6395348906517029,0.6436781883239746,0.5957446694374084,0.699999988079071,0.6945651769638062,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,False,Pytorch,disabled poi e3,,random
,0.6162790656089783,,0.6206896305084229,0.44999998807907104,0.6836956143379211,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,True,Pytorch,disabled poi e3,0.52173912525177,random
0.7300900816917419,0.5930232405662537,0.6534653306007385,0.5409836173057556,0.824999988079071,0.706793487071991,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,False,Pytorch,disabled poi e3 cell,,random
0.7153109908103943,0.6162790656089783,0.5352112650871277,0.6129032373428345,0.4749999940395355,0.6611412763595581,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,False,Pytorch,disabled poi e3 cell,,random
0.6669936180114746,0.6279069781303406,0.6279069781303406,0.5869565010070801,0.675000011920929,0.6932065486907959,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,False,Pytorch,disabled poi e3 cell,,random
,0.6162790656089783,,0.6296296119689941,0.42500001192092896,0.6834239363670349,771,0.5149156939040207,0.48508430609597925,0.3768059369269877,86,0.46511627906976744,0.5348837209302325,0.38114673659326254,34,44,0.8326848249027238,0.10246433203631647,True,Pytorch,disabled poi e3 cell,0.5074626803398132,random
reports/ablation_zero_vectors_report_Active_Dmax_0.6_pDC50_6.0_test_split_0.1_tanimoto.csv
ADDED
@@ -0,0 +1,29 @@
test_loss,test_acc,test_f1_score,test_precision,test_recall,test_roc_auc,train_len,train_active_perc,train_inactive_perc,train_avg_tanimoto_dist,test_len,test_active_perc,test_inactive_perc,test_avg_tanimoto_dist,num_leaking_uniprot_train_test,num_leaking_smiles_train_test,perc_leaking_uniprot_train_test,perc_leaking_smiles_train_test,majority_vote,model_type,disabled_embeddings,test_f1,split_type
0.8296061754226685,0.43529412150382996,0.6065573692321777,0.43529412150382996,1.0,0.7832207083702087,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,False,Pytorch,disabled e3,,tanimoto
0.6474169492721558,0.6000000238418579,0.6600000262260437,0.523809552192688,0.8918918967247009,0.7668918371200562,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,False,Pytorch,disabled e3,,tanimoto
0.6295721530914307,0.7529411911964417,0.7042253613471985,0.7352941036224365,0.6756756901741028,0.8141891956329346,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,False,Pytorch,disabled e3,,tanimoto
,0.7529411911964417,,0.75,0.6486486196517944,0.8023648858070374,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,True,Pytorch,disabled e3,0.695652186870575,tanimoto
0.8408050537109375,0.43529412150382996,0.6065573692321777,0.43529412150382996,1.0,0.7691441774368286,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,False,Pytorch,disabled poi,,tanimoto
0.6602048277854919,0.5764706134796143,0.6470588445663452,0.5076923370361328,0.8918918967247009,0.7494369745254517,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,False,Pytorch,disabled poi,,tanimoto
0.634836733341217,0.7411764860153198,0.6944444179534912,0.7142857313156128,0.6756756901741028,0.7849099636077881,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,False,Pytorch,disabled poi,,tanimoto
,0.7411764860153198,,0.7272727489471436,0.6486486196517944,0.7770270109176636,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,True,Pytorch,disabled poi,0.6857143044471741,tanimoto
0.835131824016571,0.43529412150382996,0.6065573692321777,0.43529412150382996,1.0,0.7736486196517944,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,False,Pytorch,disabled cell,,tanimoto
0.6562066674232483,0.5882353186607361,0.6534653306007385,0.515625,0.8918918967247009,0.75,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,False,Pytorch,disabled cell,,tanimoto
0.6323299407958984,0.729411780834198,0.6760563254356384,0.7058823704719543,0.6486486196517944,0.8001126050949097,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,False,Pytorch,disabled cell,,tanimoto
,0.729411780834198,,0.71875,0.6216216087341309,0.7905405163764954,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,True,Pytorch,disabled cell,0.6666666865348816,tanimoto
0.8332716226577759,0.43529412150382996,0.6065573692321777,0.43529412150382996,1.0,0.798704981803894,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,False,Pytorch,disabled smiles,,tanimoto
0.765400767326355,0.43529412150382996,0.6065573692321777,0.43529412150382996,1.0,0.7919481992721558,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,False,Pytorch,disabled smiles,,tanimoto
0.6887043118476868,0.4941176474094391,0.632478654384613,0.4625000059604645,1.0,0.8110923171043396,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,False,Pytorch,disabled smiles,,tanimoto
,0.4941176474094391,,0.4625000059604645,1.0,0.8110923767089844,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,True,Pytorch,disabled smiles,0.632478654384613,tanimoto
0.825886070728302,0.43529412150382996,0.6065573692321777,0.43529412150382996,1.0,0.7787162065505981,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,False,Pytorch,disabled e3 cell,,tanimoto
0.6474983096122742,0.6000000238418579,0.6600000262260437,0.523809552192688,0.8918918967247009,0.7567567825317383,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,False,Pytorch,disabled e3 cell,,tanimoto
0.6309086680412292,0.7411764860153198,0.6857143044471741,0.7272727489471436,0.6486486196517944,0.8119369149208069,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,False,Pytorch,disabled e3 cell,,tanimoto
,0.7411764860153198,,0.7419354915618896,0.6216216087341309,0.8006756901741028,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,True,Pytorch,disabled e3 cell,0.6764705777168274,tanimoto
0.8314616680145264,0.43529412150382996,0.6065573692321777,0.43529412150382996,1.0,0.7697072625160217,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,False,Pytorch,disabled poi e3,,tanimoto
0.651317298412323,0.6117647290229797,0.6666666865348816,0.5322580933570862,0.8918918967247009,0.7488738894462585,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,False,Pytorch,disabled poi e3,,tanimoto
0.633421003818512,0.7529411911964417,0.7042253613471985,0.7352941036224365,0.6756756901741028,0.795045018196106,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,False,Pytorch,disabled poi e3,,tanimoto
,0.7529411911964417,,0.75,0.6486486196517944,0.7837837934494019,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,True,Pytorch,disabled poi e3,0.695652186870575,tanimoto
0.8277769088745117,0.43529412150382996,0.6065573692321777,0.43529412150382996,1.0,0.7629504203796387,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,False,Pytorch,disabled poi e3 cell,,tanimoto
0.6514514088630676,0.6000000238418579,0.6530612111091614,0.5245901346206665,0.8648648858070374,0.7438063025474548,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,False,Pytorch,disabled poi e3 cell,,tanimoto
0.6348393559455872,0.7411764860153198,0.6857143044471741,0.7272727489471436,0.6486486196517944,0.7837837934494019,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,False,Pytorch,disabled poi e3 cell,,tanimoto
,0.7411764860153198,,0.7419354915618896,0.6216216087341309,0.7742117047309875,772,0.5181347150259067,0.48186528497409326,0.37254018872057115,85,0.43529411764705883,0.5647058823529412,0.4199408355934975,22,0,0.5699481865284974,0.0,True,Pytorch,disabled poi e3 cell,0.6764705777168274,tanimoto
reports/ablation_zero_vectors_report_Active_Dmax_0.6_pDC50_6.0_test_split_0.1_uniprot.csv
ADDED
@@ -0,0 +1,29 @@
test_loss,test_acc,test_f1_score,test_precision,test_recall,test_roc_auc,train_len,train_active_perc,train_inactive_perc,train_avg_tanimoto_dist,test_len,test_active_perc,test_inactive_perc,test_avg_tanimoto_dist,num_leaking_uniprot_train_test,num_leaking_smiles_train_test,perc_leaking_uniprot_train_test,perc_leaking_smiles_train_test,majority_vote,model_type,disabled_embeddings,test_f1,split_type
0.7041562795639038,0.4588235318660736,0.0,0.0,0.0,0.5156075954437256,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,False,Pytorch,disabled e3,,uniprot
0.6916469931602478,0.4588235318660736,0.0,0.0,0.0,0.4420289397239685,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,False,Pytorch,disabled e3,,uniprot
0.6960257887840271,0.4588235318660736,0.0,0.0,0.0,0.4303233027458191,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,False,Pytorch,disabled e3,,uniprot
,0.4588235318660736,,0.0,0.0,0.5156075954437256,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,True,Pytorch,disabled e3,0.0,uniprot
0.7039564251899719,0.4588235318660736,0.0,0.0,0.0,0.532608687877655,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,False,Pytorch,disabled poi,,uniprot
0.6913965940475464,0.4588235318660736,0.0,0.0,0.0,0.46739131212234497,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,False,Pytorch,disabled poi,,uniprot
0.6957095265388489,0.4588235318660736,0.0,0.0,0.0,0.45234110951423645,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,False,Pytorch,disabled poi,,uniprot
,0.4588235318660736,,0.0,0.0,0.532608687877655,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,True,Pytorch,disabled poi,0.0,uniprot
0.7036164402961731,0.4588235318660736,0.0,0.0,0.0,0.530379056930542,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,False,Pytorch,disabled cell,,uniprot
0.6914005875587463,0.4588235318660736,0.0,0.0,0.0,0.48188406229019165,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,False,Pytorch,disabled cell,,uniprot
0.695412814617157,0.4588235318660736,0.0,0.0,0.0,0.4763098955154419,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,False,Pytorch,disabled cell,,uniprot
,0.4588235318660736,,0.0,0.0,0.530379056930542,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,True,Pytorch,disabled cell,0.0,uniprot
0.697465717792511,0.4588235318660736,0.0,0.0,0.0,0.6223523020744324,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,False,Pytorch,disabled smiles,,uniprot
0.6916133761405945,0.4588235318660736,0.0,0.0,0.0,0.6636008620262146,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,False,Pytorch,disabled smiles,,uniprot
0.6932395696640015,0.4588235318660736,0.0,0.0,0.0,0.651337742805481,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,False,Pytorch,disabled smiles,,uniprot
,0.4588235318660736,,0.0,0.0,0.6223522424697876,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,True,Pytorch,disabled smiles,0.0,uniprot
0.704821765422821,0.4588235318660736,0.0,0.0,0.0,0.518673300743103,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,False,Pytorch,disabled e3 cell,,uniprot
0.6916972398757935,0.4588235318660736,0.0,0.0,0.0,0.45234113931655884,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,False,Pytorch,disabled e3 cell,,uniprot
0.6962708830833435,0.4588235318660736,0.0,0.0,0.0,0.42892974615097046,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,False,Pytorch,disabled e3 cell,,uniprot
,0.4588235318660736,,0.0,0.0,0.5186733603477478,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,True,Pytorch,disabled e3 cell,0.0,uniprot
0.7051585912704468,0.4588235318660736,0.0,0.0,0.0,0.5103121399879456,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,False,Pytorch,disabled poi e3,,uniprot
0.6916910409927368,0.4588235318660736,0.0,0.0,0.0,0.44732439517974854,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,False,Pytorch,disabled poi e3,,uniprot
0.6965663433074951,0.4588235318660736,0.0,0.0,0.0,0.40328872203826904,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,False,Pytorch,disabled poi e3,,uniprot
,0.4588235318660736,,0.0,0.0,0.5103121399879456,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,True,Pytorch,disabled poi e3,0.0,uniprot
0.7058382034301758,0.4588235318660736,0.0,0.0,0.0,0.5080825090408325,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,False,Pytorch,disabled poi e3 cell,,uniprot
0.6917427778244019,0.4588235318660736,0.0,0.0,0.0,0.450111448764801,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,False,Pytorch,disabled poi e3 cell,,uniprot
0.6968205571174622,0.4588235318660736,0.0,0.0,0.0,0.4155518114566803,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,False,Pytorch,disabled poi e3 cell,,uniprot
,0.4588235318660736,,0.0,0.0,0.5080825090408325,772,0.5064766839378239,0.49352331606217614,0.3753049487934892,85,0.5411764705882353,0.4588235294117647,0.39483030881358294,0,6,0.0,0.011658031088082901,True,Pytorch,disabled poi e3 cell,0.0,uniprot
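The three ablation reports above share one schema, so they can be concatenated and summarized per disabled embedding and split type; a small pandas sketch using the file names as listed:

import glob
import pandas as pd

# Collect the three ablation reports added by this commit.
files = glob.glob('reports/ablation_zero_vectors_report_Active_Dmax_0.6_pDC50_6.0_test_split_0.1_*.csv')
df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)

# Mean test ROC AUC per ablated input and per split type,
# keeping single-model rows separate from the majority-vote rows.
summary = (df.groupby(['split_type', 'disabled_embeddings', 'majority_vote'])['test_roc_auc']
             .mean()
             .unstack('split_type'))
print(summary.round(3))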
src/run_xgboost_experiments.py
ADDED
@@ -0,0 +1,329 @@
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
from collections import defaultdict
|
4 |
+
import warnings
|
5 |
+
import logging
|
6 |
+
from typing import Literal
|
7 |
+
|
8 |
+
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
9 |
+
|
10 |
+
import protac_degradation_predictor as pdp
|
11 |
+
from protac_degradation_predictor.optuna_utils import get_dataframe_stats
|
12 |
+
|
13 |
+
import pytorch_lightning as pl
|
14 |
+
from rdkit import Chem
|
15 |
+
from rdkit.Chem import AllChem
|
16 |
+
from rdkit import DataStructs
|
17 |
+
from jsonargparse import CLI
|
18 |
+
import pandas as pd
|
19 |
+
from tqdm import tqdm
|
20 |
+
import numpy as np
|
21 |
+
from sklearn.preprocessing import OrdinalEncoder
|
22 |
+
from sklearn.model_selection import (
|
23 |
+
StratifiedKFold,
|
24 |
+
StratifiedGroupKFold,
|
25 |
+
)
|
26 |
+
|
27 |
+
# Ignore UserWarning from Matplotlib
|
28 |
+
warnings.filterwarnings("ignore", ".*FixedLocator*")
|
29 |
+
# Ignore UserWarning from PyTorch Lightning
|
30 |
+
warnings.filterwarnings("ignore", ".*does not have many workers.*")
|
31 |
+
|
32 |
+
|
33 |
+
root = logging.getLogger()
|
34 |
+
root.setLevel(logging.DEBUG)
|
35 |
+
|
36 |
+
handler = logging.StreamHandler(sys.stdout)
|
37 |
+
handler.setLevel(logging.DEBUG)
|
38 |
+
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
39 |
+
handler.setFormatter(formatter)
|
40 |
+
root.addHandler(handler)
|
41 |
+
|
42 |
+
|
43 |
+
def get_random_split_indices(active_df: pd.DataFrame, test_split: float) -> pd.Index:
|
44 |
+
""" Get the indices of the test set using a random split.
|
45 |
+
|
46 |
+
Args:
|
47 |
+
active_df (pd.DataFrame): The DataFrame containing the active PROTACs.
|
48 |
+
test_split (float): The percentage of the active PROTACs to use as the test set.
|
49 |
+
|
50 |
+
Returns:
|
51 |
+
pd.Index: The indices of the test set.
|
52 |
+
"""
|
53 |
+
test_df = active_df.sample(frac=test_split, random_state=42)
|
54 |
+
return test_df.index
|
55 |
+
|
56 |
+
|
57 |
+
def get_e3_ligase_split_indices(active_df: pd.DataFrame) -> pd.Index:
|
58 |
+
""" Get the indices of the test set using the E3 ligase split.
|
59 |
+
|
60 |
+
Args:
|
61 |
+
active_df (pd.DataFrame): The DataFrame containing the active PROTACs.
|
62 |
+
|
63 |
+
Returns:
|
64 |
+
pd.Index: The indices of the test set.
|
65 |
+
"""
|
66 |
+
encoder = OrdinalEncoder()
|
67 |
+
active_df['E3 Group'] = encoder.fit_transform(active_df[['E3 Ligase']]).astype(int)
|
68 |
+
test_df = active_df[(active_df['E3 Ligase'] != 'VHL') & (active_df['E3 Ligase'] != 'CRBN')]
|
69 |
+
return test_df.index
|
70 |
+
|
71 |
+
|
72 |
+
def get_smiles2fp_and_avg_tanimoto(protac_df: pd.DataFrame) -> tuple:
|
73 |
+
""" Get the SMILES to fingerprint dictionary and the average Tanimoto similarity.
|
74 |
+
|
75 |
+
Args:
|
76 |
+
protac_df (pd.DataFrame): The DataFrame containing the PROTACs.
|
77 |
+
|
78 |
+
Returns:
|
79 |
+
tuple: The SMILES to fingerprint dictionary and the average Tanimoto similarity.
|
80 |
+
"""
|
81 |
+
unique_smiles = protac_df['Smiles'].unique().tolist()
|
82 |
+
|
83 |
+
smiles2fp = {}
|
84 |
+
for smiles in tqdm(unique_smiles, desc='Precomputing fingerprints'):
|
85 |
+
smiles2fp[smiles] = pdp.get_fingerprint(smiles)
|
86 |
+
|
87 |
+
# # Get the pair-wise tanimoto similarity between the PROTAC fingerprints
|
88 |
+
# tanimoto_matrix = defaultdict(list)
|
89 |
+
# for i, smiles1 in enumerate(tqdm(protac_df['Smiles'].unique(), desc='Computing Tanimoto similarity')):
|
90 |
+
# fp1 = smiles2fp[smiles1]
|
91 |
+
# # TODO: Use BulkTanimotoSimilarity for better performance
|
92 |
+
# for j, smiles2 in enumerate(protac_df['Smiles'].unique()[i:]):
|
93 |
+
# fp2 = smiles2fp[smiles2]
|
94 |
+
# tanimoto_dist = 1 - DataStructs.TanimotoSimilarity(fp1, fp2)
|
95 |
+
# tanimoto_matrix[smiles1].append(tanimoto_dist)
|
96 |
+
# avg_tanimoto = {k: np.mean(v) for k, v in tanimoto_matrix.items()}
|
97 |
+
# protac_df['Avg Tanimoto'] = protac_df['Smiles'].map(avg_tanimoto)
|
98 |
+
|
99 |
+
|
100 |
+
tanimoto_matrix = defaultdict(list)
|
101 |
+
fps = list(smiles2fp.values())
|
102 |
+
|
103 |
+
# Compute all-against-all Tanimoto similarity using BulkTanimotoSimilarity
|
104 |
+
for i, (smiles1, fp1) in enumerate(tqdm(zip(unique_smiles, fps), desc='Computing Tanimoto similarity', total=len(fps))):
|
105 |
+
similarities = DataStructs.BulkTanimotoSimilarity(fp1, fps[i:]) # Only compute for i to end, avoiding duplicates
|
106 |
+
for j, similarity in enumerate(similarities):
|
107 |
+
distance = 1 - similarity
|
108 |
+
tanimoto_matrix[smiles1].append(distance) # Store as distance
|
109 |
+
if i != i + j:
|
110 |
+
tanimoto_matrix[unique_smiles[i + j]].append(distance) # Symmetric filling
|
111 |
+
|
112 |
+
# Calculate average Tanimoto distance for each unique SMILES
|
113 |
+
avg_tanimoto = {k: np.mean(v) for k, v in tanimoto_matrix.items()}
|
114 |
+
protac_df['Avg Tanimoto'] = protac_df['Smiles'].map(avg_tanimoto)
|
115 |
+
|
116 |
+
smiles2fp = {s: np.array(fp) for s, fp in smiles2fp.items()}
|
117 |
+
|
118 |
+
return smiles2fp, protac_df
|
119 |
+
|
120 |
+
|
121 |
+
def get_tanimoto_split_indices(
        active_df: pd.DataFrame,
        active_col: str,
        test_split: float,
        n_bins_tanimoto: int = 200,
) -> pd.Index:
    """ Get the indices of the test set using the Tanimoto-based split.

    Args:
        active_df (pd.DataFrame): The DataFrame containing the active PROTACs.
        active_col (str): The column containing the active/inactive information.
        test_split (float): The percentage of the active PROTACs to use as the test set.
        n_bins_tanimoto (int): The number of bins to use for the Tanimoto similarity.

    Returns:
        pd.Index: The indices of the test set.
    """
    tanimoto_groups = pd.cut(active_df['Avg Tanimoto'], bins=n_bins_tanimoto).copy()
    encoder = OrdinalEncoder()
    active_df['Tanimoto Group'] = encoder.fit_transform(tanimoto_groups.values.reshape(-1, 1)).astype(int)
    # Sort the groups so that samples with the highest average Tanimoto distance,
    # i.e., the least similar ones, are placed in the test set first
    tanimoto_groups = active_df.groupby('Tanimoto Group')['Avg Tanimoto'].mean().sort_values(ascending=False).index

    test_df = []
    # For each group, get the number of active and inactive entries. Then, add those
    # entries to test_df if: 1) the length of test_df plus the group entries stays below
    # test_split of the length of active_df, and 2) the percentage of True and False
    # entries in the active_col of test_df stays roughly balanced (around 50%).
    for group in tanimoto_groups:
        group_df = active_df[active_df['Tanimoto Group'] == group]
        if test_df == []:
            test_df.append(group_df)
            continue

        num_entries = len(group_df)
        num_active_group = group_df[active_col].sum()
        num_inactive_group = num_entries - num_active_group

        tmp_test_df = pd.concat(test_df)
        num_entries_test = len(tmp_test_df)
        num_active_test = tmp_test_df[active_col].sum()
        num_inactive_test = num_entries_test - num_active_test

        # Check if the group entries can be added to the test_df
        if num_entries_test + num_entries < test_split * len(active_df):
            # Add anything at the beginning
            if num_entries_test + num_entries < test_split / 2 * len(active_df):
                test_df.append(group_df)
                continue
            # Be more selective and make sure that the percentage of active and
            # inactive entries stays balanced
            if (num_active_group + num_active_test) / (num_entries_test + num_entries) < 0.6:
                if (num_inactive_group + num_inactive_test) / (num_entries_test + num_entries) < 0.6:
                    test_df.append(group_df)
    test_df = pd.concat(test_df)
    return test_df.index

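# For illustration, a minimal sketch of the binning step used above: pd.cut buckets the
# continuous 'Avg Tanimoto' values into intervals, and OrdinalEncoder turns each interval
# into an integer group label that can later be used as a CV group. The toy values and
# the bin count here are arbitrary; interval labels are cast to strings before encoding
# to stay version-agnostic.
def _tanimoto_binning_sketch() -> pd.DataFrame:
    toy_df = pd.DataFrame({'Avg Tanimoto': [0.10, 0.12, 0.35, 0.40, 0.80, 0.82]})
    bins = pd.cut(toy_df['Avg Tanimoto'], bins=3)
    encoder = OrdinalEncoder()
    # Encode the interval labels into integer group ids
    toy_df['Tanimoto Group'] = encoder.fit_transform(
        bins.astype(str).to_numpy().reshape(-1, 1)
    ).ravel().astype(int)
    return toy_df
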
def get_target_split_indices(active_df: pd.DataFrame, active_col: str, test_split: float) -> pd.Index:
    """ Get the indices of the test set using the target-based split.

    Args:
        active_df (pd.DataFrame): The DataFrame containing the active PROTACs.
        active_col (str): The column containing the active/inactive information.
        test_split (float): The percentage of the active PROTACs to use as the test set.

    Returns:
        pd.Index: The indices of the test set.
    """
    encoder = OrdinalEncoder()
    active_df['Uniprot Group'] = encoder.fit_transform(active_df[['Uniprot']]).astype(int)

    test_df = []
    # For each group, get the number of active and inactive entries. Then, add those
    # entries to test_df if: 1) the length of test_df plus the group entries stays below
    # test_split of the length of active_df, and 2) the percentage of True and False
    # entries in the active_col of test_df stays roughly balanced (around 50%).
    # Start the loop from the groups containing the smallest number of entries.
    for group in reversed(active_df['Uniprot'].value_counts().index):
        group_df = active_df[active_df['Uniprot'] == group]
        if test_df == []:
            test_df.append(group_df)
            continue

        num_entries = len(group_df)
        num_active_group = group_df[active_col].sum()
        num_inactive_group = num_entries - num_active_group

        tmp_test_df = pd.concat(test_df)
        num_entries_test = len(tmp_test_df)
        num_active_test = tmp_test_df[active_col].sum()
        num_inactive_test = num_entries_test - num_active_test

        # Check if the group entries can be added to the test_df
        if num_entries_test + num_entries < test_split * len(active_df):
            # Add anything at the beginning
            if num_entries_test + num_entries < test_split / 2 * len(active_df):
                test_df.append(group_df)
                continue
            # Be more selective and make sure that the percentage of active and
            # inactive entries stays balanced
            if (num_active_group + num_active_test) / (num_entries_test + num_entries) < 0.6:
                if (num_inactive_group + num_inactive_test) / (num_entries_test + num_entries) < 0.6:
                    test_df.append(group_df)
    test_df = pd.concat(test_df)
    return test_df.index

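# The main() below pairs StratifiedGroupKFold with the 'Tanimoto Group' / 'Uniprot Group'
# columns created above. A minimal sketch of what that buys us: every row sharing a group
# label stays in the same fold, so validation folds only contain unseen groups, while the
# class balance is approximately preserved. The toy labels and groups here are arbitrary.
def _grouped_cv_sketch():
    import numpy as np
    from sklearn.model_selection import StratifiedGroupKFold

    y = np.array([0, 1, 0, 1, 0, 1, 0, 1])
    groups = np.array(['P1', 'P1', 'P2', 'P2', 'P3', 'P3', 'P4', 'P4'])
    X = np.zeros((len(y), 1))  # Features do not affect the split itself
    kf = StratifiedGroupKFold(n_splits=2, shuffle=True, random_state=42)
    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y, groups)):
        print(f'Fold {fold}: validation groups = {sorted(set(groups[val_idx]))}')
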
def main(
    active_col: str = 'Active (Dmax 0.6, pDC50 6.0)',
    n_trials: int = 100,
    test_split: float = 0.1,
    cv_n_splits: int = 5,
    num_boost_round: int = 100,
    force_study: bool = False,
    experiments: str | Literal['all', 'random', 'e3_ligase', 'tanimoto', 'uniprot'] = 'all',
):
    """ Train XGBoost-based PROTAC models using the given datasets and hyperparameters.

    Args:
        active_col (str): The column containing the active/inactive information.
        n_trials (int): The number of hyperparameter optimization trials.
        test_split (float): The percentage of the data to use as the test set.
        cv_n_splits (int): The number of cross-validation splits.
        num_boost_round (int): The number of XGBoost boosting rounds.
        force_study (bool): Whether to force re-running the Optuna study.
        experiments (str): Which experiment(s), i.e., test split type(s), to run.
    """
    pl.seed_everything(42)

    # Set the column to predict
    active_name = active_col.replace(' ', '_').replace('(', '').replace(')', '').replace(',', '')

    # Get the Dmax and pDC50 thresholds from the active_col name
    Dmax_threshold = float(active_col.split('Dmax')[1].split(',')[0].strip('(').strip(')').strip())
    pDC50_threshold = float(active_col.split('pDC50')[1].strip('(').strip(')').strip())

    # Load the PROTAC dataset
    protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
    # Map the E3 ligase Iap to IAP
    protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
    protac_df[active_col] = protac_df.apply(
        lambda x: pdp.is_active(x['DC50 (nM)'], x['Dmax (%)'], pDC50_threshold=pDC50_threshold, Dmax_threshold=Dmax_threshold), axis=1
    )
    smiles2fp, protac_df = get_smiles2fp_and_avg_tanimoto(protac_df)

    ## Get the test sets
    test_indeces = {}
    active_df = protac_df[protac_df[active_col].notna()].copy()

    if experiments == 'random' or experiments == 'all':
        test_indeces['random'] = get_random_split_indices(active_df, test_split)
    if experiments == 'uniprot' or experiments == 'all':
        test_indeces['uniprot'] = get_target_split_indices(active_df, active_col, test_split)
    if experiments == 'e3_ligase' or experiments == 'all':
        test_indeces['e3_ligase'] = get_e3_ligase_split_indices(active_df)
    if experiments == 'tanimoto' or experiments == 'all':
        test_indeces['tanimoto'] = get_tanimoto_split_indices(active_df, active_col, test_split)

    # Make the directory ../reports if it does not exist
    if not os.path.exists('../reports'):
        os.makedirs('../reports')

    # Load the embedding dictionaries
    protein2embedding = pdp.load_protein2embedding('../data/uniprot2embedding.h5')
    cell2embedding = pdp.load_cell2embedding('../data/cell2embedding.pkl')

    # Cross-validation training
    reports = defaultdict(list)
    for split_type, indeces in test_indeces.items():
        test_df = active_df.loc[indeces].copy()
        train_val_df = active_df[~active_df.index.isin(test_df.index)].copy()

        # Get the CV object
        if split_type == 'random':
            kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
            group = None
        elif split_type == 'e3_ligase':
            kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
            group = train_val_df['E3 Group'].to_numpy()
        elif split_type == 'tanimoto':
            kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
            group = train_val_df['Tanimoto Group'].to_numpy()
        elif split_type == 'uniprot':
            kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
            group = train_val_df['Uniprot Group'].to_numpy()

        # Start the experiment
        experiment_name = f'{active_name}_test_split_{test_split}_{split_type}'
        optuna_reports = pdp.xgboost_hyperparameter_tuning_and_training(
            protein2embedding=protein2embedding,
            cell2embedding=cell2embedding,
            smiles2fp=smiles2fp,
            train_val_df=train_val_df,
            test_df=test_df,
            kf=kf,
            groups=group,
            split_type=split_type,
            n_models_for_test=3,
            n_trials=n_trials,
            active_label=active_col,
            num_boost_round=num_boost_round,
            study_filename=f'../reports/study_xgboost_{experiment_name}.pkl',
            force_study=force_study,
        )

        # Save the reports to file
        for report_name, report in optuna_reports.items():
            report.to_csv(f'../reports/xgboost_{report_name}_{experiment_name}.csv', index=False)
            reports[report_name].append(report.copy())


if __name__ == '__main__':
    cli = CLI(main)
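# Example invocations, assuming the CLI wrapper exposes main()'s keyword arguments as
# command-line flags (jsonargparse-style); the exact flag syntax depends on the CLI
# backend actually imported in this script:
#
#   python run_xgboost_experiments.py --n_trials 50 --experiments tanimoto
#   python run_xgboost_experiments.py --experiments all --num_boost_round 200 --force_study true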