ribesstefano committed on
Commit
fda7af7
1 Parent(s): 62ccb16

Added FP search + Added train metrics to logs

notebooks/best_fingerprint_search.ipynb ADDED
@@ -0,0 +1,275 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import sys\n",
+ "from collections import defaultdict\n",
+ "import warnings\n",
+ "import logging\n",
+ "from typing import Literal\n",
+ "\n",
+ "sys.path.append('~/PROTAC-Degradation-Predictor/protac_degradation_predictor')\n",
+ "import protac_degradation_predictor as pdp\n",
+ "\n",
+ "import pytorch_lightning as pl\n",
+ "from rdkit import Chem\n",
+ "from rdkit.Chem import AllChem\n",
+ "from rdkit import DataStructs\n",
+ "from jsonargparse import CLI\n",
+ "import pandas as pd\n",
+ "# Import tqdm for notebook\n",
+ "from tqdm.notebook import tqdm\n",
+ "import numpy as np\n",
+ "from sklearn.preprocessing import OrdinalEncoder\n",
+ "from sklearn.model_selection import (\n",
+ " StratifiedKFold,\n",
+ " StratifiedGroupKFold,\n",
+ ")\n",
+ "\n",
+ "\n",
+ "active_col = 'Active (Dmax 0.6, pDC50 6.0)'\n",
+ "pDC50_threshold = 6.0\n",
+ "Dmax_threshold = 0.6\n",
+ "\n",
+ "protac_df = pd.read_csv('~/PROTAC-Degradation-Predictor/data/PROTAC-Degradation-DB.csv')\n",
+ "protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')\n",
+ "protac_df[active_col] = protac_df.apply(\n",
+ " lambda x: pdp.is_active(x['DC50 (nM)'], x['Dmax (%)'], pDC50_threshold=pDC50_threshold, Dmax_threshold=Dmax_threshold), axis=1\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "771"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "def get_random_split_indices(active_df: pd.DataFrame, test_split: float) -> pd.Index:\n",
+ " \"\"\" Get the indices of the test set using a random split.\n",
+ " \n",
+ " Args:\n",
+ " active_df (pd.DataFrame): The DataFrame containing the active PROTACs.\n",
+ " test_split (float): The percentage of the active PROTACs to use as the test set.\n",
+ " \n",
+ " Returns:\n",
+ " pd.Index: The indices of the test set.\n",
+ " \"\"\"\n",
+ " test_df = active_df.sample(frac=test_split, random_state=42)\n",
+ " return test_df.index\n",
+ "\n",
+ "active_df = protac_df[protac_df[active_col].notna()].copy()\n",
+ "test_split = 0.1\n",
+ "test_indices = get_random_split_indices(active_df, test_split)\n",
+ "train_val_df = active_df[~active_df.index.isin(test_indices)].copy()\n",
+ "len(train_val_df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import optuna\n",
+ "\n",
+ "def objective(trial: optuna.Trial, verbose: int = 0) -> float:\n",
+ " \n",
+ " radius = trial.suggest_int('radius', 1, 15)\n",
+ " fpsize = trial.suggest_int('fpsize', 128, 2048, step=128)\n",
+ "\n",
+ " morgan_fpgen = AllChem.GetMorganGenerator(\n",
+ " radius=radius,\n",
+ " fpSize=fpsize,\n",
+ " includeChirality=True,\n",
+ " )\n",
+ "\n",
+ " smiles2fp = {}\n",
+ " for smiles in train_val_df['Smiles'].unique().tolist():\n",
+ " smiles2fp[smiles] = pdp.get_fingerprint(smiles, morgan_fpgen)\n",
+ "\n",
+ " # Count the number of unique SMILES and the number of unique Morgan fingerprints\n",
+ " unique_fps = set([tuple(fp) for fp in smiles2fp.values()])\n",
+ " # Get the list of SMILES with overlapping fingerprints\n",
+ " overlapping_smiles = []\n",
+ " unique_fps = set()\n",
+ " for smiles, fp in smiles2fp.items():\n",
+ " if tuple(fp) in unique_fps:\n",
+ " overlapping_smiles.append(smiles)\n",
+ " else:\n",
+ " unique_fps.add(tuple(fp))\n",
+ " num_overlaps = len(train_val_df[train_val_df[\"Smiles\"].isin(overlapping_smiles)])\n",
+ " num_overlaps_tot = len(protac_df[protac_df[\"Smiles\"].isin(overlapping_smiles)])\n",
+ "\n",
+ " if verbose:\n",
+ " print(f'Radius: {radius}')\n",
+ " print(f'FP length: {fpsize}')\n",
+ " print(f'Number of unique SMILES: {len(smiles2fp)}')\n",
+ " print(f'Number of unique fingerprints: {len(unique_fps)}')\n",
+ " print(f'Number of SMILES with overlapping fingerprints: {len(overlapping_smiles)}')\n",
+ " print(f'Number of overlapping SMILES in train_val_df: {num_overlaps}')\n",
+ " print(f'Number of overlapping SMILES in protac_df: {num_overlaps_tot}')\n",
+ " return num_overlaps + radius + fpsize / 100"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[I 2024-04-29 11:28:05,626] A new study created in memory with name: no-name-4db5d822-6220-4ab8-bc3a-c776b0e5cac2\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "678150f59ec548bb89562e2230993989",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/50 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[I 2024-04-29 11:28:07,705] Trial 0 finished with value: 39.480000000000004 and parameters: {'radius': 6, 'fpsize': 2048}. Best is trial 0 with value: 39.480000000000004.\n",
+ "[I 2024-04-29 11:28:09,590] Trial 1 finished with value: 23.8 and parameters: {'radius': 11, 'fpsize': 1280}. Best is trial 1 with value: 23.8.\n",
+ "[I 2024-04-29 11:28:10,474] Trial 2 finished with value: 131.84 and parameters: {'radius': 3, 'fpsize': 384}. Best is trial 1 with value: 23.8.\n",
+ "[I 2024-04-29 11:28:11,978] Trial 3 finished with value: 281.92 and parameters: {'radius': 1, 'fpsize': 1792}. Best is trial 1 with value: 23.8.\n",
+ "[I 2024-04-29 11:28:13,994] Trial 4 finished with value: 25.36 and parameters: {'radius': 10, 'fpsize': 1536}. Best is trial 1 with value: 23.8.\n",
+ "[I 2024-04-29 11:28:15,642] Trial 5 finished with value: 284.48 and parameters: {'radius': 1, 'fpsize': 2048}. Best is trial 1 with value: 23.8.\n",
+ "[I 2024-04-29 11:28:17,154] Trial 6 finished with value: 18.12 and parameters: {'radius': 13, 'fpsize': 512}. Best is trial 6 with value: 18.12.\n",
+ "[I 2024-04-29 11:28:18,057] Trial 7 finished with value: 131.84 and parameters: {'radius': 3, 'fpsize': 384}. Best is trial 6 with value: 18.12.\n",
+ "[I 2024-04-29 11:28:19,570] Trial 8 finished with value: 41.519999999999996 and parameters: {'radius': 5, 'fpsize': 1152}. Best is trial 6 with value: 18.12.\n",
+ "[I 2024-04-29 11:28:20,860] Trial 9 finished with value: 23.4 and parameters: {'radius': 7, 'fpsize': 640}. Best is trial 6 with value: 18.12.\n",
+ "[I 2024-04-29 11:28:22,631] Trial 10 finished with value: 22.68 and parameters: {'radius': 15, 'fpsize': 768}. Best is trial 6 with value: 18.12.\n",
+ "[I 2024-04-29 11:28:24,427] Trial 11 finished with value: 22.68 and parameters: {'radius': 15, 'fpsize': 768}. Best is trial 6 with value: 18.12.\n",
+ "[I 2024-04-29 11:28:25,756] Trial 12 finished with value: 92.28 and parameters: {'radius': 15, 'fpsize': 128}. Best is trial 6 with value: 18.12.\n",
+ "[I 2024-04-29 11:28:27,466] Trial 13 finished with value: 20.96 and parameters: {'radius': 12, 'fpsize': 896}. Best is trial 6 with value: 18.12.\n",
+ "[I 2024-04-29 11:28:29,156] Trial 14 finished with value: 20.96 and parameters: {'radius': 12, 'fpsize': 896}. Best is trial 6 with value: 18.12.\n",
+ "[I 2024-04-29 11:28:30,727] Trial 15 finished with value: 18.12 and parameters: {'radius': 13, 'fpsize': 512}. Best is trial 6 with value: 18.12.\n",
+ "[I 2024-04-29 11:28:31,842] Trial 16 finished with value: 22.28 and parameters: {'radius': 9, 'fpsize': 128}. Best is trial 6 with value: 18.12.\n",
+ "[I 2024-04-29 11:28:33,365] Trial 17 finished with value: 18.12 and parameters: {'radius': 13, 'fpsize': 512}. Best is trial 6 with value: 18.12.\n",
+ "[I 2024-04-29 11:28:34,801] Trial 18 finished with value: 16.84 and parameters: {'radius': 13, 'fpsize': 384}. Best is trial 18 with value: 16.84.\n",
+ "[I 2024-04-29 11:28:35,986] Trial 19 finished with value: 13.56 and parameters: {'radius': 9, 'fpsize': 256}. Best is trial 19 with value: 13.56.\n",
+ "[I 2024-04-29 11:28:37,122] Trial 20 finished with value: 14.56 and parameters: {'radius': 8, 'fpsize': 256}. Best is trial 19 with value: 13.56.\n",
+ "[I 2024-04-29 11:28:38,175] Trial 21 finished with value: 30.28 and parameters: {'radius': 8, 'fpsize': 128}. Best is trial 19 with value: 13.56.\n",
+ "[I 2024-04-29 11:28:39,406] Trial 22 finished with value: 13.56 and parameters: {'radius': 9, 'fpsize': 256}. Best is trial 19 with value: 13.56.\n",
+ "[I 2024-04-29 11:28:40,649] Trial 23 finished with value: 13.56 and parameters: {'radius': 9, 'fpsize': 256}. Best is trial 19 with value: 13.56.\n",
+ "[I 2024-04-29 11:28:41,868] Trial 24 finished with value: 12.56 and parameters: {'radius': 10, 'fpsize': 256}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:28:43,109] Trial 25 finished with value: 12.56 and parameters: {'radius': 10, 'fpsize': 256}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:28:44,587] Trial 26 finished with value: 16.4 and parameters: {'radius': 10, 'fpsize': 640}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:28:46,599] Trial 27 finished with value: 25.08 and parameters: {'radius': 11, 'fpsize': 1408}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:28:48,015] Trial 28 finished with value: 31.96 and parameters: {'radius': 6, 'fpsize': 896}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:28:49,347] Trial 29 finished with value: 23.4 and parameters: {'radius': 7, 'fpsize': 640}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:28:51,503] Trial 30 finished with value: 27.64 and parameters: {'radius': 11, 'fpsize': 1664}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:28:52,657] Trial 31 finished with value: 13.56 and parameters: {'radius': 9, 'fpsize': 256}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:28:53,840] Trial 32 finished with value: 12.56 and parameters: {'radius': 10, 'fpsize': 256}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:28:55,159] Trial 33 finished with value: 13.84 and parameters: {'radius': 10, 'fpsize': 384}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:28:56,140] Trial 34 finished with value: 39.28 and parameters: {'radius': 7, 'fpsize': 128}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:28:57,508] Trial 35 finished with value: 14.84 and parameters: {'radius': 11, 'fpsize': 384}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:28:58,900] Trial 36 finished with value: 15.120000000000001 and parameters: {'radius': 10, 'fpsize': 512}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:29:00,203] Trial 37 finished with value: 14.56 and parameters: {'radius': 12, 'fpsize': 256}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:29:02,225] Trial 38 finished with value: 49.2 and parameters: {'radius': 5, 'fpsize': 1920}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:29:03,942] Trial 39 finished with value: 22.52 and parameters: {'radius': 8, 'fpsize': 1152}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:29:05,240] Trial 40 finished with value: 13.84 and parameters: {'radius': 10, 'fpsize': 384}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:29:06,396] Trial 41 finished with value: 13.56 and parameters: {'radius': 9, 'fpsize': 256}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:29:07,422] Trial 42 finished with value: 30.28 and parameters: {'radius': 8, 'fpsize': 128}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:29:08,590] Trial 43 finished with value: 13.56 and parameters: {'radius': 9, 'fpsize': 256}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:29:09,949] Trial 44 finished with value: 14.84 and parameters: {'radius': 11, 'fpsize': 384}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:29:11,378] Trial 45 finished with value: 15.120000000000001 and parameters: {'radius': 10, 'fpsize': 512}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:29:12,637] Trial 46 finished with value: 26.4 and parameters: {'radius': 6, 'fpsize': 640}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:29:14,232] Trial 47 finished with value: 18.68 and parameters: {'radius': 11, 'fpsize': 768}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:29:14,904] Trial 48 finished with value: 214.28 and parameters: {'radius': 2, 'fpsize': 128}. Best is trial 24 with value: 12.56.\n",
+ "[I 2024-04-29 11:29:16,323] Trial 49 finished with value: 16.56 and parameters: {'radius': 14, 'fpsize': 256}. Best is trial 24 with value: 12.56.\n"
+ ]
+ }
+ ],
+ "source": [
+ "sampler = optuna.samplers.TPESampler(seed=42)\n",
+ "study = optuna.create_study(sampler=sampler, direction='minimize')\n",
+ "study.optimize(objective, n_trials=50, show_progress_bar=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Radius: 10\n",
+ "FP length: 256\n",
+ "Number of unique SMILES: 532\n",
+ "Number of unique fingerprints: 532\n",
+ "Number of SMILES with overlapping fingerprints: 0\n",
+ "Number of overlapping SMILES in train_val_df: 0\n",
+ "Number of overlapping SMILES in protac_df: 0\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "12.56"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Run objective with best params and verbose\n",
+ "objective(study.best_trial, verbose=1)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
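
The objective in the notebook above scores a fingerprint configuration by the number of colliding SMILES (distinct molecules that map to the same bit vector), plus small tie-breaking penalties on the radius and on the fingerprint size; the best trial settles on radius 10 and 256 bits with zero collisions. Below is a minimal, self-contained sketch of that collision check using plain RDKit calls in place of the repository's pdp.get_fingerprint helper; smiles_list is a hypothetical stand-in for train_val_df['Smiles'].

# Minimal sketch of the notebook's collision check (not the repository's helper).
# Assumes: smiles_list is an iterable of SMILES strings.
from rdkit import Chem
from rdkit.Chem import AllChem

def count_fingerprint_collisions(smiles_list, radius=10, fp_size=256):
    # Morgan generator configured like the best Optuna trial above
    fpgen = AllChem.GetMorganGenerator(radius=radius, fpSize=fp_size, includeChirality=True)
    seen, colliding = set(), []
    for smi in set(smiles_list):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            continue  # skip unparsable SMILES
        bits = tuple(fpgen.GetFingerprint(mol))
        if bits in seen:
            colliding.append(smi)  # same bit vector as a previously seen molecule
        else:
            seen.add(bits)
    return len(colliding)

With the tuned settings, the notebook reports 532 unique SMILES mapping to 532 unique fingerprints, i.e. a collision count of 0 on the training/validation set.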
protac_degradation_predictor/config.py CHANGED
@@ -3,7 +3,7 @@ from dataclasses import dataclass, field
 @dataclass(frozen=True)
 class Config:
 # Embeddings information
- morgan_radius: int = 15
+ morgan_radius: int = 10 # 15
 fingerprint_size: int = 256 # 224
 protein_embedding_size: int = 1024
 cell_embedding_size: int = 768
protac_degradation_predictor/optuna_utils.py CHANGED
@@ -173,13 +173,11 @@ def pytorch_model_objective(
 disabled_embeddings=disabled_embeddings,
 )
 if test_df is not None:
- _, trainer, metrics, val_pred, test_pred = ret
+ _, _, metrics, val_pred, test_pred = ret
 test_preds.append(test_pred)
 else:
- _, trainer, metrics, val_pred = ret
- train_metrics = {m: v.item() for m, v in trainer.callback_metrics.items() if 'train' in m}
+ _, _, metrics, val_pred = ret
 stats.update(metrics)
- stats.update(train_metrics)
 report.append(stats.copy())
 val_preds.append(val_pred)
 
@@ -252,7 +250,7 @@ def hyperparameter_tuning_and_training(
 batch_size_options = [4, 8, 16, 32, 64, 128]
 learning_rate_options = (1e-5, 1e-3) # min and max values for loguniform distribution
 smote_k_neighbors_options = list(range(3, 16))
- dropout_options = (0.1, 0.9)
+ dropout_options = (0.2, 0.9)
 
 # Set the verbosity of Optuna
 optuna.logging.set_verbosity(optuna.logging.WARNING)
@@ -325,13 +323,6 @@ def hyperparameter_tuning_and_training(
 metrics['test_model_id'] = i
 metrics.update(dfs_stats)
 
- # Add the training metrics
- train_metrics = {m: v.item() for m, v in trainer.callback_metrics.items() if 'train' in m}
- logging.info(f'Training metrics: {train_metrics}')
- logging.info(f'Training trainer.logged_metrics: {trainer.logged_metrics}')
- logging.info(f'Training trainer.callback_metrics: {trainer.callback_metrics}')
-
- metrics.update(train_metrics)
 test_report.append(metrics.copy())
 test_preds.append(test_pred)
 test_report = pd.DataFrame(test_report)
protac_degradation_predictor/pytorch_models.py CHANGED
@@ -207,8 +207,8 @@ class PROTAC_Model(pl.LightningModule):
 'precision': Precision(task='binary'),
 'recall': Recall(task='binary'),
 'f1_score': F1Score(task='binary'),
- 'opt_score': Accuracy(task='binary') + F1Score(task='binary'),
- 'hp_metric': Accuracy(task='binary'),
+ # 'opt_score': Accuracy(task='binary') + F1Score(task='binary'),
+ # 'hp_metric': Accuracy(task='binary'),
 }, prefix=s.replace('metrics', '')) for s in stages})
 
 # Misc settings
@@ -314,8 +314,8 @@ class PROTAC_Model(pl.LightningModule):
 'lr_scheduler': optim.lr_scheduler.ReduceLROnPlateau(
 optimizer=optimizer,
 mode='min',
- factor=0.5,
- patience=2,
+ factor=0.1,
+ patience=0,
 ),
 'interval': 'step', # or 'epoch'
 'frequency': 1,
@@ -508,6 +508,7 @@ def train_model(
 logger=loggers if use_logger else False,
 callbacks=callbacks,
 max_epochs=max_epochs,
+ val_check_interval=0.5,
 fast_dev_run=fast_dev_run,
 enable_model_summary=False,
 enable_checkpointing=enable_checkpointing,
@@ -534,11 +535,22 @@ def train_model(
 with warnings.catch_warnings():
 warnings.simplefilter("ignore")
 trainer.fit(model)
- metrics = trainer.validate(model, verbose=False)[0]
+ metrics = {}
+ # Add train metrics
+ train_metrics = {m: v.item() for m, v in trainer.callback_metrics.items() if 'train' in m}
+ metrics.update(train_metrics)
+ # Add validation metrics
+ val_metrics = trainer.validate(model, verbose=False)[0]
+ val_metrics = {m: v for m, v in val_metrics.items() if 'val' in m}
+ metrics.update(val_metrics)
+
 # Add test metrics to metrics
 if test_df is not None:
 test_metrics = trainer.test(model, verbose=False)[0]
+ test_metrics = {m: v for m, v in test_metrics.items() if 'test' in m}
 metrics.update(test_metrics)
+
+ # Return predictions
 if return_predictions:
 val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
  val_pred = trainer.predict(model, val_dl)
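
For reference, the train_model change above replaces the single validate() call with a merged dictionary of per-stage metrics: training metrics are read back from trainer.callback_metrics after fit(), while validation and test metrics come from validate()/test() and are filtered by their prefixes. A condensed sketch of that pattern (a hypothetical helper, not the repository's actual function) is shown below.

# Condensed sketch of the metric-collection pattern added above.
# Assumes a LightningModule that logs metrics prefixed with 'train_', 'val_' and 'test_'.
import pytorch_lightning as pl

def collect_stage_metrics(trainer: pl.Trainer, model: pl.LightningModule, run_test: bool = False) -> dict:
    metrics = {}
    # Training metrics are cached by fit() in trainer.callback_metrics as tensors
    metrics.update({m: v.item() for m, v in trainer.callback_metrics.items() if 'train' in m})
    # Validation metrics come from an explicit validate() pass, keeping only 'val_*' keys
    metrics.update({m: v for m, v in trainer.validate(model, verbose=False)[0].items() if 'val' in m})
    if run_test:
        # Optional test pass, keeping only 'test_*' keys
        metrics.update({m: v for m, v in trainer.test(model, verbose=False)[0].items() if 'test' in m})
    return metrics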
src/plot_experiment_results.py CHANGED
@@ -12,7 +12,7 @@ import numpy as np
 palette = ['#83B8FE', '#FFA54C', '#94ED67', '#FF7FFF']
 
 
- def plot_metrics(df, title):
+ def plot_training_curves(df, split_type):
 # Clean the data
 df = df.dropna(how='all', axis=1)
 
@@ -37,6 +37,10 @@ def plot_metrics(df, title):
 ax[1].set_ylabel('Accuracy')
 ax[1].legend(loc='lower right')
 ax[1].grid(axis='both', alpha=0.5)
+ # Set limit to y-axis
+ ax[1].set_ylim(0, 1.0)
+ # Set y-axis to percentage
+ ax[1].yaxis.set_major_formatter(plt.matplotlib.ticker.PercentFormatter(1, decimals=0))
 
 # Plot training ROC-AUC
 ax[2].plot(epoch_data.index, epoch_data['train_roc_auc_epoch'], label='Training ROC-AUC')
@@ -44,16 +48,16 @@ def plot_metrics(df, title):
 ax[2].set_ylabel('ROC-AUC')
 ax[2].legend(loc='lower right')
 ax[2].grid(axis='both', alpha=0.5)
-
+ # Set limit to y-axis
+ ax[2].set_ylim(0, 1.0)
 # Set x-axis label
 ax[2].set_xlabel('Epoch')
 
- plt.title(title)
 plt.tight_layout()
- plt.savefig(f'plots/{title}_metrics.pdf', bbox_inches='tight')
+ plt.savefig(f'plots/training_metrics_{split_type}.pdf', bbox_inches='tight')
 
 
- def plot_report(df_cv, df_test, title=None):
+ def plot_performance_metrics(df_cv, df_test, title=None):
 
 # Extract and prepare CV data
 cv_data = df_cv[['model_type', 'fold', 'val_acc', 'val_roc_auc', 'test_acc', 'test_roc_auc', 'split_type']]
@@ -114,7 +118,13 @@ def plot_report(df_cv, df_test, title=None):
 
 # Plotting
 plt.figure(figsize=(12, 6))
- sns.barplot(data=combined_data, x='Metric', y='Score', hue='Split Type', errorbar='sd', palette=palette)
+ sns.barplot(
+ data=combined_data,
+ x='Metric',
+ y='Score',
+ hue='Split Type',
+ errorbar=('sd', 1),
+ palette=palette)
 plt.title('')
 plt.ylabel('')
 plt.xlabel('')
@@ -134,9 +144,9 @@ def plot_report(df_cv, df_test, title=None):
 if p.get_height() < 0.01:
 continue
 if i % 2 == 0:
- value = '{:.1f}%'.format(100 * p.get_height())
+ value = f'{p.get_height():.1%}'
 else:
- value = '{:.2f}'.format(p.get_height())
+ value = f'{p.get_height():.3f}'
 
 print(f'Plotting value: {p.get_height()} -> {value}')
 x = p.get_x() + p.get_width() / 2
@@ -146,6 +156,120 @@ def plot_report(df_cv, df_test, title=None):
 plt.savefig(f'plots/{title}.pdf', bbox_inches='tight')
 
 
+ def plot_ablation_study(report):
+ # Define the ablation study combinations
+ ablation_study_combinations = [
+ 'disabled smiles',
+ 'disabled poi',
+ 'disabled e3',
+ 'disabled cell',
+ 'disabled poi e3 smiles',
+ 'disabled poi e3 cell',
+ ]
+
+ for group in report['split_type'].unique():
+ baseline = report[report['disabled_embeddings'].isna()].copy()
+ baseline = baseline[baseline['split_type'] == group]
+ baseline['disabled_embeddings'] = 'all embeddings enabled'
+ # metrics_to_show = ['val_acc', 'test_acc']
+ metrics_to_show = ['test_acc']
+ # baseline = baseline.melt(id_vars=['fold', 'disabled_embeddings'], value_vars=metrics_to_show, var_name='metric', value_name='score')
+ baseline = baseline.melt(id_vars=['disabled_embeddings'], value_vars=metrics_to_show, var_name='metric', value_name='score')
+
+ print(f'Group: {group}, avg: {(0.755814 + 0.720930 + 0.732558) / 3:.1%}')
+ print(f'Group: {group}, avg: {(0.7558139562606812 + 0.7209302186965942 + 0.7325581312179565) / 3:.1%}')
+ print(baseline)
+
+ ablation_dfs = []
+ for disabled_embeddings in ablation_study_combinations:
+ if pd.isnull(disabled_embeddings):
+ continue
+ tmp = report[report['disabled_embeddings'] == disabled_embeddings].copy()
+ tmp = tmp[tmp['split_type'] == group]
+ # tmp = tmp.melt(id_vars=['fold', 'disabled_embeddings'], value_vars=metrics_to_show, var_name='metric', value_name='score')
+ tmp = tmp.melt(id_vars=['disabled_embeddings'], value_vars=metrics_to_show, var_name='metric', value_name='score')
+ ablation_dfs.append(tmp)
+ ablation_df = pd.concat(ablation_dfs)
+
+ # dummy_val_df = pd.DataFrame()
+ # tmp = report[report['split_type'] == group]
+ # dummy_val_df['score'] = tmp[['val_active_perc', 'val_inactive_perc']].max(axis=1)
+ # dummy_val_df['metric'] = 'val_acc'
+ # dummy_val_df['disabled_embeddings'] = 'dummy'
+
+ dummy_test_df = pd.DataFrame()
+ tmp = report[report['split_type'] == group]
+ dummy_test_df['score'] = tmp[['test_active_perc', 'test_inactive_perc']].max(axis=1)
+ dummy_test_df['metric'] = 'test_acc'
+ dummy_test_df['disabled_embeddings'] = 'dummy'
+
+ # dummy_df = pd.concat([dummy_val_df, dummy_test_df])
+ dummy_df = dummy_test_df
+
+ final_df = pd.concat([dummy_df, baseline, ablation_df])
+
+ final_df['metric'] = final_df['metric'].map({
+ 'val_acc': 'Validation Accuracy',
+ 'test_acc': 'Test Accuracy',
+ 'val_roc_auc': 'Val ROC-AUC',
+ 'test_roc_auc': 'Test ROC-AUC',
+ })
+
+ final_df['disabled_embeddings'] = final_df['disabled_embeddings'].map({
+ 'all embeddings enabled': 'All embeddings enabled',
+ 'dummy': 'Dummy model',
+ 'disabled smiles': 'Disabled compound information',
+ 'disabled e3': 'Disabled E3 information',
+ 'disabled poi': 'Disabled target information',
+ 'disabled cell': 'Disabled cell information',
+ 'disabled poi e3 smiles': 'Disabled compound, E3, and target info\n(only cell information left)',
+ 'disabled poi e3 cell': 'Disabled cell, E3, and target info\n(only compound information left)',
+ })
+
+ # Print final_df to latex
+ tmp = final_df.groupby(['disabled_embeddings', 'metric']).mean().round(3)
+ # Remove fold column to tmp
+ tmp = tmp.reset_index() #.drop('fold', axis=1)
+
+ # fig, ax = plt.subplots(figsize=(5, 5))
+ fig, ax = plt.subplots()
+
+ sns.barplot(data=final_df,
+ y='disabled_embeddings',
+ x='score',
+ hue='metric',
+ ax=ax,
+ errorbar=('sd', 1),
+ palette=sns.color_palette(palette, len(palette)),
+ saturation=1,
+ )
+
+ # ax.set_title(f'{group.replace("random", "standard")} CV split')
+ ax.grid(axis='x', alpha=0.5)
+ ax.tick_params(axis='y', rotation=0)
+ ax.set_xlim(0, 1.0)
+ ax.xaxis.set_major_formatter(plt.matplotlib.ticker.PercentFormatter(1, decimals=0))
+ ax.set_ylabel('')
+ ax.set_xlabel('')
+ # Set the legend outside the plot and below
+ # ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.08), ncol=2)
+ # Set the legend in the upper right corner
+ ax.legend(loc='upper right')
+
+ # For each bar, add the rotated value (as percentage), inside the bar
+ for i, p in enumerate(plt.gca().patches):
+ # TODO: For some reasons, there is an additional bar being added at
+ # the end of the plot... it's not in the dataframe
+ if i == len(plt.gca().patches) - 1:
+ continue
+ value = '{:.1f}%'.format(100 * p.get_width())
+ y = p.get_y() + p.get_height() / 2
+ x = 0.4 # p.get_height() - p.get_height() / 2
+ plt.annotate(value, (x, y), ha='center', va='center', color='black', fontsize=10, alpha=0.8)
+
+ plt.savefig(f'plots/ablation_study_{group}.pdf', bbox_inches='tight')
+
+
 def main():
 active_col = 'Active (Dmax 0.6, pDC50 6.0)'
 test_split = 0.1
@@ -156,28 +280,50 @@ def main():
 
 # Load the data
 reports = {
- 'cv_train': pd.read_csv(f'reports/report_cv_train_{report_base_name}.csv'),
- 'test': pd.read_csv(f'reports/report_test_{report_base_name}.csv'),
- 'ablation': pd.read_csv(f'reports/report_ablation_{report_base_name}.csv'),
- 'hparam': pd.read_csv(f'reports/report_hparam_{report_base_name}.csv'),
+ 'cv_train': pd.concat([
+ pd.read_csv(f'reports/cv_report_{report_base_name}_random.csv'),
+ pd.read_csv(f'reports/cv_report_{report_base_name}_uniprot.csv'),
+ pd.read_csv(f'reports/cv_report_{report_base_name}_tanimoto.csv'),
+ ]),
+ 'test': pd.concat([
+ pd.read_csv(f'reports/test_report_{report_base_name}_random.csv'),
+ pd.read_csv(f'reports/test_report_{report_base_name}_uniprot.csv'),
+ pd.read_csv(f'reports/test_report_{report_base_name}_tanimoto.csv'),
+ ]),
+ 'ablation': pd.concat([
+ pd.read_csv(f'reports/ablation_report_{report_base_name}_random.csv'),
+ pd.read_csv(f'reports/ablation_report_{report_base_name}_uniprot.csv'),
+ pd.read_csv(f'reports/ablation_report_{report_base_name}_tanimoto.csv'),
+ ]),
+ 'hparam': pd.concat([
+ pd.read_csv(f'reports/hparam_report_{report_base_name}_random.csv'),
+ pd.read_csv(f'reports/hparam_report_{report_base_name}_uniprot.csv'),
+ pd.read_csv(f'reports/hparam_report_{report_base_name}_tanimoto.csv'),
+ ]),
 }
 
 
- # metrics = {}
- # for i in range(n_models_for_test):
- # for split_type in ['random', 'tanimoto', 'uniprot', 'e3_ligase']:
- # logs_dir = f'logs_{report_base_name}_{split_type}_best_model_n{i}'
- # metrics[f'{split_type}_{i}'] = pd.read_csv(f'logs/{logs_dir}/{logs_dir}/metrics.csv')
- # metrics[f'{split_type}_{i}']['model_id'] = i
- # # Rename 'val_' columns to 'test_' columns
- # metrics[f'{split_type}_{i}'] = metrics[f'{split_type}_{i}'].rename(columns={'val_loss': 'test_loss', 'val_acc': 'test_acc', 'val_roc_auc': 'test_roc_auc'})
+ metrics = {}
+ for i in range(n_models_for_test):
+ for split_type in ['random', 'tanimoto', 'uniprot', 'e3_ligase']:
+ logs_dir = f'logs_{report_base_name}_{split_type}_best_model_n{i}'
+ metrics[f'{split_type}_{i}'] = pd.read_csv(f'logs/{logs_dir}/{logs_dir}/metrics.csv')
+ metrics[f'{split_type}_{i}']['model_id'] = i
+ # Rename 'val_' columns to 'test_' columns
+ metrics[f'{split_type}_{i}'] = metrics[f'{split_type}_{i}'].rename(columns={'val_loss': 'test_loss', 'val_acc': 'test_acc', 'val_roc_auc': 'test_roc_auc'})
 
- # plot_metrics(metrics[f'{split_type}_{i}'], f'{split_type}_{i}')
+ plot_training_curves(metrics[f'{split_type}_{i}'], f'{split_type}_{i}')
 
 
 df_val = reports['cv_train']
 df_test = reports['test']
- plot_report(df_val, df_test, title=f'{active_name}_metrics')
+ plot_performance_metrics(df_val, df_test, title=f'{active_name}_metrics')
+
+ reports['test']['disabled_embeddings'] = pd.NA
+ plot_ablation_study(pd.concat([
+ reports['ablation'],
+ reports['test'],
+ ]))
 
 
 if __name__ == '__main__':