{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Curation"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# from IPython.display import display_html\n",
"\n",
"import logging\n",
"import warnings\n",
"import re\n",
"import os\n",
"import numpy as np\n",
"import pandas as pd\n",
"import pickle\n",
"import pickle\n",
"import requests\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from rdkit import Chem\n",
"from rdkit.Chem import AllChem\n",
"from typing import Literal, Union, List, Dict, Any, Callable\n",
"from collections import defaultdict\n",
"from tqdm.auto import tqdm\n",
"from rdkit import RDLogger\n",
"\n",
"RDLogger.DisableLog('rdApp.*')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Filter out some warnings..."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def set_global_logging_level(level=logging.ERROR, prefices=[\"\"]):\n",
" \"\"\"\n",
" Override logging levels of different modules based on their name as a prefix.\n",
" It needs to be invoked after the modules have been loaded so that their loggers have been initialized.\n",
"\n",
" Args:\n",
" - level: desired level. e.g. logging.INFO. Optional. Default is logging.ERROR\n",
" - prefices: list of one or more str prefices to match (e.g. [\"transformers\", \"torch\"]). Optional.\n",
" Default is `[\"\"]` to match all active loggers.\n",
" The match is a case-sensitive `module_name.startswith(prefix)`\n",
" \"\"\"\n",
" prefix_re = re.compile(fr'^(?:{ \"|\".join(prefices) })')\n",
" for name in logging.root.manager.loggerDict:\n",
" if re.match(prefix_re, name):\n",
" logging.getLogger(name).setLevel(level)\n",
"\n",
"\n",
"# Filter out annoying Pytorch Lightning printouts\n",
"warnings.filterwarnings('ignore')\n",
"warnings.filterwarnings(\n",
" 'ignore', '.*Covariance of the parameters could not be estimated.*')\n",
"warnings.filterwarnings(\n",
" 'ignore', '.*You seem to be using the pipelines sequentially on GPU.*')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Setup working directories:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"data_dir = os.path.join(os.getcwd(), '..', 'data')\n",
"src_dir = os.path.join(os.getcwd(), '..', 'src')\n",
"fig_dir = os.path.join(data_dir, 'figures')\n",
"checkpoint_dir = os.path.join(os.getcwd(), '..', 'checkpoints')\n",
"dirs_to_make = [\n",
" data_dir,\n",
" os.path.join(data_dir, 'raw'),\n",
" os.path.join(data_dir, 'processed'),\n",
" fig_dir,\n",
" # os.path.join(data_dir, 'train'),\n",
" # os.path.join(data_dir, 'val'),\n",
" # os.path.join(data_dir, 'test'),\n",
" # src_dir,\n",
" # checkpoint_dir,\n",
"]\n",
"for d in dirs_to_make:\n",
" if not os.path.exists(d):\n",
" os.makedirs(d)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Download or load the raw PROTAC-DB dataset:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loaded protac.csv\n"
]
}
],
"source": [
"protacdb_file = os.path.join(data_dir, 'PROTAC-DB.csv')\n",
"protacdb_url = 'http://cadd.zju.edu.cn/protacdb/statics/binaryDownload/csv/protac/protac.csv'\n",
"if os.path.exists(protacdb_file):\n",
" protac_df = pd.read_csv(protacdb_file).reset_index(drop=True)\n",
" print(f'Loaded protac.csv')\n",
"else:\n",
" print(f'Downloading {protacdb_url}')\n",
" !wget {protacdb_url} {protacdb_file}\n",
" protac_df = pd.read_csv(protacdb_file).reset_index(drop=True)\n",
" print(f'PROTAC-DB loaded')\n",
"\n",
"old2new = {\n",
" 'E3 ligase': 'E3 Ligase',\n",
"}\n",
"protac_df = protac_df.rename(columns=old2new)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
Compound ID
\n",
"
Target
\n",
"
Percent degradation (%)
\n",
"
Article DOI
\n",
"
DC50 (nM)
\n",
"
Dmax (%)
\n",
"
Assay (DC50/Dmax)
\n",
"
Assay (Percent degradation)
\n",
"
\n",
" \n",
" \n",
"
\n",
"
0
\n",
"
1
\n",
"
BRD7
\n",
"
20/12 (WB)
\n",
"
10.1021/acs.jmedchem.8b01413
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
% BRD7 degradation in HeLa cells after 4/16 h ...
\n",
"
\n",
"
\n",
"
1
\n",
"
1
\n",
"
BRD9
\n",
"
19/30 (WB)
\n",
"
10.1021/acs.jmedchem.8b01413
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
% BRD9 degradation in HeLa cells after 4/16 h ...
\n",
"
\n",
"
\n",
"
2
\n",
"
2
\n",
"
BRD7
\n",
"
19/27 (WB)
\n",
"
10.1021/acs.jmedchem.8b01413
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
% BRD7 degradation in HeLa cells after 4/16 h ...
\n",
"
\n",
"
\n",
"
3
\n",
"
2
\n",
"
BRD9
\n",
"
5/21 (WB)
\n",
"
10.1021/acs.jmedchem.8b01413
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
% BRD9 degradation in HeLa cells after 4/16 h ...
\n",
"
\n",
"
\n",
"
4
\n",
"
3
\n",
"
BRD9
\n",
"
94/93 (WB)
\n",
"
10.1021/acs.jmedchem.8b01413
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
% BRD9 degradation in HeLa cells after 4/16 h ...
\n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Compound ID Target Percent degradation (%) Article DOI \\\n",
"0 1 BRD7 20/12 (WB) 10.1021/acs.jmedchem.8b01413 \n",
"1 1 BRD9 19/30 (WB) 10.1021/acs.jmedchem.8b01413 \n",
"2 2 BRD7 19/27 (WB) 10.1021/acs.jmedchem.8b01413 \n",
"3 2 BRD9 5/21 (WB) 10.1021/acs.jmedchem.8b01413 \n",
"4 3 BRD9 94/93 (WB) 10.1021/acs.jmedchem.8b01413 \n",
"\n",
" DC50 (nM) Dmax (%) Assay (DC50/Dmax) \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
"\n",
" Assay (Percent degradation) \n",
"0 % BRD7 degradation in HeLa cells after 4/16 h ... \n",
"1 % BRD9 degradation in HeLa cells after 4/16 h ... \n",
"2 % BRD7 degradation in HeLa cells after 4/16 h ... \n",
"3 % BRD9 degradation in HeLa cells after 4/16 h ... \n",
"4 % BRD9 degradation in HeLa cells after 4/16 h ... "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"PROTAC-DB scraped len: 6000\n"
]
}
],
"source": [
"scraped_protac_df = pd.read_csv(os.path.join(\n",
" data_dir, 'PROTAC-DB-Scraped.csv'))\n",
"# Rename columns\n",
"old2new = {\n",
" \"Assay (Percent degradation)\": \"Percent degradation (%)\",\n",
" \"PROTAC to Target\": \"Assay (Percent degradation)\",\n",
" \"DOI\": \"Article DOI\",\n",
" \"DC50\": \"DC50 (nM)\",\n",
" \"Dmax\": \"Dmax (%)\",\n",
"}\n",
"scraped_protac_df = scraped_protac_df.rename(columns=old2new)\n",
"\n",
"display(scraped_protac_df.head())\n",
"print(f'PROTAC-DB scraped len: {len(scraped_protac_df)}')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Merged df len: 5343\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
Compound ID
\n",
"
Uniprot
\n",
"
Target
\n",
"
E3 Ligase
\n",
"
PDB
\n",
"
Name
\n",
"
Smiles
\n",
"
IC50 (nM, Protac to Target)
\n",
"
Assay (Protac to Target, IC50)
\n",
"
EC50 (nM, Protac to Target)
\n",
"
...
\n",
"
Rotatable Bond Count
\n",
"
Topological Polar Surface Area
\n",
"
Molecular Formula
\n",
"
InChI
\n",
"
InChI Key
\n",
"
Percent degradation (%)
\n",
"
DC50 (nM)
\n",
"
Dmax (%)
\n",
"
Assay (DC50/Dmax)
\n",
"
Assay (Percent degradation)
\n",
"
\n",
" \n",
" \n",
"
\n",
"
0
\n",
"
1
\n",
"
Q9NPI1
\n",
"
BRD7
\n",
"
VHL
\n",
"
NaN
\n",
"
NaN
\n",
"
COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
...
\n",
"
19
\n",
"
189.92
\n",
"
C50H64N8O9S
\n",
"
InChI=1S/C50H64N8O9S/c1-32-45(68-31-53-32)34-1...
\n",
"
RPMQBLMPGMFXLD-PDUNVWSESA-N
\n",
"
20/12 (WB)
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
% BRD7 degradation in HeLa cells after 4/16 h ...
\n",
"
\n",
"
\n",
"
1
\n",
"
1
\n",
"
Q9H8M2
\n",
"
BRD9
\n",
"
VHL
\n",
"
NaN
\n",
"
NaN
\n",
"
COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
...
\n",
"
19
\n",
"
189.92
\n",
"
C50H64N8O9S
\n",
"
InChI=1S/C50H64N8O9S/c1-32-45(68-31-53-32)34-1...
\n",
"
RPMQBLMPGMFXLD-PDUNVWSESA-N
\n",
"
19/30 (WB)
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
% BRD9 degradation in HeLa cells after 4/16 h ...
\n",
"
\n",
"
\n",
"
2
\n",
"
2
\n",
"
Q9NPI1
\n",
"
BRD7
\n",
"
VHL
\n",
"
NaN
\n",
"
NaN
\n",
"
COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
...
\n",
"
25
\n",
"
208.38
\n",
"
C54H72N8O11S
\n",
"
InChI=1S/C54H72N8O11S/c1-36-49(74-35-57-36)38-...
\n",
"
NGWWVKZONFCNQP-SHPBXJAASA-N
\n",
"
19/27 (WB)
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
% BRD7 degradation in HeLa cells after 4/16 h ...
\n",
"
\n",
"
\n",
"
3
\n",
"
2
\n",
"
Q9H8M2
\n",
"
BRD9
\n",
"
VHL
\n",
"
NaN
\n",
"
NaN
\n",
"
COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
...
\n",
"
25
\n",
"
208.38
\n",
"
C54H72N8O11S
\n",
"
InChI=1S/C54H72N8O11S/c1-36-49(74-35-57-36)38-...
\n",
"
NGWWVKZONFCNQP-SHPBXJAASA-N
\n",
"
5/21 (WB)
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
% BRD9 degradation in HeLa cells after 4/16 h ...
\n",
"
\n",
"
\n",
"
4
\n",
"
3
\n",
"
Q9H8M2
\n",
"
BRD9
\n",
"
CRBN
\n",
"
NaN
\n",
"
NaN
\n",
"
COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
...
\n",
"
18
\n",
"
202.97
\n",
"
C43H50N8O10
\n",
"
InChI=1S/C43H50N8O10/c1-48-24-31(28-9-10-44-23...
\n",
"
RMBNUDOJPQLHMV-UHFFFAOYSA-N
\n",
"
94/93 (WB)
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
% BRD9 degradation in HeLa cells after 4/16 h ...
\n",
"
\n",
" \n",
"
\n",
"
5 rows × 89 columns
\n",
"
"
],
"text/plain": [
" Compound ID Uniprot Target E3 Ligase PDB Name \\\n",
"0 1 Q9NPI1 BRD7 VHL NaN NaN \n",
"1 1 Q9H8M2 BRD9 VHL NaN NaN \n",
"2 2 Q9NPI1 BRD7 VHL NaN NaN \n",
"3 2 Q9H8M2 BRD9 VHL NaN NaN \n",
"4 3 Q9H8M2 BRD9 CRBN NaN NaN \n",
"\n",
" Smiles \\\n",
"0 COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN... \n",
"1 COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN... \n",
"2 COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN... \n",
"3 COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN... \n",
"4 COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN... \n",
"\n",
" IC50 (nM, Protac to Target) Assay (Protac to Target, IC50) \\\n",
"0 NaN NaN \n",
"1 NaN NaN \n",
"2 NaN NaN \n",
"3 NaN NaN \n",
"4 NaN NaN \n",
"\n",
" EC50 (nM, Protac to Target) ... Rotatable Bond Count \\\n",
"0 NaN ... 19 \n",
"1 NaN ... 19 \n",
"2 NaN ... 25 \n",
"3 NaN ... 25 \n",
"4 NaN ... 18 \n",
"\n",
" Topological Polar Surface Area Molecular Formula \\\n",
"0 189.92 C50H64N8O9S \n",
"1 189.92 C50H64N8O9S \n",
"2 208.38 C54H72N8O11S \n",
"3 208.38 C54H72N8O11S \n",
"4 202.97 C43H50N8O10 \n",
"\n",
" InChI \\\n",
"0 InChI=1S/C50H64N8O9S/c1-32-45(68-31-53-32)34-1... \n",
"1 InChI=1S/C50H64N8O9S/c1-32-45(68-31-53-32)34-1... \n",
"2 InChI=1S/C54H72N8O11S/c1-36-49(74-35-57-36)38-... \n",
"3 InChI=1S/C54H72N8O11S/c1-36-49(74-35-57-36)38-... \n",
"4 InChI=1S/C43H50N8O10/c1-48-24-31(28-9-10-44-23... \n",
"\n",
" InChI Key Percent degradation (%) DC50 (nM) Dmax (%) \\\n",
"0 RPMQBLMPGMFXLD-PDUNVWSESA-N 20/12 (WB) NaN NaN \n",
"1 RPMQBLMPGMFXLD-PDUNVWSESA-N 19/30 (WB) NaN NaN \n",
"2 NGWWVKZONFCNQP-SHPBXJAASA-N 19/27 (WB) NaN NaN \n",
"3 NGWWVKZONFCNQP-SHPBXJAASA-N 5/21 (WB) NaN NaN \n",
"4 RMBNUDOJPQLHMV-UHFFFAOYSA-N 94/93 (WB) NaN NaN \n",
"\n",
" Assay (DC50/Dmax) Assay (Percent degradation) \n",
"0 NaN % BRD7 degradation in HeLa cells after 4/16 h ... \n",
"1 NaN % BRD9 degradation in HeLa cells after 4/16 h ... \n",
"2 NaN % BRD7 degradation in HeLa cells after 4/16 h ... \n",
"3 NaN % BRD9 degradation in HeLa cells after 4/16 h ... \n",
"4 NaN % BRD9 degradation in HeLa cells after 4/16 h ... \n",
"\n",
"[5 rows x 89 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Merge scraped data with PROTAC-DB on Compound ID and get non-assay columns\n",
"param_cols = [\n",
" \"Percent degradation (%)\",\n",
" \"Dmax (%)\",\n",
" \"DC50 (nM)\",\n",
" \"Assay (Percent degradation)\",\n",
" \"Assay (DC50/Dmax)\",\n",
"]\n",
"cols = [c for c in protac_df.columns if c not in param_cols]\n",
"on_cols = [c for c in scraped_protac_df.columns if c not in param_cols]\n",
"scraped_protac_df = protac_df[cols].merge(\n",
" scraped_protac_df,\n",
" on=on_cols,\n",
").drop_duplicates()\n",
"print(f'Merged df len: {len(scraped_protac_df)}')\n",
"scraped_protac_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"362\n",
"737\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
Percent degradation (%)
\n",
"
Assay (Percent degradation)
\n",
"
\n",
" \n",
" \n",
"
\n",
"
0
\n",
"
20/12 (WB)
\n",
"
% BRD7 degradation in HeLa cells after 4/16 h ...
\n",
"
\n",
"
\n",
"
1
\n",
"
19/30 (WB)
\n",
"
% BRD9 degradation in HeLa cells after 4/16 h ...
\n",
"
\n",
"
\n",
"
2
\n",
"
19/27 (WB)
\n",
"
% BRD7 degradation in HeLa cells after 4/16 h ...
\n",
"
\n",
"
\n",
"
3
\n",
"
5/21 (WB)
\n",
"
% BRD9 degradation in HeLa cells after 4/16 h ...
\n",
"
\n",
"
\n",
"
4
\n",
"
94/93 (WB)
\n",
"
% BRD9 degradation in HeLa cells after 4/16 h ...
\n",
"
\n",
"
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
\n",
"
\n",
"
4861
\n",
"
54 (WB)
\n",
"
% JAK3 degradation in MHH-CALL-4 cells at 100 nM
\n",
"
\n",
"
\n",
"
4863
\n",
"
42 (WB)
\n",
"
% JAK3 degradation in MHH-CALL-4 cells at 100 nM
\n",
"
\n",
"
\n",
"
4864
\n",
"
1/32/69/19 (WB)
\n",
"
% JAK2 degradation in MHH-CALL-4 cells at 1/10...
\n",
"
\n",
"
\n",
"
4881
\n",
"
0/30/30/53/29 (WB)
\n",
"
% EGFR del19 degradation in HCC827 cells at 5/...
\n",
"
\n",
"
\n",
"
4882
\n",
"
10/7/7/27/34 (WB)
\n",
"
% EGFR EGFR L858R/T790M degradation in H1975 c...
\n",
"
\n",
" \n",
"
\n",
"
334 rows × 2 columns
\n",
"
"
],
"text/plain": [
" Percent degradation (%) \\\n",
"0 20/12 (WB) \n",
"1 19/30 (WB) \n",
"2 19/27 (WB) \n",
"3 5/21 (WB) \n",
"4 94/93 (WB) \n",
"... ... \n",
"4861 54 (WB) \n",
"4863 42 (WB) \n",
"4864 1/32/69/19 (WB) \n",
"4881 0/30/30/53/29 (WB) \n",
"4882 10/7/7/27/34 (WB) \n",
"\n",
" Assay (Percent degradation) \n",
"0 % BRD7 degradation in HeLa cells after 4/16 h ... \n",
"1 % BRD9 degradation in HeLa cells after 4/16 h ... \n",
"2 % BRD7 degradation in HeLa cells after 4/16 h ... \n",
"3 % BRD9 degradation in HeLa cells after 4/16 h ... \n",
"4 % BRD9 degradation in HeLa cells after 4/16 h ... \n",
"... ... \n",
"4861 % JAK3 degradation in MHH-CALL-4 cells at 100 nM \n",
"4863 % JAK3 degradation in MHH-CALL-4 cells at 100 nM \n",
"4864 % JAK2 degradation in MHH-CALL-4 cells at 1/10... \n",
"4881 % EGFR del19 degradation in HCC827 cells at 5/... \n",
"4882 % EGFR EGFR L858R/T790M degradation in H1975 c... \n",
"\n",
"[334 rows x 2 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(len(scraped_protac_df.dropna(\n",
" subset=['DC50 (nM)', 'Dmax (%)']).dropna(how='all')))\n",
"print(len(scraped_protac_df.dropna(\n",
" subset=['Percent degradation (%)']).dropna(how='all')))\n",
"\n",
"tmp = scraped_protac_df.dropna(subset=['Percent degradation (%)'])\n",
"\n",
"tmp[tmp['Percent degradation (%)'].str.contains(\n",
" 'WB')][['Percent degradation (%)', 'Assay (Percent degradation)']].drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"protac_pedia_df len: 1203\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
PROTACDB ID
\n",
"
PROTAC SMILES
\n",
"
Active/Inactive
\n",
"
Best PROTAC
\n",
"
Cells
\n",
"
cLogP
\n",
"
Comments
\n",
"
Curator
\n",
"
Dc50
\n",
"
Dmax
\n",
"
...
\n",
"
Proteomics Data Available
\n",
"
Secondary Pubmed
\n",
"
Status
\n",
"
Target
\n",
"
Tested A Non Binding E3 Control
\n",
"
Tested Competition With Ligand
\n",
"
Tested Engagement In Cells
\n",
"
Tested Proteaseome Inhibitor
\n",
"
Time
\n",
"
TPSA
\n",
"
\n",
" \n",
" \n",
"
\n",
"
0
\n",
"
1
\n",
"
Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...
\n",
"
Inactive
\n",
"
No
\n",
"
MOLT-4
\n",
"
10.83732
\n",
"
IC50's are for cell viability assays
\n",
"
Ronen Gabizon
\n",
"
NaN
\n",
"
NaN
\n",
"
...
\n",
"
No
\n",
"
NaN
\n",
"
Reviewed
\n",
"
Q07817
\n",
"
No
\n",
"
No
\n",
"
No
\n",
"
No
\n",
"
48
\n",
"
251.07
\n",
"
\n",
"
\n",
"
1
\n",
"
2
\n",
"
Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...
\n",
"
Inactive
\n",
"
No
\n",
"
MOLT-4
\n",
"
11.22742
\n",
"
IC50's are for cell viability assays
\n",
"
Ronen Gabizon
\n",
"
NaN
\n",
"
NaN
\n",
"
...
\n",
"
No
\n",
"
NaN
\n",
"
Reviewed
\n",
"
Q07817
\n",
"
No
\n",
"
No
\n",
"
No
\n",
"
No
\n",
"
48
\n",
"
251.07
\n",
"
\n",
"
\n",
"
2
\n",
"
3
\n",
"
Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...
\n",
"
Inactive
\n",
"
No
\n",
"
MOLT-4
\n",
"
11.61752
\n",
"
IC50's are for cell viability assays
\n",
"
Ronen Gabizon
\n",
"
NaN
\n",
"
NaN
\n",
"
...
\n",
"
No
\n",
"
NaN
\n",
"
Reviewed
\n",
"
Q07817
\n",
"
No
\n",
"
No
\n",
"
No
\n",
"
No
\n",
"
48
\n",
"
251.07
\n",
"
\n",
"
\n",
"
3
\n",
"
4
\n",
"
Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...
\n",
"
Active
\n",
"
No
\n",
"
MOLT-4
\n",
"
12.00762
\n",
"
IC50's are for cell viability assays
\n",
"
Ronen Gabizon
\n",
"
NaN
\n",
"
NaN
\n",
"
...
\n",
"
No
\n",
"
NaN
\n",
"
Reviewed
\n",
"
Q07817
\n",
"
No
\n",
"
No
\n",
"
No
\n",
"
No
\n",
"
48
\n",
"
251.07
\n",
"
\n",
"
\n",
"
4
\n",
"
5
\n",
"
Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...
\n",
"
Active
\n",
"
No
\n",
"
MOLT-4
\n",
"
12.39772
\n",
"
IC50's are for cell viability assays
\n",
"
Ronen Gabizon
\n",
"
53 nM
\n",
"
~ 100 %
\n",
"
...
\n",
"
No
\n",
"
NaN
\n",
"
Reviewed
\n",
"
Q07817
\n",
"
No
\n",
"
No
\n",
"
Yes
\n",
"
No
\n",
"
48
\n",
"
251.07
\n",
"
\n",
" \n",
"
\n",
"
5 rows × 43 columns
\n",
"
"
],
"text/plain": [
" PROTACDB ID PROTAC SMILES \\\n",
"0 1 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"1 2 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"2 3 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"3 4 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"4 5 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"\n",
" Active/Inactive Best PROTAC Cells cLogP \\\n",
"0 Inactive No MOLT-4 10.83732 \n",
"1 Inactive No MOLT-4 11.22742 \n",
"2 Inactive No MOLT-4 11.61752 \n",
"3 Active No MOLT-4 12.00762 \n",
"4 Active No MOLT-4 12.39772 \n",
"\n",
" Comments Curator Dc50 Dmax ... \\\n",
"0 IC50's are for cell viability assays Ronen Gabizon NaN NaN ... \n",
"1 IC50's are for cell viability assays Ronen Gabizon NaN NaN ... \n",
"2 IC50's are for cell viability assays Ronen Gabizon NaN NaN ... \n",
"3 IC50's are for cell viability assays Ronen Gabizon NaN NaN ... \n",
"4 IC50's are for cell viability assays Ronen Gabizon 53 nM ~ 100 % ... \n",
"\n",
" Proteomics Data Available Secondary Pubmed Status Target \\\n",
"0 No NaN Reviewed Q07817 \n",
"1 No NaN Reviewed Q07817 \n",
"2 No NaN Reviewed Q07817 \n",
"3 No NaN Reviewed Q07817 \n",
"4 No NaN Reviewed Q07817 \n",
"\n",
" Tested A Non Binding E3 Control Tested Competition With Ligand \\\n",
"0 No No \n",
"1 No No \n",
"2 No No \n",
"3 No No \n",
"4 No No \n",
"\n",
" Tested Engagement In Cells Tested Proteaseome Inhibitor Time TPSA \n",
"0 No No 48 251.07 \n",
"1 No No 48 251.07 \n",
"2 No No 48 251.07 \n",
"3 No No 48 251.07 \n",
"4 Yes No 48 251.07 \n",
"\n",
"[5 rows x 43 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_file = os.path.join(data_dir, 'PROTAC-Pedia.csv')\n",
"protac_pedia_df = pd.read_csv(df_file)\n",
"print(f'protac_pedia_df len: {len(protac_pedia_df)}')\n",
"protac_pedia_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Utilities"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[-100.0, -5.0, nan, 90.317, 1000.0, nan]\n",
"[0.0]\n",
"[96.0, 73.0]\n",
"[1.0, 3.14]\n"
]
}
],
"source": [
"def clean_string(s: str) -> str:\n",
" \"\"\" Clean a string by removing <, >, =, NaN, and ranges like 100-200.\n",
" Args:\n",
" s(str): string to clean\n",
" Returns:\n",
" str: cleaned string\n",
" \"\"\"\n",
" if pd.isnull(s) or s in {'nan', 'n/a', 'NaN', ''}:\n",
" return np.nan\n",
" if 'N.D.' in s:\n",
" return '0'\n",
" s = s.strip('(WB)').strip()\n",
" # # Combine regex operations for efficiency\n",
" # s = re.sub(r'[<=>]|NaN|[\\d]+[-~]', '', s) # Remove <, >, =, NaN, and ranges like 100-200\n",
" # Remove <, >, =, NaN\n",
" s = re.sub(r'[<=>]|NaN', '', s)\n",
" # Replace ranges like 100-200 or 1~3 with the left-most value in the range\n",
" s = re.sub(r'\\b(\\d+)[-~]\\d+\\b', r'\\1', s)\n",
" # Replace (n/a) with nan\n",
" s = s.replace('(n/a)', 'nan')\n",
" s = re.sub(r'[~<=>% ]', '', s) # Remove ~, <, >, =, % and spaces\n",
" return s\n",
"\n",
"\n",
"def split_clean_str(s: str, return_floats: bool = False) -> Union[List[str], List[float]]:\n",
" \"\"\" Split a string by '/' and clean each part.\n",
" Args:\n",
" s(str): string to split\n",
" return_floats(bool): whether to return floats or strings\n",
" Returns:\n",
" list: list of cleaned strings or floats\n",
" \"\"\"\n",
" if pd.isnull(s) or s in {'nan', 'n/a', 'NaN', ''}:\n",
" return np.nan\n",
" cleaned_values = [clean_string(part.strip())\n",
" for part in s.replace('(n/a)', 'nan').split('/')]\n",
" return [float(value) if return_floats else value for value in cleaned_values]\n",
"\n",
"\n",
"print(split_clean_str('-100-200/-5/(n/a)/<=90.317/>1000/NaN', return_floats=True))\n",
"print(split_clean_str('N.D.', return_floats=True))\n",
"print(split_clean_str('96/73 (WB)', return_floats=True))\n",
"print(split_clean_str('1.0~3/3.14', return_floats=True))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n",
"False\n"
]
}
],
"source": [
"def is_active(DC50: float, Dmax: float, oring=False) -> bool:\n",
" \"\"\" Check if a PROTAC is active based on DC50 and Dmax.\t\n",
" Args:\n",
" DC50(float): DC50 in nM\n",
" Dmax(float): Dmax in %\n",
" Returns:\n",
" bool: True if active, False if inactive, np.nan if either DC50 or Dmax is NaN\n",
" \"\"\"\n",
" pDC50 = -np.log10(DC50 * 1e-9) if pd.notnull(DC50) else np.nan\n",
" Dmax = Dmax / 100\n",
" if pd.notnull(pDC50):\n",
" if pDC50 < 7.0:\n",
" return False\n",
" if pd.notnull(Dmax):\n",
" if Dmax < 0.8:\n",
" return False\n",
" if oring:\n",
" if pd.notnull(pDC50):\n",
" return True if pDC50 >= 7.0 else False\n",
" elif pd.notnull(Dmax):\n",
" return True if Dmax >= 0.8 else False\n",
" else:\n",
" return np.nan\n",
" else:\n",
" if pd.notnull(pDC50) and pd.notnull(Dmax):\n",
" return True if pDC50 >= 7.0 and Dmax >= 0.8 else False\n",
" else:\n",
" return np.nan\n",
"\n",
"\n",
"print(is_active(20, 80))\n",
"print(is_active(100, 70))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Clean PROTAC-Pedia"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Rename columns in PROTAC-Pedia dataframe\n",
"old2new = {\n",
" \"Cells\": \"Cell Type\",\n",
" \"MW\": \"Molecular Weight\",\n",
" \"TPSA\": \"Topological Polar Surface Area\",\n",
" \"Time\": \"Treatment Time (h)\",\n",
" \"Dc50\": \"DC50 (nM)\",\n",
" \"Dmax\": \"Dmax (%)\",\n",
" \"Hbond acceptors\": \"Hydrogen Bond Acceptor Count\",\n",
" \"Hbond donors\": \"Hydrogen Bond Donor Count\",\n",
" \"PROTACDB ID\": \"Compound ID\",\n",
" \"PROTAC SMILES\": \"Smiles\",\n",
" 'Target': 'Uniprot',\n",
"}\n",
"protac_pedia_df = protac_pedia_df.rename(columns=old2new)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Rename Cereblon to CRBN in E3 Ligase column\n",
"protac_pedia_df['E3 Ligase'] = protac_pedia_df['E3 Ligase'].replace(\n",
" 'Cereblon', 'CRBN')\n",
"protac_pedia_df['E3 Ligase'] = protac_pedia_df['E3 Ligase'].str.replace('Iap', 'IAP')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of rows with single cell types: 925\n"
]
}
],
"source": [
"# Get all rows that do not contain \", \" nor \"; \" in \"Cell Type\" column\n",
"multiple_cells_idx = protac_pedia_df['Cell Type'].str.contains(\n",
" ', |; |\\sand\\s|/', regex=True, na=False,\n",
")\n",
"nan_comments_idx = protac_pedia_df['Comments'].isna()\n",
"protac_pedia_df[multiple_cells_idx & ~\n",
" nan_comments_idx][['Compound ID', 'Cell Type', 'Comments', 'DC50 (nM)', 'Dmax (%)']].to_csv(\n",
" os.path.join(data_dir, 'processed', 'multiple_cell_types.csv'), index=False)\n",
"\n",
"multiple_cells_df = protac_pedia_df[multiple_cells_idx].copy()\n",
"\n",
"protac_pedia_df = protac_pedia_df[~multiple_cells_idx]\n",
"print(f'Number of rows with single cell types: {len(protac_pedia_df)}')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Compound ID', 'Smiles', 'Active/Inactive', 'Best PROTAC', 'Cell Type',\n",
" 'cLogP', 'Comments', 'Curator', 'DC50 (nM)', 'Dmax (%)',\n",
" 'E3 Binder SMILES', 'E3 Ligase', 'Ec50 of Ligand Cells',\n",
" 'Ec50 of PROTAC Cells', 'exit_vector', 'Hydrogen Bond Acceptor Count',\n",
" 'Hydrogen Bond Donor Count', 'Ic50 of Ligand', 'Ic50 of PROTAC',\n",
" 'Ligand Name', 'Ligand SMILES', 'Linker', 'Linker Type', 'linker_ha',\n",
" 'linker_no', 'linker_rb', 'Molecular Weight', 'Off Targets Reported',\n",
" 'PATENT', 'Ligand PDB', 'Ligand ID', 'Pubmed', 'PROTAC Name',\n",
" 'Proteomics Data Available', 'Secondary Pubmed', 'Status', 'Uniprot',\n",
" 'Tested A Non Binding E3 Control', 'Tested Competition With Ligand',\n",
" 'Tested Engagement In Cells', 'Tested Proteaseome Inhibitor',\n",
" 'Treatment Time (h)', 'Topological Polar Surface Area'],\n",
" dtype='object')"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"multiple_cells_df.columns"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"RS4;11, MOLM-13\n",
"MV4-11, MOLM-13, KG1\n",
"MOLT-4, MOLM14\n",
"MOLM14, MV4-11\n",
"Panc02.13, H23, A549, H1792, H2110, HCC827\n",
"NAMALWA, XLA\n",
"MV4;11, SUM149, SUM159, MOLM13, MM1.SCRBN−/−, MM1.SW\n",
"MV4;11, 293FT, 293FTCRBN−/−\n",
"MV-4-11 SK-MEL-5, NCI-H1568\n",
"HeLa, DLBCL\n",
"Jurkat and Panc Tu‐I\n",
"HeLa, Mino, Jeko-1, HUVEC, MDA-MB-231, MM.1S\n",
"Hela, MM.1S\n",
"HeLa, MV4;11, A549\n",
"AML cells, CRBN-/-\n",
"SU-DHL-1 and NCI-H2228\n",
"MCF-7, MM.1S\n",
"DU145/Cy\n",
"PC3, MDA-MB-231\n",
"Namalwa, CA-46, Ramos\n",
"HBL1, Ramos, Mino, IgE MM\n",
"Ramos, THP-1\n",
"MV4-11, MOLM-14\n",
"MDA-MB-231, HeLa\n",
"HEK293, CAMA-1, ZR-75-1\n",
"MiaPaCa2, HPNE\n",
"MCF-7, MDA-MB-231, HepG2, LO2, B16\n",
"Karpas 422,ULA, SUDHL4, OCI-Ly1, Ramos\n",
"SR, H2228, NCI–H69, NCI–H1688, NCI–H446\n",
"OVCAR8 (WT EGFR), HeLa (EGFR Exon 20 Ins), SKBr3 (HER2)\n",
"HCC827 (Exon 19 del), H3255 (L858R)\n",
"MDA-MB-231, GTL16\n",
"RS4;11, MV4;11, MOLM-13,\n",
"MCF-7, U2OS, MDA-MB-231, MDA-MB-435, Flag-Cdc20, MDA-MB-231, 22Rv1, LNCaP\n",
"Hella, RI-1\n",
"Hella, RI-1, EOL-1, A-204\n",
"MM1S, HEK293T, CRBNY384A/W386A\n",
"Molm-16, SU-DHL-1\n",
"RS4;11, MV4;11\n",
"LNCaP, VCaP, and 22Rv\n",
"MV4-11, Molm-13\n",
"Mino, Ramos\n",
"MCF7, A549\n",
"HOP62/INC-H23\n",
"E14 mouse embryonic stem cells; Human colon cancer cell line HCT116\n",
"Pancreatic cancer cell lines BxPC-3; MIA PaCa-2\n",
"KYSE520 esophageal cancer cell line; MV4;11 cell line\n",
"HeLa, HL60, MV4;11\n",
"HeLa, HEK293, U2OS\n",
"MDA-MB-231, A549, A549/DDP, HUVEC\n",
"HeLa, HEK293\n",
"231MFP breast cancer cells, HeLa\n",
"K562, A549, HCT116, MCF-7, HEK293T\n",
"MM1S; Mouse 4935 cells\n",
"PC9; HCC827; H1975\n",
"T47D, BT474, MDAMB231, MCF7, HS578T, LS180, HCT116, HCT15, DLD1, SW480, LOVO, SW620, HEL, KCL22, THP1, K562, ML2, NOMO1, MONOMAC6, MOLM13, LAMA84, SHI1, MV411, KU812, KG1, KBM7, HL60, BV173, HUH7, NCIH460, A549, NCIH2228, PC9, NCIH446, HELA, T98G, MCF10A, HAP1, DU145, PC3, LNCaP, A375, LOXIMVI\n",
"5W1573; NCI-H2030; NCI-H358; HCT116; MIA-PaCa-2\n",
"Fibroblasts from patients; HeLa\n",
"SU-DHL-1; H3122\n",
"IgEMM; hTERT-RPE1; HeLa; Jurkat; RPMI8226; MM.1S\n",
"Jurkat; Molt4; Granta-519; Mino; Jeko; Rec1; Maver\n",
"Mice primary Sertoli cells and primary Germ cells\n",
"Ba/F3; H1975\n",
"Ba/F3\n",
"HT1080, IMR-32, MCF-7\n",
"HT1080, IMR-32\n",
"BxPC3, DLD-1, HCT-116, MDA-MB-231, A549, PC-9\n",
"MDA-MB-231; K562\n",
"MOLT4; BT549; HCC1806; COV362; Kuramochi; OVCAR8\n",
"SRD15; Huh7\n",
"SW480, HCT116\n",
"K562, BaF3\n",
"BBL358, T47D\n",
"A549; H1299; B16-F10; MDAMB231; Jurkat\n"
]
}
],
"source": [
"for c in multiple_cells_df['Cell Type'].unique():\n",
" print(c)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Direct degration studies in this paper only conducted for first and final compounds; during the med chem campaign cell viability was used as the read out. Complex structure with the ligand was modeled based on 4Z93. IC50 of the ligand is between 2nM and 7nM, depending on which BRD. DC50 is between 3nM-10nM.\n",
"RS4;11, MOLM-13\n",
"100 %\n",
"< 10 nM\n",
"--------------------\n",
"General kinase PROTAC, DCmax is for the most degraded kinase. IC50 of the ligand is for 193 kinases in the panel. IC50 of the PROTAC is by FLT3 kinase activity.\n",
"MOLT-4, MOLM14\n",
"> 85 %\n",
"< 100 nM\n",
"--------------------\n",
"DC50 is 9.1 nM (WT BTK, NAMALWA cells), 14.6 nM (WT BTK, XLA cells), 14.9 nM (C481S, XLA cells). IC50 of PROTAC is 46.9 (WT BTK), 20.9 (C481S). IC50 of ligand is 51.0 nM (WT BTK), 30.7 (C481S)\n",
"NAMALWA, XLA\n",
"> 99 %\n",
"< 14.9 nM\n",
"--------------------\n",
"EC50 of PROTAC is 28nM in MV-4-11, 68nM in NCI-H1568. DC50 is (SMARCA2 6nM, SMARCA4 11nM, PBRM1 32nM in MV-4-11; SMARCA2 3.3nM, PBRM1 15.6nM in NCI-H1568)\n",
"MV-4-11 SK-MEL-5, NCI-H1568\n",
"nan\n",
"< 32 nM\n",
"--------------------\n",
"DC50 value is for HeLa cells. DC50 is 0.61 uM for DLBCL cells. DMAX value is for HeLa cells. DMAX is 96 % for DLBCL cells.\n",
"HeLa, DLBCL\n",
"92 %\n",
"0.79 uM\n",
"--------------------\n",
"proteomics of PDEdelta degradation show upregulation of enzymes involved in lipid metabolism - deltasonamide 1 also causes this. Dmax was measured in Panc-Tu-1 cell line. DC50 is 83.4% (24 h, Panc-Tu-1), 85% (24 h, 1 uM, Jurkat). IC50 Deltasonamide 1 is 203 pM, and of the Bn derivative is 8 nM.\n",
"Jurkat and Panc Tu‐I\n",
"> 83.4 %\n",
"48 nM\n",
"--------------------\n",
"DC50 is 1-3nM\n",
"HeLa, MV4;11, A549\n",
"100 %\n",
"< 3 nM\n",
"--------------------\n",
"DC50 is 10-30nM\n",
"HeLa, MV4;11, A549\n",
"99 %\n",
"< 30 nM\n",
"--------------------\n",
"competition with ligand only slightly rescued protein degradation. EC50 of ligand and PROTAC is measured on SUDHL-1 cell line. DC50 is SU-DHL-1: 3 ± 1 nM, NCI-H2228: 34 ± 9 nM.\n",
"SU-DHL-1 and NCI-H2228\n",
"> 90 %\n",
"< 34 nM\n",
"--------------------\n",
"competition with ligand only slightly rescued protein degradation. EC50 of ligand and PROTAC is measured on SUDHL-1 cell line. DC50 is SU-DHL-1: 11 ± 2 nM, NCI-H2228: 59 ± 16 nM.\n",
"SU-DHL-1 and NCI-H2228\n",
"> 90 %\n",
"< 59 nM\n",
"--------------------\n",
"\n",
"\n",
"DC50 and Dmax are for MCF-7 cells\n",
"MCF-7, MM.1S\n",
"70.5 %\n",
"34 nM\n",
"--------------------\n",
"also degrades ibrutinib-resistant C481S BTK. DC50 is 6.3 nM (HBL1), 8.5 nM (Ramos), 9.2 nM (Mino), 11.4 nM (IgE MM)\n",
"HBL1, Ramos, Mino, IgE MM\n",
"94 %\n",
"< 11.4 nM\n",
"--------------------\n",
"Ligand name is from PMID 24068666. DCmax is 18-33%\n",
"Ramos, THP-1\n",
"< 33 %\n",
"nan\n",
"--------------------\n",
"Ligand name is from PMID 24068666. DCmax is 15-34%\n",
"Ramos, THP-1\n",
"< 34 %\n",
"nan\n",
"--------------------\n",
"Ligand name is from PMID 24068666. DCmax is 36-64%. DC50 is 1487 - 1994.5 nM.\n",
"Ramos, THP-1\n",
"< 64 %\n",
"< 1994.5 nM\n",
"--------------------\n",
"Ligand name is from PMID 24068666. DCmax is 68-85%. DC50 is 36.9 - 398.5 nM.\n",
"Ramos, THP-1\n",
"> 68 %\n",
"< 398.5 nM\n",
"--------------------\n",
"Ligand name is from PMID 24068666. DCmax is 63-84%. DC50 is 21.8 - 469.9 nM.\n",
"Ramos, THP-1\n",
"> 63 %\n",
"< 469.9 nM\n",
"--------------------\n",
"Ligand name is from PMID 24068666. DCmax is 75-87%. DC50 is 4.5 - 90.5 nM.\n",
"Ramos, THP-1\n",
"> 75 %\n",
"< 90.5 nM\n",
"--------------------\n",
"Ligand name is from PMID 24068666. DCmax is 71-85%. DC50 is 5.9 - 217.7 nM.\n",
"Ramos, THP-1\n",
"> 71 %\n",
"< 217.7 nM\n",
"--------------------\n",
"Ligand name is from PMID 24068666. DCmax is 80-87%. DC50 is 1.1 - 37.4 nM.\n",
"Ramos, THP-1\n",
"> 80 %\n",
"< 37.4 nM\n",
"--------------------\n",
"\n",
"\n",
"Ligand name is from PMID 24068666. DCmax is 70-85%. DC50 is 9.7 - 184.1 nM.\n",
"Ramos, THP-1\n",
"> 70 %\n",
"< 184.1 nM\n",
"--------------------\n",
"DC50 error ± 1.0 nM. DMAX error ± 1.1 %.\n",
"MDA-MB-231, HeLa\n",
"99.6 %\n",
"9.5 nM\n",
"--------------------\n",
"DC50 error ± 81.3 nM. DMAX error ± 10.1 %.\n",
"MDA-MB-231, HeLa\n",
"34.5 %\n",
"45.9 nM\n",
"--------------------\n",
"DC50 is for WT EGFR. DC50 for EGFR Exon 20 Ins is 736.2 nM. DMAX is for WT EGFR. DMAX for EGFR Exon 20 Ins is 68.8 %.\n",
"OVCAR8 (WT EGFR), HeLa (EGFR Exon 20 Ins), SKBr3 (HER2)\n",
"97.6 %\n",
"39.2 nM\n",
"--------------------\n",
"DC50 is for EGFR (Exon 19 del). DC50 for EGFR (L858R) 22.3 nM. DMAX is for EGFR (Exon 19 del). DMAX for EGFR (L858R) 96.6 %.\n",
"HCC827 (Exon 19 del), H3255 (L858R)\n",
"98.9 %\n",
"11.7 nM\n",
"--------------------\n",
"DC50 is 1.76 nM and 4.5 nM\n",
"Hella, RI-1, EOL-1, A-204\n",
"90 %\n",
"< 4.5 nM\n",
"--------------------\n",
"DC50 is 0.86nM in LNCaP, 0.76 in VCaP and 10.4 nM at 1uM in 22Rv1. EC50 in cells is 0.25nM for LNCaP, 0.34 nM for VCaP, 183nM for 22Rv1.\n",
"LNCaP, VCaP, and 22Rv\n",
"> 95 %\n",
"< 10.4 nM\n",
"--------------------\n",
"XD2-149 was initially designed to degrade STAT3. However, experiments showed that XD2-149 down-regulate STAT3 level in a proteasome-independent manner. Proteomics data revealed that an E3 ligase, ZFP91, was the true substrate for this PROTAC. This paper reported a total of 22 PROTACs, which differed from each other in linker design and E3 binder choices (pomalidomide/thalidomide/lenalidomide). However, the authors didn't mention whether the remaining 21 molecules could degrade ZFP91 or not. It is also interesting that pomalidomide itself can induce the degradation of CRBN neo-substrates like ZFP91 (DC50: 0.42uM, 5-fold less potent than XD2-149), since pomalidomide can remodel CRBN surface for binding proteins like ZFP91 (Nat Med., 2019, doi: 10.1038/s41591-019-0668-z).\n",
"Pancreatic cancer cell lines BxPC-3; MIA PaCa-2\n",
"> 90 %\n",
"~ 70 nM\n",
"--------------------\n",
"DC50 for KYSE520 cell: 6.0nM;DC50 for MV4;11 cell: 2.6nM; EC50 for KYSE520 cell: 0.66uM; EC50 for MV4;11 cell: 9.9nM;\n",
"KYSE520 esophageal cancer cell line; MV4;11 cell line\n",
"100 %\n",
"6.0 nM\n",
"--------------------\n",
"pEC50 for MV4;11 and HL60 cells: 6.75±0.03 and 5.84±0.06, respectively.\n",
"pDC50 for Brd4 short/Brd4 long/Brd3/Brd2: 7.0/7.0/6.5/6.2, respectively (24h, HeLa cells).\n",
"Dmax for Brd4 short/Brd4 long/Brd3/Brd2: 96%/97%/97%/93%, respectively (HeLa cells).\n",
"HeLa, HL60, MV4;11\n",
"> 93 %\n",
"< 0.1 uM\n",
"--------------------\n",
"\n",
"\n",
"pEC50 for MV4;11 and HL60 cells: 7.57±0.03 and 6.66±0.05, respectively. pDC50 for Brd4 short/Brd4 long/Brd3/Brd2: 8.1/8.6/7.0/7.4, respectively (24h, HeLa cells). Dmax for Brd4 short/Brd4 long/Brd3/Brd2: 98%/100%/100%/98%, respectively (HeLa cells).\n",
"HeLa, HL60, MV4;11\n",
"> 98 %\n",
"< 2.5 nM\n",
"--------------------\n",
"pEC50 for MV4;11 and HL60 cells: 6.91±0.04 and 5.90±0.05, respectively. pDC50 for Brd4 short/Brd4 long/Brd3/Brd2: 8.4/8.0/6.5/6.7, respectively (24h, HeLa cells). Dmax for Brd4 short/Brd4 long/Brd3/Brd2: 99%/100%/99%/97%, respectively (HeLa cells).\n",
"HeLa, HL60, MV4;11\n",
"> 97 %\n",
"< 4 nM\n",
"--------------------\n",
"pEC50 for MV4;11 and HL60 cells: 7.77±0.06 and 7.46±0.03, respectively. pDC50 for Brd4 short/Brd4 long/Brd3/Brd2: 9.2/9.0/9.1/8.2, respectively (24h, HeLa cells). Dmax for Brd4 short/Brd4 long/Brd3/Brd2: 97%/100%/98%/83%, respectively (HeLa cells).\n",
"HeLa, HL60, MV4;11\n",
"> 83 %\n",
"< 1 nM\n",
"--------------------\n",
"pEC50 for MV4;11 and HL60 cells: 6.24±0.05 and 6.17±0.03, respectively. pDC50 for Brd4 short/Brd4 long/Brd3/Brd2: 6.9/6.7/6.8/NA, respectively (24h, HeLa cells). Dmax for Brd4 short/Brd4 long/Brd3/Brd2: 94%/78%/74%/37%, respectively (HeLa cells).\n",
"HeLa, HL60, MV4;11\n",
"nan\n",
"~ 0.1 uM\n",
"--------------------\n",
"pEC50 for MV4;11 and HL60 cells: 7.31±0.03 and 6.57±0.02, respectively. pDC50 for Brd4 short/Brd4 long/Brd3/Brd2: 8.1/7.6/7.3/NA, respectively (24h, HeLa cells). Dmax for Brd4 short/Brd4 long/Brd3/Brd2: 98%/95%/91%/43%, respectively (HeLa cells).\n",
"HeLa, HL60, MV4;11\n",
"> 90 %\n",
"~ 7.9 nM\n",
"--------------------\n",
"pEC50 for MV4;11 and HL60 cells: 7.08±0.05 and 6.37±0.03, respectively. pDC50 for Brd4 short/Brd4 long/Brd3/Brd2: 8.1/7.5/7.7/NA, respectively (24h, HeLa cells). Dmax for Brd4 short/Brd4 long/Brd3/Brd2: 95%/93%/92%/26%, respectively (HeLa cells).\n",
"HeLa, HL60, MV4;11\n",
"> 90 %\n",
"~ 7.9 nM\n",
"--------------------\n",
"Reported DC50 and Dmax above are in HeLa cells. DC50 for HEK293 cells: 230nM; Dmax for HEK293 cells: 98%. 14a can degrade VHL at higher concentration. See Target UniprotID P40337 for details.\n",
"HeLa, HEK293\n",
"88 %\n",
"200 nM\n",
"--------------------\n",
"EC50/DC50/Dmax reported above were obtained using NCI-H2030 cells. DC50: 0.25~0.76uM; Dmax: ~75%-90%, specific value depends on the cell line.\n",
"5W1573; NCI-H2030; NCI-H358; HCT116; MIA-PaCa-2\n",
"~ 80 %\n",
"0.59 uM\n",
"--------------------\n",
"38\n"
]
}
],
"source": [
"# Print every multi-cell-line entry whose free-text comment mentions DC50 or\n",
"# Dmax, so the reported values can be checked by hand.\n",
"cnt = 0\n",
"for i, row in multiple_cells_df.dropna(subset=['Comments', 'Cell Type', 'Dmax (%)', 'DC50 (nM)'], how='all').iterrows():\n",
"    if pd.isnull(row['Comments']):\n",
"        continue\n",
"    # The comment is upper-cased before matching, so the needles must be\n",
"    # upper-case too: a lower-case 'max' can never occur in an upper-cased string.\n",
"    comment_upper = row['Comments'].upper()\n",
"    if 'DC' in comment_upper or 'MAX' in comment_upper:\n",
"        cnt += 1\n",
"        print(row['Comments'])\n",
"        print(row['Cell Type'])\n",
"        print(row['Dmax (%)'])\n",
"        print(row['DC50 (nM)'])\n",
"        print('-' * 20)\n",
"        # Visual break after every 10 printed entries\n",
"        if cnt % 10 == 0:\n",
"            print('\\n')\n",
"print(cnt)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
Cell Type
\n",
"
Comments
\n",
"
Dmax (%)
\n",
"
DC50 (nM)
\n",
"
\n",
" \n",
" \n",
"
\n",
"
105
\n",
"
RS4;11, MOLM-13
\n",
"
Direct degration studies in this paper only co...
\n",
"
100 %
\n",
"
< 10 nM
\n",
"
\n",
"
\n",
"
106
\n",
"
RS4;11, MOLM-13
\n",
"
Complex structure with the ligand was modeled ...
\n",
"
100 %
\n",
"
< 1 nM
\n",
"
\n",
"
\n",
"
107
\n",
"
RS4;11, MOLM-13
\n",
"
Complex structure with the ligand was modeled ...
\n",
"
100 %
\n",
"
< 1 nM
\n",
"
\n",
"
\n",
"
108
\n",
"
RS4;11, MOLM-13
\n",
"
Complex structure with the ligand was modeled ...
\n",
"
NaN
\n",
"
~ 3 nM
\n",
"
\n",
"
\n",
"
109
\n",
"
RS4;11, MOLM-13
\n",
"
Complex structure with the ligand was modeled ...
\n",
"
NaN
\n",
"
~ 10 nM
\n",
"
\n",
"
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
\n",
"
\n",
"
1182
\n",
"
A549; H1299; B16-F10; MDAMB231; Jurkat
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
\n",
"
\n",
"
1185
\n",
"
A549; H1299; B16-F10; MDAMB231; Jurkat
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
\n",
"
\n",
"
1194
\n",
"
A549; H1299; B16-F10; MDAMB231; Jurkat
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
\n",
"
\n",
"
1195
\n",
"
A549; H1299; B16-F10; MDAMB231; Jurkat
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
\n",
"
\n",
"
1198
\n",
"
A549; H1299; B16-F10; MDAMB231; Jurkat
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
\n",
" \n",
"
\n",
"
278 rows × 4 columns
\n",
"
"
],
"text/plain": [
" Cell Type \\\n",
"105 RS4;11, MOLM-13 \n",
"106 RS4;11, MOLM-13 \n",
"107 RS4;11, MOLM-13 \n",
"108 RS4;11, MOLM-13 \n",
"109 RS4;11, MOLM-13 \n",
"... ... \n",
"1182 A549; H1299; B16-F10; MDAMB231; Jurkat \n",
"1185 A549; H1299; B16-F10; MDAMB231; Jurkat \n",
"1194 A549; H1299; B16-F10; MDAMB231; Jurkat \n",
"1195 A549; H1299; B16-F10; MDAMB231; Jurkat \n",
"1198 A549; H1299; B16-F10; MDAMB231; Jurkat \n",
"\n",
" Comments Dmax (%) DC50 (nM) \n",
"105 Direct degration studies in this paper only co... 100 % < 10 nM \n",
"106 Complex structure with the ligand was modeled ... 100 % < 1 nM \n",
"107 Complex structure with the ligand was modeled ... 100 % < 1 nM \n",
"108 Complex structure with the ligand was modeled ... NaN ~ 3 nM \n",
"109 Complex structure with the ligand was modeled ... NaN ~ 10 nM \n",
"... ... ... ... \n",
"1182 NaN NaN NaN \n",
"1185 NaN NaN NaN \n",
"1194 NaN NaN NaN \n",
"1195 NaN NaN NaN \n",
"1198 NaN NaN NaN \n",
"\n",
"[278 rows x 4 columns]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Side-by-side view of the cell-type, comment and degradation columns.\n",
"multiple_cells_df.loc[:, ['Cell Type', 'Comments', 'Dmax (%)', 'DC50 (nM)']]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def convert_mole_to_nM(s: str) -> float:\n",
"    \"\"\" Convert a concentration value to a float expressed in nM.\n",
"\n",
"    The value is first passed through `clean_string` (defined elsewhere in this\n",
"    notebook; presumably it strips qualifiers such as <, >, = and ranges like\n",
"    100-200 -- verify against its definition). The unit suffix then selects the\n",
"    scaling factor. A value without a recognised unit is taken to be in mol/L.\n",
"\n",
"    Args:\n",
"        s(str): raw concentration, e.g. '48 nM', '0.79 uM', '203 pM' or '1e-8'\n",
"    Returns:\n",
"        float: concentration in nM, or NaN if the value is missing\n",
"    Raises:\n",
"        ValueError: if the numeric part cannot be parsed as a float\n",
"    \"\"\"\n",
"    s = clean_string(str(s))\n",
"    if pd.isnull(s) or s in {'nan', 'n/a', 'NaN', ''}:\n",
"        return np.nan\n",
"    # Unit suffix -> factor to nM. 'nM' and 'uM' are tested first, in that order.\n",
"    unit_factors = (('nM', 1.0), ('uM', 1e3), ('pM', 1e-3), ('mM', 1e6))\n",
"    for unit, factor in unit_factors:\n",
"        if unit in s:\n",
"            return float(s.replace(unit, '').strip()) * factor\n",
"    # No recognised unit: interpret the number as mol/L\n",
"    return float(s) * 1e9\n",
"\n",
"\n",
"# Convert only while the column still holds the raw values. Re-running this\n",
"# cell on the already converted floats would send them through the unit-less\n",
"# branch above and scale them by 1e9 a second time.\n",
"if not pd.api.types.is_numeric_dtype(protac_pedia_df['DC50 (nM)']):\n",
"    protac_pedia_df['DC50 (nM)'] = protac_pedia_df['DC50 (nM)'].apply(\n",
"        convert_mole_to_nM)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Run each raw Dmax entry through clean_string (defined elsewhere in this\n",
"# notebook) and store the result as a float.\n",
"protac_pedia_df['Dmax (%)'] = protac_pedia_df['Dmax (%)'].apply(\n",
"    lambda raw_dmax: float(clean_string(raw_dmax))\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Non-Nan Active: 297\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"
"
],
"text/plain": [
" Assay (Ternary complex, IC50) \\\n",
"539 Inhibition of CRBN in the presence of BRD4 BD1... \n",
"1185 TR-FRET \n",
"1186 TR-FRET/AlphaLISA \n",
"1187 TR-FRET/AlphaLISA \n",
"1188 TR-FRET \n",
"1189 TR-FRET/AlphaLISA \n",
"1190 TR-FRET/AlphaLISA \n",
"1191 TR-FRET/AlphaLISA \n",
"1192 TR-FRET/AlphaLISA \n",
"1193 TR-FRET \n",
"1508 FP \n",
"1526 FP \n",
"1528 FP \n",
"2300 Titrations of PROTAC over an immobilised VCB s... \n",
"2858 FP \n",
"2859 FP \n",
"2860 FP \n",
"4883 IC50 between VHL and the complex of protac and... \n",
"4884 IC50 between VHL and the complex of protac and... \n",
"4885 IC50 between VHL and the complex of protac and... \n",
"4886 IC50 between VHL and the complex of protac and... \n",
"4887 IC50 between VHL and the complex of protac and... \n",
"4888 IC50 between VHL and the complex of protac and... \n",
"\n",
" IC50 (nM, Ternary complex) \n",
"539 1800/4100 \n",
"1185 220 \n",
"1186 205/120 \n",
"1187 230/330 \n",
"1188 28 \n",
"1189 45/120 \n",
"1190 76/240 \n",
"1191 26/730 \n",
"1192 51/720 \n",
"1193 42 \n",
"1508 3750 \n",
"1526 2570 \n",
"1528 2170 \n",
"2300 290 \n",
"2858 3040 \n",
"2859 2980 \n",
"2860 2460 \n",
"4883 5.01 \n",
"4884 2 \n",
"4885 7.94 \n",
"4886 6.31 \n",
"4887 10 \n",
"4888 1.26 "
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# A notebook cell only renders its last expression, so the first table has to\n",
"# be displayed explicitly; otherwise its result is computed and thrown away.\n",
"display(protac_df[[\"Assay (Cellular activities, EC50)\",\n",
"                   \"EC50 (nM, Cellular activities)\"]].dropna(how='all').drop_duplicates())\n",
"\n",
"protac_df[[\"Assay (Ternary complex, IC50)\", \"IC50 (nM, Ternary complex)\"]].dropna(\n",
"    how='all').drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Non-assay columns: ['E3 Ligase', 'Heavy Atom Count', 'Hydrogen Bond Acceptor Count', 'Compound ID', 'Molecular Formula', 'Rotatable Bond Count', 'Molecular Weight', 'InChI Key', 'Target', 'XLogP3', 'Smiles', 'Article DOI', 'Ring Count', 'Name', 'Hydrogen Bond Donor Count', 'Uniprot', 'PDB', 'InChI', 'Topological Polar Surface Area', 'Exact Mass']\n",
"Assay columns: ['DC50 (nM)', 'Dmax (%)', 'Assay (DC50/Dmax)', 'Percent degradation (%)', 'Assay (Percent degradation)', 'IC50 (nM, Protac to Target)', 'Assay (Protac to Target, IC50)', 'EC50 (nM, Protac to Target)', 'Assay (Protac to Target, EC50)', 'Kd (nM, Protac to Target)', 'Assay (Protac to Target, Kd)', 'Ki (nM, Protac to Target)', 'Assay (Protac to Target, Ki)', 'delta G (kcal/mol, Protac to Target)', 'delta H (kcal/mol, Protac to Target)', '-T*delta S (kcal/mol, Protac to Target)', 'Assay (Protac to Target, G/H/-TS)', 'kon (1/Ms, Protac to Target)', 'koff (1/s, Protac to Target)', 't1/2 (s, Protac to Target)', 'Assay (Protac to Target, kon/koff/t1/2)', 'IC50 (nM, Protac to E3)', 'Assay (Protac to E3, IC50)', 'EC50 (nM, Protac to E3)', 'Assay (Protac to E3, EC50)', 'Kd (nM, Protac to E3)', 'Assay (Protac to E3, Kd)', 'Ki (nM, Protac to E3)', 'Assay (Protac to E3, Ki)', 'delta G (kcal/mol, Protac to E3)', 'delta H (kcal/mol, Protac to E3)', '-T*delta S (kcal/mol, Protac to E3)', 'Assay (Protac to E3, G/H/-TS)', 'kon (1/Ms, Protac to E3)', 'koff (1/s, Protac to E3)', 't1/2 (s, Protac to E3)', 'Assay (Protac to E3, kon/koff/t1/2)', 'IC50 (nM, Ternary complex)', 'Assay (Ternary complex, IC50)', 'EC50 (nM, Ternary complex)', 'Assay (Ternary complex, EC50)', 'Kd (nM, Ternary complex)', 'Assay (Ternary complex, Kd)', 'Ki (nM, Ternary complex)', 'Assay (Ternary complex, Ki)', 'delta G (kcal/mol, Ternary complex)', 'delta H (kcal/mol, Ternary complex)', '-T*delta S (kcal/mol, Ternary complex)', 'Assay (Ternary complex, G/H/-TS)', 'kon (1/Ms, Ternary complex)', 'koff (1/s, Ternary complex)', 't1/2 (s, Ternary complex)', 'Assay (Ternary complex, kon/koff/t1/2)', 'IC50 (nM, Cellular activities)', 'Assay (Cellular activities, IC50)', 'EC50 (nM, Cellular activities)', 'Assay (Cellular activities, EC50)', 'GI50 (nM, Cellular activities)', 'Assay (Cellular activities, GI50)', 'ED50 (nM, Cellular activities)', 'Assay (Cellular activities, ED50)', 'GR50 (nM, Cellular 
activities)', 'Assay (Cellular activities, GR50)', 'PAMPA Papp (nm/s, Permeability)', 'Assay (Permeability, PAMPA Papp)', 'Caco-2 A2B Papp (nm/s, Permeability)', 'Assay (Permeability, Caco-2 A2B Papp)', 'Caco-2 B2A Papp (nm/s, Permeability)', 'Assay (Permeability, Caco-2 B2A Papp)']\n"
]
}
],
"source": [
"assay_to_parameters = {\n",
" \"Assay (DC50/Dmax)\": [\"DC50 (nM)\", \"Dmax (%)\"],\n",
" \"Assay (Percent degradation)\": [\"Percent degradation (%)\"],\n",
" \"Assay (Protac to Target, IC50)\": [\"IC50 (nM, Protac to Target)\"],\n",
" \"Assay (Protac to Target, EC50)\": [\"EC50 (nM, Protac to Target)\"],\n",
" \"Assay (Protac to Target, Kd)\": [\"Kd (nM, Protac to Target)\"],\n",
" \"Assay (Protac to Target, Ki)\": [\"Ki (nM, Protac to Target)\"],\n",
" \"Assay (Protac to Target, G/H/-TS)\": [\"delta G (kcal/mol, Protac to Target)\", \"delta H (kcal/mol, Protac to Target)\", \"-T*delta S (kcal/mol, Protac to Target)\"],\n",
" \"Assay (Protac to Target, kon/koff/t1/2)\": [\"kon (1/Ms, Protac to Target)\", \"koff (1/s, Protac to Target)\", \"t1/2 (s, Protac to Target)\"],\n",
" \"Assay (Protac to E3, IC50)\": [\"IC50 (nM, Protac to E3)\"],\n",
" \"Assay (Protac to E3, EC50)\": [\"EC50 (nM, Protac to E3)\"],\n",
" \"Assay (Protac to E3, Kd)\": [\"Kd (nM, Protac to E3)\"],\n",
" \"Assay (Protac to E3, Ki)\": [\"Ki (nM, Protac to E3)\"],\n",
" \"Assay (Protac to E3, G/H/-TS)\": [\"delta G (kcal/mol, Protac to E3)\", \"delta H (kcal/mol, Protac to E3)\", \"-T*delta S (kcal/mol, Protac to E3)\"],\n",
" \"Assay (Protac to E3, kon/koff/t1/2)\": [\"kon (1/Ms, Protac to E3)\", \"koff (1/s, Protac to E3)\", \"t1/2 (s, Protac to E3)\"],\n",
" \"Assay (Ternary complex, IC50)\": [\"IC50 (nM, Ternary complex)\"],\n",
" \"Assay (Ternary complex, EC50)\": [\"EC50 (nM, Ternary complex)\"],\n",
" \"Assay (Ternary complex, Kd)\": [\"Kd (nM, Ternary complex)\"],\n",
" \"Assay (Ternary complex, Ki)\": [\"Ki (nM, Ternary complex)\"],\n",
" \"Assay (Ternary complex, G/H/-TS)\": [\"delta G (kcal/mol, Ternary complex)\", \"delta H (kcal/mol, Ternary complex)\", \"-T*delta S (kcal/mol, Ternary complex)\"],\n",
" \"Assay (Ternary complex, kon/koff/t1/2)\": [\"kon (1/Ms, Ternary complex)\", \"koff (1/s, Ternary complex)\", \"t1/2 (s, Ternary complex)\"],\n",
" \"Assay (Cellular activities, IC50)\": [\"IC50 (nM, Cellular activities)\"],\n",
" \"Assay (Cellular activities, EC50)\": [\"EC50 (nM, Cellular activities)\"],\n",
" \"Assay (Cellular activities, GI50)\": [\"GI50 (nM, Cellular activities)\"],\n",
" \"Assay (Cellular activities, ED50)\": [\"ED50 (nM, Cellular activities)\"],\n",
" \"Assay (Cellular activities, GR50)\": [\"GR50 (nM, Cellular activities)\"],\n",
" \"Assay (Permeability, PAMPA Papp)\": [\"PAMPA Papp (nm/s, Permeability)\"],\n",
" \"Assay (Permeability, Caco-2 A2B Papp)\": [\"Caco-2 A2B Papp (nm/s, Permeability)\"],\n",
" \"Assay (Permeability, Caco-2 B2A Papp)\": [\"Caco-2 B2A Papp (nm/s, Permeability)\"]\n",
"}\n",
"# Flatten the mapping into one list: for every assay, its parameter columns\n",
"# followed by the assay-description column itself.\n",
"assay_cols = []\n",
"for assay_col, param_cols in assay_to_parameters.items():\n",
"    assay_cols += param_cols + [assay_col]\n",
"# Keep the DataFrame's own column order. A set difference yields an arbitrary\n",
"# order that can change between kernel restarts (string hashing is randomised).\n",
"# dict.fromkeys de-duplicates while preserving first-seen order.\n",
"non_assay_cols = [col for col in dict.fromkeys(protac_df.columns)\n",
"                  if col not in assay_cols]\n",
"print(f'Non-assay columns: {non_assay_cols}')\n",
"print(f'Assay columns: {assay_cols}')"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
Assay (DC50/Dmax)
\n",
"
DC50 (nM)
\n",
"
Dmax (%)
\n",
"
\n",
" \n",
" \n",
"
\n",
"
19
\n",
"
Degradation of BRD9 in HeLa cells after 4 h tr...
\n",
"
560
\n",
"
80
\n",
"
\n",
"
\n",
"
40
\n",
"
Degradation of BRD9 in RI-1 cells after 8 h tr...
\n",
"
1.76
\n",
"
95
\n",
"
\n",
"
\n",
"
41
\n",
"
Degradation of HiBiT-BRD9 in HEK293 cells afte...
\n",
"
4
\n",
"
NaN
\n",
"
\n",
"
\n",
"
42
\n",
"
Degradation of BRD9 in EOL-1/A-204 cells after...
\n",
"
2/8
\n",
"
NaN
\n",
"
\n",
"
\n",
"
43
\n",
"
Degradation of BRD7 in RI-1 cells after 8 h tr...
\n",
"
4.5
\n",
"
95
\n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Assay (DC50/Dmax) DC50 (nM) Dmax (%)\n",
"19 Degradation of BRD9 in HeLa cells after 4 h tr... 560 80\n",
"40 Degradation of BRD9 in RI-1 cells after 8 h tr... 1.76 95\n",
"41 Degradation of HiBiT-BRD9 in HEK293 cells afte... 4 NaN\n",
"42 Degradation of BRD9 in EOL-1/A-204 cells after... 2/8 NaN\n",
"43 Degradation of BRD7 in RI-1 cells after 8 h tr... 4.5 95"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of (unique) rows: 854\n",
"--------------------------------------------------------------------------------\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
Assay (Percent degradation)
\n",
"
Percent degradation (%)
\n",
"
\n",
" \n",
" \n",
"
\n",
"
129
\n",
"
% AR degradation in LNCaP cells after 6 h trea...
\n",
"
26/35/28
\n",
"
\n",
"
\n",
"
130
\n",
"
% AR degradation in LNCaP cells after 6 h trea...
\n",
"
15/23/23
\n",
"
\n",
"
\n",
"
131
\n",
"
% AR degradation in LNCaP cells after 6 h trea...
\n",
"
16/20/25
\n",
"
\n",
"
\n",
"
132
\n",
"
% AR degradation in LNCaP cells after 6 h trea...
\n",
"
11/25/29
\n",
"
\n",
"
\n",
"
133
\n",
"
% AR degradation in LNCaP cells after 6 h trea...
\n",
"
54/84/64
\n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Assay (Percent degradation) Percent degradation (%)\n",
"129 % AR degradation in LNCaP cells after 6 h trea... 26/35/28\n",
"130 % AR degradation in LNCaP cells after 6 h trea... 15/23/23\n",
"131 % AR degradation in LNCaP cells after 6 h trea... 16/20/25\n",
"132 % AR degradation in LNCaP cells after 6 h trea... 11/25/29\n",
"133 % AR degradation in LNCaP cells after 6 h trea... 54/84/64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of (unique) rows: 298\n",
"--------------------------------------------------------------------------------\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
Assay (Protac to Target, IC50)
\n",
"
IC50 (nM, Protac to Target)
\n",
"
\n",
" \n",
" \n",
"
\n",
"
234
\n",
"
Inhibition of SIRT2 in a fluorescence-based de...
\n",
"
250
\n",
"
\n",
"
\n",
"
307
\n",
"
Inhibition of CDK6/Cyciln D3 by cell-free kina...
\n",
"
6.61
\n",
"
\n",
"
\n",
"
311
\n",
"
Inhibition of ERRalpha by TR-FRET Coactivator ...
\n",
"
7.33
\n",
"
\n",
"
\n",
"
312
\n",
"
Inhibition of ERRalpha by TR-FRET Coactivator ...
\n",
"
6.33
\n",
"
\n",
"
\n",
"
313
\n",
"
Inhibition of ERRalpha by TR-FRET Coactivator ...
\n",
"
12.67
\n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Assay (Protac to Target, IC50) \\\n",
"234 Inhibition of SIRT2 in a fluorescence-based de... \n",
"307 Inhibition of CDK6/Cyciln D3 by cell-free kina... \n",
"311 Inhibition of ERRalpha by TR-FRET Coactivator ... \n",
"312 Inhibition of ERRalpha by TR-FRET Coactivator ... \n",
"313 Inhibition of ERRalpha by TR-FRET Coactivator ... \n",
"\n",
" IC50 (nM, Protac to Target) \n",
"234 250 \n",
"307 6.61 \n",
"311 7.33 \n",
"312 6.33 \n",
"313 12.67 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of (unique) rows: 728\n",
"--------------------------------------------------------------------------------\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
Assay (Protac to Target, EC50)
\n",
"
EC50 (nM, Protac to Target)
\n",
"
\n",
" \n",
" \n",
"
\n",
"
545
\n",
"
EC50 of BRD4 BD1/2 was tested by TR-FRET
\n",
"
95/298
\n",
"
\n",
"
\n",
"
5282
\n",
"
EC50 was tested by TR-FRET coactivator assays
\n",
"
31
\n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Assay (Protac to Target, EC50) \\\n",
"545 EC50 of BRD4 BD1/2 was tested by TR-FRET \n",
"5282 EC50 was tested by TR-FRET coactivator assays \n",
"\n",
" EC50 (nM, Protac to Target) \n",
"545 95/298 \n",
"5282 31 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of (unique) rows: 2\n",
"--------------------------------------------------------------------------------\n"
]
},
{
"data": {
"text/html": [
"
"
],
"text/plain": [
" Assay (Permeability, PAMPA Papp) \\\n",
"18 Compounds were administered at 10000 nM and in... \n",
"30 Compounds were administered at 10000 nM and in... \n",
"395 Compounds were administered at 10000 nM and in... \n",
"482 PAMPA assay \n",
"513 PAMPA assay \n",
"\n",
" PAMPA Papp (nm/s, Permeability) \n",
"18 0.01 \n",
"30 0.02 \n",
"395 <1 \n",
"482 0.2 \n",
"513 0.01 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of (unique) rows: 10\n",
"--------------------------------------------------------------------------------\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
Assay (Permeability, Caco-2 A2B Papp)
\n",
"
Caco-2 A2B Papp (nm/s, Permeability)
\n",
"
\n",
" \n",
" \n",
"
\n",
"
395
\n",
"
Compounds were administered at 10000 nM and in...
\n",
"
0.35
\n",
"
\n",
"
\n",
"
397
\n",
"
Compounds were administered at 5000 nM and inc...
\n",
"
0.081
\n",
"
\n",
"
\n",
"
506
\n",
"
Compounds were administered at 10000 nM and in...
\n",
"
11.6
\n",
"
\n",
"
\n",
"
711
\n",
"
Compounds were administered at 10000 nM and in...
\n",
"
0.02
\n",
"
\n",
"
\n",
"
2188
\n",
"
Compounds were administered at 10000 nM and in...
\n",
"
1.7
\n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Assay (Permeability, Caco-2 A2B Papp) \\\n",
"395 Compounds were administered at 10000 nM and in... \n",
"397 Compounds were administered at 5000 nM and inc... \n",
"506 Compounds were administered at 10000 nM and in... \n",
"711 Compounds were administered at 10000 nM and in... \n",
"2188 Compounds were administered at 10000 nM and in... \n",
"\n",
" Caco-2 A2B Papp (nm/s, Permeability) \n",
"395 0.35 \n",
"397 0.081 \n",
"506 11.6 \n",
"711 0.02 \n",
"2188 1.7 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of (unique) rows: 25\n",
"--------------------------------------------------------------------------------\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
Assay (Permeability, Caco-2 B2A Papp)
\n",
"
Caco-2 B2A Papp (nm/s, Permeability)
\n",
"
\n",
" \n",
" \n",
"
\n",
"
395
\n",
"
Compounds were administered at 10000 nM and in...
\n",
"
0.24
\n",
"
\n",
"
\n",
"
711
\n",
"
Compounds were administered at 10000 nM and in...
\n",
"
0.07
\n",
"
\n",
"
\n",
"
2188
\n",
"
Compounds were administered at 10000 nM and in...
\n",
"
14.1
\n",
"
\n",
"
\n",
"
2190
\n",
"
Compounds were administered at 10000 nM and in...
\n",
"
1.4
\n",
"
\n",
"
\n",
"
2191
\n",
"
Compounds were administered at 10000 nM and in...
\n",
"
<0.79
\n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Assay (Permeability, Caco-2 B2A Papp) \\\n",
"395 Compounds were administered at 10000 nM and in... \n",
"711 Compounds were administered at 10000 nM and in... \n",
"2188 Compounds were administered at 10000 nM and in... \n",
"2190 Compounds were administered at 10000 nM and in... \n",
"2191 Compounds were administered at 10000 nM and in... \n",
"\n",
" Caco-2 B2A Papp (nm/s, Permeability) \n",
"395 0.24 \n",
"711 0.07 \n",
"2188 14.1 \n",
"2190 1.4 \n",
"2191 <0.79 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of (unique) rows: 12\n",
"--------------------------------------------------------------------------------\n"
]
}
],
"source": [
"# For every assay, show the first distinct (description, measurement) rows and\n",
"# report how many distinct rows there are in total.\n",
"for assay, cols in assay_to_parameters.items():\n",
"    tmp = protac_df[[assay, *cols]]\n",
"    tmp = tmp.dropna(how='all')\n",
"    tmp = tmp.drop_duplicates()\n",
"    display(tmp.head())\n",
"    print(f'Number of (unique) rows: {len(tmp)}')\n",
"    print('-' * 80)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### DC50 and Dmax"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Degradation of ENL in MV4;11 cells after 24 h treatment\n",
"['ENL'] | ['MV4;11'] | [24.0]\n",
"--------------------------------------------------------------------------------\n",
"Degradation of BRD4 in HEK293T cells after 8 h treatment\n",
"['BRD4'] | ['HEK293T'] | [8.0]\n",
"--------------------------------------------------------------------------------\n",
"Degradation of BRD4 in 231MFP/HAP1 cells after 8 h treatment\n",
"['BRD4'] | ['231MFP', 'HAP1'] | [8.0]\n",
"--------------------------------------------------------------------------------\n",
"Degradation of ER in ER-positive breast cancer cell lines\n",
"['ER'] | ['ER-positive breast cancer cell lines'] | nan\n",
"--------------------------------------------------------------------------------\n",
"Degradation of AR in LNCaP cells\n",
"['AR'] | ['LNCaP'] | nan\n",
"--------------------------------------------------------------------------------\n",
"Degradation of AR in LNCaP/VCaP AR+ cells after 6 h treatment\n",
"['AR'] | ['LNCaP', 'VCaP AR+'] | [6.0]\n",
"--------------------------------------------------------------------------------\n",
"Degradation of BRD4 BD1/2 assessed by EGFP/mCherry reporter assay\n",
"['BRD4 BD1', 'BRD4 BD2'] | nan | nan\n",
"--------------------------------------------------------------------------------\n",
"Degradation of BRD4 BD1 assessed by EGFP/mCherry reporter assay\n",
"['BRD4 BD1'] | nan | nan\n",
"--------------------------------------------------------------------------------\n",
"Degradation of PARP1 in Primary Cardiomyocytes after 24 h treatment\n",
"['PARP1'] | ['Primary Cardiomyocytes'] | [24.0]\n",
"--------------------------------------------------------------------------------\n",
"Degradation of HDAC6 in MM1S after 6 h treatment by in-cell ELISA analysis\n",
"['HDAC6'] | ['MM.1S'] | [6.0]\n",
"--------------------------------------------------------------------------------\n",
"Degradation of total tau/P-tau in A152T neurons after 24 h treatment\n",
"['tau/P-tau'] | ['A152T neurons'] | [24.0]\n",
"--------------------------------------------------------------------------------\n",
"Degradation of Rpn13 in MM.1S after 16 h treatment\n",
"['Rpn13'] | ['MM.1S'] | [16.0]\n",
"--------------------------------------------------------------------------------\n",
"Degradation of HDAC6 in MM1S after 6 h treatment by in-cell ELISA analysis\n",
"['HDAC6'] | ['MM.1S'] | [6.0]\n",
"--------------------------------------------------------------------------------\n"
]
}
],
"source": [
"def extract_dc50_info(sentence):\n",
"    \"\"\" Parse a DC50/Dmax assay description into targets, cell types and treatment time.\n",
"\n",
"    Example: 'Degradation of BRD4 in 231MFP/HAP1 cells after 8 h treatment'\n",
"    yields targets ['BRD4'], cells ['231MFP', 'HAP1'] and hours [8.0].\n",
"\n",
"    Args:\n",
"        sentence(str): assay description, expected to contain 'Degradation of <target>'\n",
"    Returns:\n",
"        dict: with the keys\n",
"            'Target (Parsed)': list of target names,\n",
"            'Cell Type': list of cell names, or NaN if none was found,\n",
"            'Treatment Time (h)': result of `split_clean_str` (defined elsewhere in\n",
"                this notebook) on the matched hours, or NaN if none was found\n",
"    Raises:\n",
"        AttributeError: if no target can be located after 'Degradation of'\n",
"    \"\"\"\n",
"    # Regex patterns for proteins/genes, cell types, and treatment hours\n",
"    protein_regex = r\"Degradation of total\\s(.+?)\\s(in|after|using|proteins)\"\n",
"    cell_regex = r\"in\\s([A-Za-z0-9-/.;\\(\\)\\s\\+]+)\\scells\"\n",
"    treatment_regex = r\"after\\s(\\d+/?\\d*?/?\\d*?\\s?h)\"\n",
"\n",
"    # Extracting protein information\n",
"    if 'total' in sentence.lower():\n",
"        protein_match = re.search(protein_regex, sentence)\n",
"        proteins = protein_match.group(1).split(' and ') if protein_match else [\n",
"            re.search(r\"Degradation of\\s([A-Za-z0-9-]+)\", sentence).group(1)]\n",
"    elif ' in ' in sentence.lower():\n",
"        # Everything between 'Degradation of ' and the first ' in ' names the\n",
"        # target(s); several targets are separated by '/'.\n",
"        proteins = sentence.split(' in ')[0].split('Degradation of ')[-1]\n",
"        proteins = proteins.split('/')\n",
"    else:\n",
"        # protein_regex needs the literal word 'total', which is absent in this\n",
"        # branch, so only the first-token fallback can apply here.\n",
"        proteins = [\n",
"            re.search(r\"Degradation of\\s([A-Za-z0-9-\\/]+)\", sentence).group(1)]\n",
"    # Handle special cases...\n",
"    if 'BRD4 short/long' in sentence:\n",
"        proteins = ['BRD4 short', 'BRD4 long']\n",
"    if 'BRD4 BD1/2' in sentence:\n",
"        proteins = ['BRD4 BD1', 'BRD4 BD2']\n",
"    elif 'BRD4 BD1' in sentence:\n",
"        proteins = ['BRD4 BD1']\n",
"    if 'EGFR L858R/T790M' in sentence:\n",
"        proteins = ['EGFR L858R/T790M']\n",
"    if 'EGFR del19/T790M/C797S' in sentence:\n",
"        proteins = ['EGFR del19/T790M/C797S']\n",
"\n",
"    # Extracting cell types\n",
"    cell_match = re.search(cell_regex, sentence)\n",
"    cells = cell_match.group(1).split('/') if cell_match else np.nan\n",
"    # Handle special cases...\n",
"    # Only touch the Ba/F3 case when cells were actually parsed: iterating over\n",
"    # the NaN placeholder would raise a TypeError.\n",
"    if 'Ba/F3' in sentence and isinstance(cells, list):\n",
"        # Splitting on '/' tore 'Ba/F3' into two fragments. Map both back to\n",
"        # 'Ba/F3', then drop one so the name appears once and the order of the\n",
"        # other cells is preserved.\n",
"        cells = ['Ba/F3' if 'Ba' in c or 'F3' in c else c for c in cells]\n",
"        if cells.count('Ba/F3') > 1:\n",
"            cells.remove('Ba/F3')\n",
"    if 'ER-positive breast cancer cell lines' in sentence:\n",
"        cells = ['ER-positive breast cancer cell lines']\n",
"    if 'LNCaP (AR T878A)' in sentence:\n",
"        cells = ['LNCaP']\n",
"    if 'in A152T neurons' in sentence:\n",
"        cells = ['A152T neurons']\n",
"    if 'of Rpn13 in MM.1S after' in sentence:\n",
"        cells = ['MM.1S']\n",
"    if 'Primary Cardiomyocytes' in sentence:\n",
"        cells = ['Primary Cardiomyocytes']\n",
"    if ' HDAC6 in MM1S after' in sentence:\n",
"        cells = ['MM.1S']\n",
"\n",
"    # Extracting treatment hours\n",
"    treatment_hours_match = re.search(treatment_regex, sentence)\n",
"    if treatment_hours_match:\n",
"        # Drop the trailing 'h' before handing the number(s) to split_clean_str\n",
"        treatment_hours = treatment_hours_match.group(1).strip('h')\n",
"        treatment_hours = split_clean_str(treatment_hours, return_floats=True)\n",
"    else:\n",
"        treatment_hours = np.nan\n",
"\n",
"    return {\n",
"        'Target (Parsed)': proteins,\n",
"        'Cell Type': cells,\n",
"        'Treatment Time (h)': treatment_hours,\n",
"    }\n",
"\n",
"\n",
"# Hand-picked assay descriptions that exercise the parser's special cases.\n",
"# The commented-out entries are further sentences kept for manual checks.\n",
"corner_cases = [\n",
"    # 'Degradation of BRD4',\n",
"    # 'Degradation of BRD4 short/long in HeLa cells after 24 h treatment',\n",
"    # 'Degradation of BRD4 BD1 assessed by EGFP/mCherry reporter assay',\n",
"    # 'Degradation of BRD4 BD1/2 assessed by EGFP/mCherry reporter assay',\n",
"    # 'Degradation of WT/Exon 20 Ins EGFR in OVCAR8/HeLa cells after 24 h treatment',\n",
"    # 'Degradation of TPM3-TRKA/TRKA in KM12/HEL cells after 6 h treatment',\n",
"    # 'Degradation of Exon 19 del/L858R EGFR in HCC827/H3255 cells after 24 h treatment',\n",
"    # 'Degradation of NPM-ALK/EML4-ALK in SU-DHL-1/NCI-H2228 cells after 16 h treatment',\n",
"    # 'Degradation of BCR-ABL T315I in Ba/F3 cells after 24 h treatment',\n",
"    # 'Degradation of BCR-ABL T315I in MOL/(Ba/F3)/R4;11 cells after 24 h treatment',\n",
"    # 'Degradation of ALK in H3122/Karpas 299/Kelly cells 16 h treatment',\n",
"    'Degradation of AR in LNCaP/VCaP AR+ cells after 6 h treatment',\n",
"    'Degradation of BRD4 BD1/2 assessed by EGFP/mCherry reporter assay',\n",
"    'Degradation of BRD4 BD1 assessed by EGFP/mCherry reporter assay',\n",
"    'Degradation of PARP1 in Primary Cardiomyocytes after 24 h treatment',\n",
"    'Degradation of HDAC6 in MM1S after 6 h treatment by in-cell ELISA analysis',\n",
"    'Degradation of total tau/P-tau in A152T neurons after 24 h treatment',\n",
"    'Degradation of Rpn13 in MM.1S after 16 h treatment',\n",
"    'Degradation of HDAC6 in MM1S after 6 h treatment by in-cell ELISA analysis',\n",
"]\n",
"\n",
"# Run the parser on the last five collected descriptions plus the corner cases\n",
"# and print what was extracted from each.\n",
"for assay in assays[\"Assay (DC50/Dmax)\"][-5:] + corner_cases:\n",
"    # Only entries of at least five characters are parsed\n",
"    if len(assay) >= 5:\n",
"        print(assay)\n",
"        extracted_info = extract_dc50_info(assay)\n",
"        proteins = extracted_info['Target (Parsed)']\n",
"        cells = extracted_info['Cell Type']\n",
"        treatment_hours = extracted_info['Treatment Time (h)']\n",
"        print(proteins, \"|\", cells, \"|\", treatment_hours)\n",
"        print('-' * 80)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
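"Let's first remove all entries with _all_ missing values in the `DC50 (nM)`, `Dmax (%)`, and `Assay (DC50/Dmax)` columns. We then keep only the entries that have an assay description, drop the columns belonging to the other assays, and remove duplicate rows:"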
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1019"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"param_cols = assay_to_parameters[\"Assay (DC50/Dmax)\"]\n",
"# Columns that must survive: the DC50/Dmax assay description and its parameters\n",
"dc50_dmax_cols = param_cols + [\"Assay (DC50/Dmax)\"]\n",
"dc50_dmax_df = (\n",
"    pd.concat([protac_df, scraped_protac_df])\n",
"    # Discard rows where the assay description and all of its parameters are missing\n",
"    .dropna(subset=dc50_dmax_cols, how='all')\n",
"    # Keep only the rows that actually carry an assay description\n",
"    .loc[lambda df: df[\"Assay (DC50/Dmax)\"].notnull()]\n",
"    # Remove every column in assay_cols that is not part of the DC50/Dmax assay\n",
"    .drop(columns=[col for col in assay_cols if col not in dc50_dmax_cols])\n",
"    .drop_duplicates()\n",
")\n",
"len(dc50_dmax_df)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "8dc21c335d764482b9c202ad0ddce4c5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Extracting DC50/Dmax info: 0%| | 0/1019 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
Compound ID
\n",
"
Uniprot
\n",
"
Target
\n",
"
E3 Ligase
\n",
"
PDB
\n",
"
Name
\n",
"
Smiles
\n",
"
DC50 (nM)
\n",
"
Dmax (%)
\n",
"
Assay (DC50/Dmax)
\n",
"
...
\n",
"
Hydrogen Bond Acceptor Count
\n",
"
Hydrogen Bond Donor Count
\n",
"
Rotatable Bond Count
\n",
"
Topological Polar Surface Area
\n",
"
Molecular Formula
\n",
"
InChI
\n",
"
InChI Key
\n",
"
Target (Parsed)
\n",
"
Cell Type
\n",
"
Treatment Time (h)
\n",
"
\n",
" \n",
" \n",
"
\n",
"
0
\n",
"
11
\n",
"
Q9H8M2
\n",
"
BRD9
\n",
"
VHL
\n",
"
NaN
\n",
"
NaN
\n",
"
COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...
\n",
"
560.00
\n",
"
80.0
\n",
"
Degradation of BRD9 in HeLa cells after 4 h tr...
\n",
"
...
\n",
"
16
\n",
"
3
\n",
"
22
\n",
"
199.15
\n",
"
C54H69FN8O10S
\n",
"
InChI=1S/C54H69FN8O10S/c1-34-47(74-33-58-34)35...
\n",
"
MXAKQOVZPDLCDK-UDVNCTHFSA-N
\n",
"
BRD9
\n",
"
HeLa
\n",
"
4.0
\n",
"
\n",
"
\n",
"
1
\n",
"
22
\n",
"
Q9H8M2
\n",
"
BRD9
\n",
"
VHL
\n",
"
NaN
\n",
"
VZ185
\n",
"
COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...
\n",
"
1.76
\n",
"
95.0
\n",
"
Degradation of BRD9 in RI-1 cells after 8 h tr...
\n",
"
...
\n",
"
14
\n",
"
3
\n",
"
19
\n",
"
180.69
\n",
"
C53H67FN8O8S
\n",
"
InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...
\n",
"
ZAGCLFXBHOXXEN-JPTLTNPLSA-N
\n",
"
BRD9
\n",
"
RI-1
\n",
"
8.0
\n",
"
\n",
"
\n",
"
2
\n",
"
22
\n",
"
Q9H8M2
\n",
"
BRD9
\n",
"
VHL
\n",
"
NaN
\n",
"
VZ185
\n",
"
COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...
\n",
"
4.00
\n",
"
NaN
\n",
"
Degradation of HiBiT-BRD9 in HEK293 cells afte...
\n",
"
...
\n",
"
14
\n",
"
3
\n",
"
19
\n",
"
180.69
\n",
"
C53H67FN8O8S
\n",
"
InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...
\n",
"
ZAGCLFXBHOXXEN-JPTLTNPLSA-N
\n",
"
HiBiT-BRD9
\n",
"
HEK293
\n",
"
24.0
\n",
"
\n",
"
\n",
"
3
\n",
"
22
\n",
"
Q9H8M2
\n",
"
BRD9
\n",
"
VHL
\n",
"
NaN
\n",
"
VZ185
\n",
"
COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...
\n",
"
2.00
\n",
"
NaN
\n",
"
Degradation of BRD9 in EOL-1/A-204 cells after...
\n",
"
...
\n",
"
14
\n",
"
3
\n",
"
19
\n",
"
180.69
\n",
"
C53H67FN8O8S
\n",
"
InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...
\n",
"
ZAGCLFXBHOXXEN-JPTLTNPLSA-N
\n",
"
BRD9
\n",
"
EOL-1
\n",
"
18.0
\n",
"
\n",
"
\n",
"
4
\n",
"
22
\n",
"
Q9H8M2
\n",
"
BRD9
\n",
"
VHL
\n",
"
NaN
\n",
"
VZ185
\n",
"
COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...
\n",
"
8.00
\n",
"
NaN
\n",
"
Degradation of BRD9 in EOL-1/A-204 cells after...
\n",
"
...
\n",
"
14
\n",
"
3
\n",
"
19
\n",
"
180.69
\n",
"
C53H67FN8O8S
\n",
"
InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...
\n",
"
ZAGCLFXBHOXXEN-JPTLTNPLSA-N
\n",
"
BRD9
\n",
"
A-204
\n",
"
18.0
\n",
"
\n",
" \n",
"
\n",
"
5 rows × 26 columns
\n",
"
"
],
"text/plain": [
" Compound ID Uniprot Target E3 Ligase PDB Name \\\n",
"0 11 Q9H8M2 BRD9 VHL NaN NaN \n",
"1 22 Q9H8M2 BRD9 VHL NaN VZ185 \n",
"2 22 Q9H8M2 BRD9 VHL NaN VZ185 \n",
"3 22 Q9H8M2 BRD9 VHL NaN VZ185 \n",
"4 22 Q9H8M2 BRD9 VHL NaN VZ185 \n",
"\n",
" Smiles DC50 (nM) Dmax (%) \\\n",
"0 COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN... 560.00 80.0 \n",
"1 COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN... 1.76 95.0 \n",
"2 COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN... 4.00 NaN \n",
"3 COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN... 2.00 NaN \n",
"4 COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN... 8.00 NaN \n",
"\n",
" Assay (DC50/Dmax) ... \\\n",
"0 Degradation of BRD9 in HeLa cells after 4 h tr... ... \n",
"1 Degradation of BRD9 in RI-1 cells after 8 h tr... ... \n",
"2 Degradation of HiBiT-BRD9 in HEK293 cells afte... ... \n",
"3 Degradation of BRD9 in EOL-1/A-204 cells after... ... \n",
"4 Degradation of BRD9 in EOL-1/A-204 cells after... ... \n",
"\n",
" Hydrogen Bond Acceptor Count Hydrogen Bond Donor Count \\\n",
"0 16 3 \n",
"1 14 3 \n",
"2 14 3 \n",
"3 14 3 \n",
"4 14 3 \n",
"\n",
" Rotatable Bond Count Topological Polar Surface Area Molecular Formula \\\n",
"0 22 199.15 C54H69FN8O10S \n",
"1 19 180.69 C53H67FN8O8S \n",
"2 19 180.69 C53H67FN8O8S \n",
"3 19 180.69 C53H67FN8O8S \n",
"4 19 180.69 C53H67FN8O8S \n",
"\n",
" InChI \\\n",
"0 InChI=1S/C54H69FN8O10S/c1-34-47(74-33-58-34)35... \n",
"1 InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-... \n",
"2 InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-... \n",
"3 InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-... \n",
"4 InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-... \n",
"\n",
" InChI Key Target (Parsed) Cell Type Treatment Time (h) \n",
"0 MXAKQOVZPDLCDK-UDVNCTHFSA-N BRD9 HeLa 4.0 \n",
"1 ZAGCLFXBHOXXEN-JPTLTNPLSA-N BRD9 RI-1 8.0 \n",
"2 ZAGCLFXBHOXXEN-JPTLTNPLSA-N HiBiT-BRD9 HEK293 24.0 \n",
"3 ZAGCLFXBHOXXEN-JPTLTNPLSA-N BRD9 EOL-1 18.0 \n",
"4 ZAGCLFXBHOXXEN-JPTLTNPLSA-N BRD9 A-204 18.0 \n",
"\n",
"[5 rows x 26 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Parsed table len: 1216\n"
]
}
],
"source": [
"# One dict per output row; turned into a DataFrame once the loop is done\n",
"parsed_rows = []\n",
"\n",
"for _, row in tqdm(dc50_dmax_df.iterrows(), total=len(dc50_dmax_df), desc='Extracting DC50/Dmax info'):\n",
"    assay = row[\"Assay (DC50/Dmax)\"]\n",
"    if len(assay) < 5:\n",
"        # Skip strings too short to be parsed as an assay description\n",
"        continue\n",
"    extracted_info = extract_dc50_info(assay)\n",
"    extracted_info['DC50 (nM)'] = split_clean_str(\n",
"        row['DC50 (nM)'], return_floats=True)\n",
"    extracted_info['Dmax (%)'] = split_clean_str(\n",
"        row['Dmax (%)'], return_floats=True)\n",
"\n",
"    # Emit as many rows as the longest extracted list. If no field was parsed\n",
"    # into a list, keep the entry once instead of failing on max() of an empty sequence.\n",
"    max_len = max(\n",
"        (len(v) for v in extracted_info.values() if isinstance(v, list)),\n",
"        default=1)\n",
"    row_dict = row.to_dict()\n",
"    for j in range(max_len):\n",
"        # Shorter lists are cycled (j % len(v)); non-list values are repeated as-is\n",
"        parsed_rows.append({\n",
"            **row_dict,\n",
"            **{k: v[j % len(v)] if isinstance(v, list) else v\n",
"               for k, v in extracted_info.items()},\n",
"        })\n",
"\n",
"parsed_table = pd.DataFrame(parsed_rows)\n",
"display(parsed_table.head())\n",
"print(f'Parsed table len: {len(parsed_table)}')"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
Assay (DC50/Dmax)
\n",
"
Target
\n",
"
Target (Parsed)
\n",
"
\n",
" \n",
" \n",
"
\n",
"
2
\n",
"
Degradation of HiBiT-BRD9 in HEK293 cells afte...
\n",
"
BRD9
\n",
"
HiBiT-BRD9
\n",
"
\n",
"
\n",
"
6
\n",
"
Degradation of HiBiT-BRD7 in HEK293 cells afte...
\n",
"
BRD7
\n",
"
HiBiT-BRD7
\n",
"
\n",
"
\n",
"
77
\n",
"
Degradation of NPM-ALK/EML4-ALK in SU-DHL-1/NC...
\n",
"
ALK
\n",
"
NPM-ALK
\n",
"
\n",
"
\n",
"
78
\n",
"
Degradation of NPM-ALK/EML4-ALK in SU-DHL-1/NC...
\n",
"
ALK
\n",
"
EML4-ALK
\n",
"
\n",
"
\n",
"
102
\n",
"
Degradation of Fak in PC3 cells after 24 h tre...
\n",
"
FAK
\n",
"
Fak
\n",
"
\n",
"
\n",
"
111
\n",
"
Degradation of WT/Exon 20 Ins EGFR in OVCAR8/H...
\n",
"
EGFR
\n",
"
WT
\n",
"
\n",
"
\n",
"
112
\n",
"
Degradation of WT/Exon 20 Ins EGFR in OVCAR8/H...
\n",
"
EGFR
\n",
"
Exon 20 Ins EGFR
\n",
"
\n",
"
\n",
"
113
\n",
"
Degradation of Exon 19 del/L858R EGFR in HCC82...
\n",
"
EGFR
\n",
"
Exon 19 del
\n",
"
\n",
"
\n",
"
114
\n",
"
Degradation of Exon 19 del/L858R EGFR in HCC82...
\n",
"
EGFR
\n",
"
L858R EGFR
\n",
"
\n",
"
\n",
"
115
\n",
"
Degradation of L858R, T790M EGFR in H1975 cell...
\n",
"
EGFR
\n",
"
L858R, T790M EGFR
\n",
"
\n",
"
\n",
"
122
\n",
"
Degradation of C481S BTK in XLA cells after 24...
\n",
"
BTK C481S
\n",
"
C481S BTK
\n",
"
\n",
"
\n",
"
141
\n",
"
Degradation of BRD4 short/long in HeLa cells a...
\n",
"
BRD4
\n",
"
BRD4 short
\n",
"
\n",
"
\n",
"
142
\n",
"
Degradation of BRD4 short/long in HeLa cells a...
\n",
"
BRD4
\n",
"
BRD4 long
\n",
"
\n",
"
\n",
"
144
\n",
"
Degradation of BRD4 BD1/2 assessed by EGFP/mCh...
\n",
"
BRD4
\n",
"
BRD4 BD1
\n",
"
\n",
"
\n",
"
145
\n",
"
Degradation of BRD4 BD1/2 assessed by EGFP/mCh...
\n",
"
BRD4
\n",
"
BRD4 BD2
\n",
"
\n",
"
\n",
"
208
\n",
"
Degradation of total STAT3 and p-STAT3Y705 pro...
\n",
"
STAT3
\n",
"
p-STAT3Y705
\n",
"
\n",
"
\n",
"
212
\n",
"
Degradation of total STAT3 and p-STAT3Y705 pro...
\n",
"
STAT3
\n",
"
p-STAT3Y705
\n",
"
\n",
"
\n",
"
262
\n",
"
Degradation of G1202R ALK in 293T cells
\n",
"
ALK G1202R
\n",
"
G1202R ALK
\n",
"
\n",
"
\n",
"
356
\n",
"
Degradation of SMARCA2 in MV-4-11 cells after ...
\n",
"
SMARCA4
\n",
"
SMARCA2
\n",
"
\n",
"
\n",
"
365
\n",
"
Degradation of ERalpha in MCF-7/T47D cells aft...
\n",
"
ER
\n",
"
ERalpha
\n",
"
\n",
"
\n",
"
367
\n",
"
Degradation of ERalpha in MCF-7 cells after 6 ...
\n",
"
ER
\n",
"
ERalpha
\n",
"
\n",
"
\n",
"
368
\n",
"
Degradation of ERalpha in MCF-7 cells after 6h...
\n",
"
ER
\n",
"
ERalpha
\n",
"
\n",
"
\n",
"
387
\n",
"
Degradation of pVHL30 in HeLa cells after 24 h...
\n",
"
VHL
\n",
"
pVHL30
\n",
"
\n",
"
\n",
"
388
\n",
"
Degradation of TrkC in Hs578t cells after 24 h...
\n",
"
TRKC
\n",
"
TrkC
\n",
"
\n",
"
\n",
"
389
\n",
"
Degradation of HADC6 in MM.1S cells after 24 h...
\n",
"
HDAC6
\n",
"
HADC6
\n",
"
\n",
"
\n",
"
408
\n",
"
Degradation of ERalpha in MCF-7 cells after 4/...
\n",
"
ER
\n",
"
ERalpha
\n",
"
\n",
"
\n",
"
411
\n",
"
Degradation of ERalpha in MCF-7 cells after 4h...
\n",
"
ER
\n",
"
ERalpha
\n",
"
\n",
"
\n",
"
412
\n",
"
Degradation of ERalpha in MCF-7 cells after 4/...
\n",
"
ER
\n",
"
ERalpha
\n",
"
\n",
"
\n",
"
495
\n",
"
Degradation of total tau/P-tau in A152T neuron...
\n",
"
Tau
\n",
"
tau/P-tau
\n",
"
\n",
"
\n",
"
523
\n",
"
Degradation of ERalpha in MCF7 cells
\n",
"
ER
\n",
"
ERalpha
\n",
"
\n",
"
\n",
"
536
\n",
"
Degradation of Fak in primary Sertoli/germ cel...
\n",
"
FAK
\n",
"
Fak
\n",
"
\n",
"
\n",
"
620
\n",
"
Degradation of Fak in HLE/HuH-7/SNU-423 cells ...
\n",
"
FAK
\n",
"
Fak
\n",
"
\n",
"
\n",
"
623
\n",
"
Degradation of Fak in HUH-1/HepG2/SK-Hep-1 cel...
\n",
"
FAK
\n",
"
Fak
\n",
"
\n",
"
\n",
"
626
\n",
"
Degradation of Fak in A549/Hep3B2.1-7/SNU-387 ...
\n",
"
FAK
\n",
"
Fak
\n",
"
\n",
"
\n",
"
629
\n",
"
Degradation of Fak in HLF/SNU-398/HUCCT1 cells...
\n",
"
FAK
\n",
"
Fak
\n",
"
\n",
"
\n",
"
663
\n",
"
Degradation of TPM3-TRKA/TRKA in KM12/HEL cell...
\n",
"
TRKA
\n",
"
TPM3-TRKA
\n",
"
\n",
"
\n",
"
718
\n",
"
Degradation of EGFR del19 in HCC827 cells afte...
\n",
"
EGFR e19d
\n",
"
EGFR del19
\n",
"
\n",
"
\n",
"
769
\n",
"
Degradation of ERalpha in MCF-7 cells
\n",
"
ER
\n",
"
ERalpha
\n",
"
\n",
"
\n",
"
806
\n",
"
Degradation of WDR5-HiBiT in MV4-11 (WDR5-HiBi...
\n",
"
WDR5
\n",
"
WDR5-HiBiT
\n",
"
\n",
"
\n",
"
824
\n",
"
Degradation of BRD4 long in HEK293 cells after...
\n",
"
BRD4
\n",
"
BRD4 long
\n",
"
\n",
"
\n",
"
892
\n",
"
Degradation of PYK2 in SR cells after 24 h tre...
\n",
"
PTK2B
\n",
"
PYK2
\n",
"
\n",
"
\n",
"
896
\n",
"
Degradation of RSK1 in NCI-H2228/A549/Calu-1 c...
\n",
"
RPS6KA1
\n",
"
RSK1
\n",
"
\n",
"
\n",
"
899
\n",
"
Degradation of EGFR del19 in HCC-827 cells aft...
\n",
"
EGFR e19d
\n",
"
EGFR del19
\n",
"
\n",
"
\n",
"
1014
\n",
"
Degradation of total AKT in BT474 cells after ...
\n",
"
AKT2
\n",
"
AKT
\n",
"
\n",
"
\n",
"
1015
\n",
"
Degradation of total AKT in BT474 cells after ...
\n",
"
AKT1
\n",
"
AKT
\n",
"
\n",
"
\n",
"
1016
\n",
"
Degradation of total AKT in BT474 cells after ...
\n",
"
AKT3
\n",
"
AKT
\n",
"
\n",
"
\n",
"
1058
\n",
"
Degradation of GSK3B in SH-SY5Y cells after 24...
\n",
"
GSK-3beta
\n",
"
GSK3B
\n",
"
\n",
"
\n",
"
1145
\n",
"
Degradation of SMARCA2 HiBiT in HT1080 cells a...
\n",
"
SMARCA2
\n",
"
SMARCA2 HiBiT
\n",
"
\n",
"
\n",
"
1191
\n",
"
Degradation of Tau5 in HEK293-hTau cells after...
\n",
"
Tau
\n",
"
Tau5
\n",
"
\n",
"
\n",
"
1192
\n",
"
Degradation of ENL in MV4;11 cells after 24 h ...
\n",
"
MLLT1
\n",
"
ENL
\n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Assay (DC50/Dmax) Target \\\n",
"2 Degradation of HiBiT-BRD9 in HEK293 cells afte... BRD9 \n",
"6 Degradation of HiBiT-BRD7 in HEK293 cells afte... BRD7 \n",
"77 Degradation of NPM-ALK/EML4-ALK in SU-DHL-1/NC... ALK \n",
"78 Degradation of NPM-ALK/EML4-ALK in SU-DHL-1/NC... ALK \n",
"102 Degradation of Fak in PC3 cells after 24 h tre... FAK \n",
"111 Degradation of WT/Exon 20 Ins EGFR in OVCAR8/H... EGFR \n",
"112 Degradation of WT/Exon 20 Ins EGFR in OVCAR8/H... EGFR \n",
"113 Degradation of Exon 19 del/L858R EGFR in HCC82... EGFR \n",
"114 Degradation of Exon 19 del/L858R EGFR in HCC82... EGFR \n",
"115 Degradation of L858R, T790M EGFR in H1975 cell... EGFR \n",
"122 Degradation of C481S BTK in XLA cells after 24... BTK C481S \n",
"141 Degradation of BRD4 short/long in HeLa cells a... BRD4 \n",
"142 Degradation of BRD4 short/long in HeLa cells a... BRD4 \n",
"144 Degradation of BRD4 BD1/2 assessed by EGFP/mCh... BRD4 \n",
"145 Degradation of BRD4 BD1/2 assessed by EGFP/mCh... BRD4 \n",
"208 Degradation of total STAT3 and p-STAT3Y705 pro... STAT3 \n",
"212 Degradation of total STAT3 and p-STAT3Y705 pro... STAT3 \n",
"262 Degradation of G1202R ALK in 293T cells ALK G1202R \n",
"356 Degradation of SMARCA2 in MV-4-11 cells after ... SMARCA4 \n",
"365 Degradation of ERalpha in MCF-7/T47D cells aft... ER \n",
"367 Degradation of ERalpha in MCF-7 cells after 6 ... ER \n",
"368 Degradation of ERalpha in MCF-7 cells after 6h... ER \n",
"387 Degradation of pVHL30 in HeLa cells after 24 h... VHL \n",
"388 Degradation of TrkC in Hs578t cells after 24 h... TRKC \n",
"389 Degradation of HADC6 in MM.1S cells after 24 h... HDAC6 \n",
"408 Degradation of ERalpha in MCF-7 cells after 4/... ER \n",
"411 Degradation of ERalpha in MCF-7 cells after 4h... ER \n",
"412 Degradation of ERalpha in MCF-7 cells after 4/... ER \n",
"495 Degradation of total tau/P-tau in A152T neuron... Tau \n",
"523 Degradation of ERalpha in MCF7 cells ER \n",
"536 Degradation of Fak in primary Sertoli/germ cel... FAK \n",
"620 Degradation of Fak in HLE/HuH-7/SNU-423 cells ... FAK \n",
"623 Degradation of Fak in HUH-1/HepG2/SK-Hep-1 cel... FAK \n",
"626 Degradation of Fak in A549/Hep3B2.1-7/SNU-387 ... FAK \n",
"629 Degradation of Fak in HLF/SNU-398/HUCCT1 cells... FAK \n",
"663 Degradation of TPM3-TRKA/TRKA in KM12/HEL cell... TRKA \n",
"718 Degradation of EGFR del19 in HCC827 cells afte... EGFR e19d \n",
"769 Degradation of ERalpha in MCF-7 cells ER \n",
"806 Degradation of WDR5-HiBiT in MV4-11 (WDR5-HiBi... WDR5 \n",
"824 Degradation of BRD4 long in HEK293 cells after... BRD4 \n",
"892 Degradation of PYK2 in SR cells after 24 h tre... PTK2B \n",
"896 Degradation of RSK1 in NCI-H2228/A549/Calu-1 c... RPS6KA1 \n",
"899 Degradation of EGFR del19 in HCC-827 cells aft... EGFR e19d \n",
"1014 Degradation of total AKT in BT474 cells after ... AKT2 \n",
"1015 Degradation of total AKT in BT474 cells after ... AKT1 \n",
"1016 Degradation of total AKT in BT474 cells after ... AKT3 \n",
"1058 Degradation of GSK3B in SH-SY5Y cells after 24... GSK-3beta \n",
"1145 Degradation of SMARCA2 HiBiT in HT1080 cells a... SMARCA2 \n",
"1191 Degradation of Tau5 in HEK293-hTau cells after... Tau \n",
"1192 Degradation of ENL in MV4;11 cells after 24 h ... MLLT1 \n",
"\n",
" Target (Parsed) \n",
"2 HiBiT-BRD9 \n",
"6 HiBiT-BRD7 \n",
"77 NPM-ALK \n",
"78 EML4-ALK \n",
"102 Fak \n",
"111 WT \n",
"112 Exon 20 Ins EGFR \n",
"113 Exon 19 del \n",
"114 L858R EGFR \n",
"115 L858R, T790M EGFR \n",
"122 C481S BTK \n",
"141 BRD4 short \n",
"142 BRD4 long \n",
"144 BRD4 BD1 \n",
"145 BRD4 BD2 \n",
"208 p-STAT3Y705 \n",
"212 p-STAT3Y705 \n",
"262 G1202R ALK \n",
"356 SMARCA2 \n",
"365 ERalpha \n",
"367 ERalpha \n",
"368 ERalpha \n",
"387 pVHL30 \n",
"388 TrkC \n",
"389 HADC6 \n",
"408 ERalpha \n",
"411 ERalpha \n",
"412 ERalpha \n",
"495 tau/P-tau \n",
"523 ERalpha \n",
"536 Fak \n",
"620 Fak \n",
"623 Fak \n",
"626 Fak \n",
"629 Fak \n",
"663 TPM3-TRKA \n",
"718 EGFR del19 \n",
"769 ERalpha \n",
"806 WDR5-HiBiT \n",
"824 BRD4 long \n",
"892 PYK2 \n",
"896 RSK1 \n",
"899 EGFR del19 \n",
"1014 AKT \n",
"1015 AKT \n",
"1016 AKT \n",
"1058 GSK3B \n",
"1145 SMARCA2 HiBiT \n",
"1191 Tau5 \n",
"1192 ENL "
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distinct assays whose parsed target differs from the value in the `Target` column\n",
"target_differs = parsed_table['Target'] != parsed_table['Target (Parsed)']\n",
"parsed_table.loc[\n",
"    target_differs, ['Assay (DC50/Dmax)', 'Target', 'Target (Parsed)']\n",
"].drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Non-Nan Active: 790\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"