{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4",
"authorship_tag": "ABX9TyMLv05e/gGMLdaS7UI/GPTc",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
""
]
},
{
"cell_type": "markdown",
"source": [
"# PROTAC-Degradation-Predictor: Tutorial\n",
"\n",
"This notebook includes a short tutorial and a collection of code snippets for predicting protein degradation by PROTACs.\n",
"\n",
"The underlying deep learning models have been trained and evaluated as reported in [_\"Modeling PROTAC Degradation Activity with Machine Learning\"_](https://arxiv.org/abs/2406.02637)."
],
"metadata": {
"id": "N_0Bfod51dDY"
}
},
{
"cell_type": "markdown",
"source": [
"## Clone Repository and Install Package"
],
"metadata": {
"id": "nLP8GZMp2cSh"
}
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "DNmrlIcqvOu1",
"outputId": "1c06335e-0d72-4790-f575-679b084a88a6"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Cloning into 'PROTAC-Degradation-Predictor'...\n",
"remote: Enumerating objects: 1052, done.\u001b[K\n",
"remote: Counting objects: 100% (262/262), done.\u001b[K\n",
"remote: Compressing objects: 100% (136/136), done.\u001b[K\n",
"remote: Total 1052 (delta 117), reused 244 (delta 102), pack-reused 790\u001b[K\n",
"Receiving objects: 100% (1052/1052), 284.74 MiB | 13.45 MiB/s, done.\n",
"Resolving deltas: 100% (523/523), done.\n",
"Updating files: 100% (156/156), done.\n",
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m802.3/802.3 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m34.9/34.9 MB\u001b[0m \u001b[31m21.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h\u001b[33mWARNING: Retrying (Retry(total=4, connect=None, read=None, redirect=None, status=None)) after connection broken by 'ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))': /simple/optuna/\u001b[0m\u001b[33m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m380.1/380.1 kB\u001b[0m \u001b[31m20.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m868.8/868.8 kB\u001b[0m \u001b[31m31.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m233.4/233.4 kB\u001b[0m \u001b[31m20.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.3/21.3 MB\u001b[0m \u001b[31m30.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.6/78.6 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Building wheel for protac-degradation-predictor (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
]
}
],
"source": [
"# Clone only when the checkout is missing so that re-running this cell does not fail\n",
"# with \"destination path already exists\".\n",
"!test -d PROTAC-Degradation-Predictor || git clone https://github.com/ribesstefano/PROTAC-Degradation-Predictor.git\n",
"# Use the %pip magic (not !pip) so the package is installed into the running kernel's environment.\n",
"%pip install ./PROTAC-Degradation-Predictor -qqq"
]
},
{
"cell_type": "markdown",
"source": [
"## Setup\n",
"\n",
"Let's import the required packages:"
],
"metadata": {
"id": "Kvq5ljqLKwzI"
}
},
{
"cell_type": "code",
"source": [
"import protac_degradation_predictor as pdp\n",
"import torch\n",
"from rdkit import Chem"
],
"metadata": {
"id": "hkucKE6SKweE"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Loading Curated Dataset\n",
"\n",
"The curated dataset can be loaded via the following code:"
],
"metadata": {
"id": "egrArJi_KdBU"
}
},
{
"cell_type": "code",
"source": [
"# Load the curated dataset and preview its first five rows.\n",
"protac_df = pdp.load_curated_dataset()\n",
"display(protac_df.head(n=5))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 627
},
"id": "_jzpk3GcKjqI",
"outputId": "7b02b3ff-a85f-40b0-b2ec-dc4c72f92fbf"
},
"execution_count": 3,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
" Compound ID Uniprot Smiles \\\n",
"0 1 Q07817 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"1 2 Q07817 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"2 3 Q07817 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"3 4 Q07817 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"4 5 Q07817 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"\n",
" E3 Ligase InChI \\\n",
"0 VHL InChI=1S/C73H88ClF3N10O10S4/c1-47(49-13-15-51(... \n",
"1 VHL InChI=1S/C74H90ClF3N10O10S4/c1-48(50-13-15-52(... \n",
"2 VHL InChI=1S/C75H92ClF3N10O10S4/c1-49(51-16-18-53(... \n",
"3 VHL InChI=1S/C76H94ClF3N10O10S4/c1-50(52-17-19-54(... \n",
"4 VHL InChI=1S/C77H96ClF3N10O10S4/c1-51(53-18-20-55(... \n",
"\n",
" InChI Key Molecular Weight Heavy Atom Count \\\n",
"0 SXPDUCVNMGMWBJ-FMZBIETASA-N 1486.282 101 \n",
"1 HQKUMELJMUNTTF-NMKDNUEVSA-N 1500.309 102 \n",
"2 ATQCEJKUPSBDMA-QARNUTPLSA-N 1514.336 103 \n",
"3 FNKQAGMHNFFSEI-DTTPTBRMSA-N 1528.363 104 \n",
"4 PXVFFBGSTYQHRO-REQIQPEASA-N 1542.390 105 \n",
"\n",
" Ring Count Rotatable Bond Count ... Name Assay (DC50/Dmax) Exact Mass \\\n",
"0 10 24 ... NaN NaN NaN \n",
"1 10 25 ... NaN NaN NaN \n",
"2 10 26 ... NaN NaN NaN \n",
"3 10 27 ... NaN NaN NaN \n",
"4 10 28 ... NaN NaN NaN \n",
"\n",
" XLogP3 Target (Parsed) POI Sequence \\\n",
"0 NaN NaN MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... \n",
"1 NaN NaN MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... \n",
"2 NaN NaN MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... \n",
"3 NaN NaN MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... \n",
"4 NaN NaN MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... \n",
"\n",
" E3 Ligase Uniprot E3 Ligase Sequence \\\n",
"0 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"1 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"2 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"3 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"4 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"\n",
" Cell Line Identifier Active - OR \n",
"0 MOLT-4 NaN \n",
"1 MOLT-4 NaN \n",
"2 MOLT-4 NaN \n",
"3 MOLT-4 NaN \n",
"4 MOLT-4 True \n",
"\n",
"[5 rows x 35 columns]"
],
"text/html": [
"\n",
"
\n", " | Compound ID | \n", "Uniprot | \n", "Smiles | \n", "E3 Ligase | \n", "InChI | \n", "InChI Key | \n", "Molecular Weight | \n", "Heavy Atom Count | \n", "Ring Count | \n", "Rotatable Bond Count | \n", "... | \n", "Name | \n", "Assay (DC50/Dmax) | \n", "Exact Mass | \n", "XLogP3 | \n", "Target (Parsed) | \n", "POI Sequence | \n", "E3 Ligase Uniprot | \n", "E3 Ligase Sequence | \n", "Cell Line Identifier | \n", "Active - OR | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "Q07817 | \n", "Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... | \n", "VHL | \n", "InChI=1S/C73H88ClF3N10O10S4/c1-47(49-13-15-51(... | \n", "SXPDUCVNMGMWBJ-FMZBIETASA-N | \n", "1486.282 | \n", "101 | \n", "10 | \n", "24 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... | \n", "P40337 | \n", "MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... | \n", "MOLT-4 | \n", "NaN | \n", "
1 | \n", "2 | \n", "Q07817 | \n", "Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... | \n", "VHL | \n", "InChI=1S/C74H90ClF3N10O10S4/c1-48(50-13-15-52(... | \n", "HQKUMELJMUNTTF-NMKDNUEVSA-N | \n", "1500.309 | \n", "102 | \n", "10 | \n", "25 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... | \n", "P40337 | \n", "MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... | \n", "MOLT-4 | \n", "NaN | \n", "
2 | \n", "3 | \n", "Q07817 | \n", "Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... | \n", "VHL | \n", "InChI=1S/C75H92ClF3N10O10S4/c1-49(51-16-18-53(... | \n", "ATQCEJKUPSBDMA-QARNUTPLSA-N | \n", "1514.336 | \n", "103 | \n", "10 | \n", "26 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... | \n", "P40337 | \n", "MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... | \n", "MOLT-4 | \n", "NaN | \n", "
3 | \n", "4 | \n", "Q07817 | \n", "Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... | \n", "VHL | \n", "InChI=1S/C76H94ClF3N10O10S4/c1-50(52-17-19-54(... | \n", "FNKQAGMHNFFSEI-DTTPTBRMSA-N | \n", "1528.363 | \n", "104 | \n", "10 | \n", "27 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... | \n", "P40337 | \n", "MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... | \n", "MOLT-4 | \n", "NaN | \n", "
4 | \n", "5 | \n", "Q07817 | \n", "Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... | \n", "VHL | \n", "InChI=1S/C77H96ClF3N10O10S4/c1-51(53-18-20-55(... | \n", "PXVFFBGSTYQHRO-REQIQPEASA-N | \n", "1542.390 | \n", "105 | \n", "10 | \n", "28 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... | \n", "P40337 | \n", "MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... | \n", "MOLT-4 | \n", "True | \n", "
5 rows × 35 columns
\n", "\n", " | Uniprot | \n", "Cell Line Identifier | \n", "Smiles | \n", "E3 Ligase | \n", "DC50 (nM) | \n", "Dmax (%) | \n", "Active | \n", "
---|---|---|---|---|---|---|---|
4 | \n", "Q07817 | \n", "MOLT-4 | \n", "Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... | \n", "VHL | \n", "53.0 | \n", "100.0 | \n", "True | \n", "
7 | \n", "Q07817 | \n", "MOLT-4 | \n", "Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... | \n", "VHL | \n", "93.0 | \n", "90.0 | \n", "True | \n", "
60 | \n", "P00533 | \n", "H1975/WR | \n", "C=CC(=O)Nc1cccc(-n2c(=O)cc(C)c3cnc(Nc4ccc(N5CC... | \n", "VHL | \n", "25.3 | \n", "90.0 | \n", "True | \n", "
69 | \n", "P00533 | \n", "H1975/WR | \n", "C=CC(=O)Nc1cccc(-n2c(=O)cc(C)c3cnc(Nc4ccc(N5CC... | \n", "VHL | \n", "5.9 | \n", "100.0 | \n", "True | \n", "
72 | \n", "Q9NWZ3 | \n", "PH1-PBMCs-hiPSC4F1 | \n", "COc1cc2c(OC[C@@H]3CCC(=O)N3)ncc(C#CCCCCCCCCCCC... | \n", "VHL | \n", "3000.0 | \n", "50.0 | \n", "False | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
2121 | \n", "O60885 | \n", "HEK293T | \n", "Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(=O)NCCO... | \n", "FEM1B | \n", "820.0 | \n", "81.0 | \n", "True | \n", "
2122 | \n", "O60885 | \n", "HEK293T | \n", "Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(=O)NCCC... | \n", "FEM1B | \n", "250.0 | \n", "94.0 | \n", "True | \n", "
2125 | \n", "O60885 | \n", "HEK293T | \n", "Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(=O)NCCO... | \n", "FEM1B | \n", "1100.0 | \n", "85.0 | \n", "False | \n", "
2126 | \n", "O60885 | \n", "HEK293T | \n", "Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(=O)NCCO... | \n", "FEM1B | \n", "3600.0 | \n", "60.0 | \n", "False | \n", "
2127 | \n", "O60885 | \n", "HEK293T | \n", "Cc1sc2c(c1C)C(c1ccc(Cl)cc1)=N[C@@H](CC(=O)NCCO... | \n", "FEM1B | \n", "1600.0 | \n", "80.0 | \n", "False | \n", "
857 rows × 7 columns
\n", "\n", " | Uniprot | \n", "Cell Line Identifier | \n", "Smiles | \n", "E3 Ligase | \n", "DC50 (nM) | \n", "Dmax (%) | \n", "Active | \n", "
---|---|---|---|---|---|---|---|
1871 | \n", "Q92769 | \n", "HCT116-53BPI(+/-) | \n", "Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O... | \n", "VHL | \n", "NaN | \n", "23.0 | \n", "False | \n", "