"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Report the metrics of the 'proteins as amino-acid counts' experiment,\n",
"# using its CV-train, test and majority-vote reports.\n",
"print('Proteins as amino-acid counts performances:')\n",
"plot_performance_metrics(\n",
"    df_cv=reports['aminoacidcnt_cv_train'],\n",
"    df_test=reports['aminoacidcnt_test'],\n",
"    df_test_majority=reports['aminoacidcnt_majority_vote'],\n",
"    # Plain literal: the previous f-string had no placeholders to interpolate\n",
"    title='aminoacidcnt_performance',\n",
"    show_plot=False,\n",
"    # Report column -> human-readable label\n",
"    metrics_to_plot={\n",
"        'val_acc': 'Validation Accuracy',\n",
"        'val_roc_auc': 'Validation ROC AUC',\n",
"        'val_f1_score': 'Validation F1 Score',\n",
"        'val_precision': 'Validation Precision',\n",
"        'val_recall': 'Validation Recall',\n",
"        'test_acc': 'Test Accuracy',\n",
"        'test_roc_auc': 'Test ROC AUC',\n",
"        'test_f1_score': 'Test F1 Score',\n",
"        'test_precision': 'Test Precision',\n",
"        'test_recall': 'Test Recall',\n",
"    },\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Compare Performance"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1899217/2462072186.py:5: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" tmp['Experiment'] = r\n",
"/tmp/ipykernel_1899217/2462072186.py:5: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" tmp['Experiment'] = r\n",
"/tmp/ipykernel_1899217/2462072186.py:5: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" tmp['Experiment'] = r\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"| Experiment | Study | Test Accuracy | Test ROC AUC |\n",
"|:------------------------------|:-----------|----------------:|---------------:|\n",
"| Baseline | Standard | 0.782051 | 0.845847 |\n",
"| Baseline | Target | 0.526316 | 0.595819 |\n",
"| Baseline | Similarity | 0.779221 | 0.854336 |\n",
"| Cells as one-hot | Standard | 0.820513 | 0.875083 |\n",
"| Cells as one-hot | Target | 0.618421 | 0.61324 |\n",
"| Cells as one-hot | Similarity | 0.74026 | 0.842141 |\n",
"| Proteins as amino-acid counts | Standard | 0.705128 | 0.828571 |\n",
"| Proteins as amino-acid counts | Target | 0.605263 | 0.543554 |\n",
"| Proteins as amino-acid counts | Similarity | 0.74026 | 0.815718 |\n",
"--------------------------------------------------------------------------------\n",
"Comparison of the best models majority vote:\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Gather the majority-vote rows of every experiment into one long DataFrame\n",
"frames = []\n",
"for r in ['majority_vote', 'cellsonehot_majority_vote', 'aminoacidcnt_majority_vote']:\n",
"    tmp = reports[r]\n",
"    # Keep only the rows without a 'cv_models' value. .assign() returns a new\n",
"    # frame, so tagging the filtered slice no longer triggers pandas'\n",
"    # SettingWithCopyWarning.\n",
"    tmp = tmp[tmp['cv_models'].isna()].assign(Experiment=r)\n",
"    frames.append(tmp)\n",
"df = pd.concat(frames)\n",
"# Rename split_type to paper names\n",
"df['split_type'] = df['split_type'].replace({\n",
"    'random': 'Standard',\n",
"    'uniprot': 'Target',\n",
"    'tanimoto': 'Similarity',\n",
"    'standard': 'Standard',\n",
"    'target': 'Target',\n",
"    'similarity': 'Similarity',\n",
"})\n",
"# Rename columns to paper names\n",
"df = df.rename(columns={\n",
"    'split_type': 'Study',\n",
"    'test_acc': 'Test Accuracy',\n",
"    'test_roc_auc': 'Test ROC AUC',\n",
"})\n",
"# Rename experiment names to paper names\n",
"df['Experiment'] = df['Experiment'].replace({\n",
"    'majority_vote': 'Baseline',\n",
"    'cellsonehot_majority_vote': 'Cells as one-hot',\n",
"    'aminoacidcnt_majority_vote': 'Proteins as amino-acid counts',\n",
"})\n",
"print(df[['Experiment', 'Study', 'Test Accuracy', 'Test ROC AUC']].to_markdown(index=False))\n",
"# Break the long experiment names over two lines for use as x tick labels\n",
"df['Experiment'] = df['Experiment'].replace({\n",
"    'Cells as one-hot': 'Cells as\\none-hot',\n",
"    'Proteins as amino-acid counts': 'Proteins as\\nAA counts',\n",
"})\n",
"\n",
"def plot_comparison_df(df, filename=None):\n",
"    \"\"\"Plot test accuracy and test ROC AUC as two side-by-side bar plots.\n",
"\n",
"    Bars are grouped by 'Experiment' on the x-axis and colored by 'Study';\n",
"    error bars show one standard deviation. The figure is always displayed.\n",
"\n",
"    Args:\n",
"        df: DataFrame with 'Experiment', 'Study', 'Test Accuracy' and\n",
"            'Test ROC AUC' columns.\n",
"        filename: Optional base name. When given, the figure is also saved\n",
"            to plots/{filename}.pdf (the directory is created if missing).\n",
"    \"\"\"\n",
"    import os  # local import: only needed to create the output directory\n",
"\n",
"    _, axes = plt.subplots(1, 2, figsize=(8, 5))\n",
"\n",
"    # Same bar plot on both axes; only the plotted metric differs\n",
"    for ax, metric in zip(axes, ['Test Accuracy', 'Test ROC AUC']):\n",
"        sns.barplot(\n",
"            data=df,\n",
"            x='Experiment',\n",
"            y=metric,\n",
"            hue='Study',\n",
"            errorbar=('sd', 1),\n",
"            palette=palette[:3],\n",
"            ax=ax,\n",
"        )\n",
"        ax.set_ylim(0, 1.0)\n",
"        # Remove the x-axis label\n",
"        ax.set_xlabel('')\n",
"        ax.grid(axis='y', alpha=0.5, linewidth=0.5)\n",
"\n",
"    # Accuracy axis is shown as a percentage; ROC AUC keeps the raw 0-1 scale\n",
"    axes[0].yaxis.set_major_formatter(plt.matplotlib.ticker.PercentFormatter(1, decimals=0))\n",
"    # Keep a single legend: drop the left one and place the right one below\n",
"    # the plots, between the two subplots (3 columns)\n",
"    axes[0].legend().remove()\n",
"    axes[1].legend(loc='upper center', bbox_to_anchor=(-0.15, -0.12), ncol=3)\n",
"\n",
"    # Write each bar's value rotated by 90 degrees at a fixed height inside the bar\n",
"    for ax, value_format in zip(axes, ['{:.1%}', '{:.3f}']):\n",
"        for p in ax.patches:\n",
"            # Patches with (almost) no height get no label\n",
"            if p.get_height() < 0.01:\n",
"                continue\n",
"            value = value_format.format(p.get_height())\n",
"            x = p.get_x() + p.get_width() / 2\n",
"            y = 0.3\n",
"            ax.annotate(value, (x, y), ha='center', va='center', color='black', fontsize=10, rotation=90, alpha=0.8)\n",
"\n",
"    if filename is not None:\n",
"        # savefig does not create missing directories\n",
"        os.makedirs('plots', exist_ok=True)\n",
"        plt.savefig(f'plots/{filename}.pdf', bbox_inches='tight')\n",
"    plt.show()\n",
"\n",
"print('-' * 80)\n",
"print('Comparison of the best models majority vote:')\n",
"plot_comparison_df(df, 'embedding_comparison_majority_vote')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"| Experiment | Study | Test Accuracy | Test ROC AUC |\n",
"|:------------------------------|:-----------|----------------:|---------------:|\n",
"| Baseline | Similarity | 0.800866 | 0.857498 |\n",
"| Baseline | Standard | 0.786325 | 0.851163 |\n",
"| Baseline | Target | 0.618421 | 0.588386 |\n",
"| Cells as one-hot | Similarity | 0.748918 | 0.826107 |\n",
"| Cells as one-hot | Standard | 0.807692 | 0.864895 |\n",
"| Cells as one-hot | Target | 0.622807 | 0.604413 |\n",
"| Proteins as amino-acid counts | Similarity | 0.753247 | 0.810298 |\n",
"| Proteins as amino-acid counts | Standard | 0.747863 | 0.831672 |\n",
"| Proteins as amino-acid counts | Target | 0.578947 | 0.540999 |\n",
"--------------------------------------------------------------------------------\n",
"Comparison of the best models mean values:\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Stack the test reports of every experiment, tagging each row with its source.\n",
"# .assign() returns a new frame, so the shared `reports` entries are not modified.\n",
"df = pd.concat([\n",
"    reports[r].assign(Experiment=r)\n",
"    for r in ['test', 'cellsonehot_test', 'aminoacidcnt_test']\n",
"])\n",
"# Rename split_type to paper names\n",
"df['split_type'] = df['split_type'].replace({\n",
"    'random': 'Standard',\n",
"    'uniprot': 'Target',\n",
"    'tanimoto': 'Similarity',\n",
"    'standard': 'Standard',\n",
"    'target': 'Target',\n",
"    'similarity': 'Similarity',\n",
"})\n",
"# Rename columns to paper names\n",
"df = df.rename(columns={\n",
"    'split_type': 'Study',\n",
"    'test_acc': 'Test Accuracy',\n",
"    'test_roc_auc': 'Test ROC AUC',\n",
"})\n",
"# Group by experiment and split type, then average the two reported metrics.\n",
"# The columns are selected before .mean(): its first positional parameter is\n",
"# `numeric_only`, not a list of columns.\n",
"df = (\n",
"    df.groupby(['Experiment', 'Study'])[['Test Accuracy', 'Test ROC AUC']]\n",
"    .mean()\n",
"    .reset_index()\n",
")\n",
"# Rename experiment names to paper names\n",
"df['Experiment'] = df['Experiment'].replace({\n",
"    'test': 'Baseline',\n",
"    'cellsonehot_test': 'Cells as one-hot',\n",
"    'aminoacidcnt_test': 'Proteins as amino-acid counts',\n",
"})\n",
"# Order df by Experiment\n",
"df = df.sort_values(['Experiment'])\n",
"# Order Study by ['Standard', 'Target', 'Similarity']\n",
"df['Study'] = pd.Categorical(df['Study'], ['Standard', 'Target', 'Similarity'])\n",
"\n",
"print(df[['Experiment', 'Study', 'Test Accuracy', 'Test ROC AUC']].to_markdown(index=False))\n",
"# Break the long experiment names over two lines for use as x tick labels\n",
"df['Experiment'] = df['Experiment'].replace({\n",
"    'Cells as one-hot': 'Cells as\\none-hot',\n",
"    'Proteins as amino-acid counts': 'Proteins as\\nAA counts',\n",
"})\n",
"\n",
"print('-' * 80)\n",
"print('Comparison of the best models mean values:')\n",
"plot_comparison_df(df, 'embedding_comparison_mean')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Ablation Studies"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------------------------------------\n",
"Plotting ablation study for standard CV split\n",
"--------------------------------------------------------------------------------\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1899217/2296657062.py:75: UserWarning: The palette list has more values (4) than needed (1), which may not be intended.\n",
" sns.barplot(data=final_df,\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------------------------------------\n",
"Plotting ablation study for target CV split\n",
"--------------------------------------------------------------------------------\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1899217/2296657062.py:75: UserWarning: The palette list has more values (4) than needed (1), which may not be intended.\n",
" sns.barplot(data=final_df,\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------------------------------------\n",
"Plotting ablation study for similarity CV split\n",
"--------------------------------------------------------------------------------\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1899217/2296657062.py:75: UserWarning: The palette list has more values (4) than needed (1), which may not be intended.\n",
" sns.barplot(data=final_df,\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------------------------------------\n",
"Plotting ablation study for standard CV split\n",
"--------------------------------------------------------------------------------\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1899217/2296657062.py:75: UserWarning: The palette list has more values (4) than needed (1), which may not be intended.\n",
" sns.barplot(data=final_df,\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------------------------------------\n",
"Plotting ablation study for target CV split\n",
"--------------------------------------------------------------------------------\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1899217/2296657062.py:75: UserWarning: The palette list has more values (4) than needed (1), which may not be intended.\n",
" sns.barplot(data=final_df,\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------------------------------------\n",
"Plotting ablation study for similarity CV split\n",
"--------------------------------------------------------------------------------\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1899217/2296657062.py:75: UserWarning: The palette list has more values (4) than needed (1), which may not be intended.\n",
" sns.barplot(data=final_df,\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------------------------------------\n",
"Plotting ablation study for standard CV split\n",
"--------------------------------------------------------------------------------\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1899217/2296657062.py:75: UserWarning: The palette list has more values (4) than needed (1), which may not be intended.\n",
" sns.barplot(data=final_df,\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------------------------------------\n",
"Plotting ablation study for target CV split\n",
"--------------------------------------------------------------------------------\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1899217/2296657062.py:75: UserWarning: The palette list has more values (4) than needed (1), which may not be intended.\n",
" sns.barplot(data=final_df,\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------------------------------------\n",
"Plotting ablation study for similarity CV split\n",
"--------------------------------------------------------------------------------\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1899217/2296657062.py:75: UserWarning: The palette list has more values (4) than needed (1), which may not be intended.\n",
" sns.barplot(data=final_df,\n"
]
}
],
"source": [
"def plot_ablation_study(report, title=''):\n",
"    \"\"\"Save one horizontal bar chart of test accuracy per split type.\n",
"\n",
"    For every distinct value of report['split_type'], the chart compares:\n",
"    a 'dummy' score (per row, the larger of 'test_active_perc' and\n",
"    'test_inactive_perc'), the rows with no disabled embeddings, and each\n",
"    ablation setting. Error bars show one standard deviation. Figures are\n",
"    written to plots/{title}{group}.pdf and closed without being displayed.\n",
"\n",
"    Args:\n",
"        report: DataFrame with 'split_type', 'disabled_embeddings',\n",
"            'test_acc', 'test_active_perc' and 'test_inactive_perc' columns.\n",
"            Rows whose 'disabled_embeddings' is missing are used as the\n",
"            all-embeddings-enabled baseline.\n",
"        title: Prefix of the output file names.\n",
"    \"\"\"\n",
"    import os  # local import: only needed to create the output directory\n",
"\n",
"    # Define the ablation study combinations\n",
"    ablation_study_combinations = [\n",
"        'disabled smiles',\n",
"        'disabled poi',\n",
"        'disabled e3',\n",
"        'disabled cell',\n",
"        'disabled poi e3',\n",
"        'disabled poi e3 smiles',\n",
"        'disabled poi e3 cell',\n",
"    ]\n",
"    metrics_to_show = ['test_acc']\n",
"    # Report names -> labels used in the figure\n",
"    metric_labels = {\n",
"        'val_acc': 'Validation Accuracy',\n",
"        'test_acc': 'Test Accuracy',\n",
"        'val_roc_auc': 'Val ROC-AUC',\n",
"        'test_roc_auc': 'Test ROC-AUC',\n",
"    }\n",
"    setting_labels = {\n",
"        'all embeddings enabled': 'All embeddings enabled',\n",
"        'dummy': 'Dummy model',\n",
"        'disabled smiles': 'Disabled PROTAC information',\n",
"        'disabled e3': 'Disabled E3 information',\n",
"        'disabled poi': 'Disabled POI information',\n",
"        'disabled cell': 'Disabled cell information',\n",
"        'disabled poi e3': 'Disabled E3 and POI info',\n",
"        'disabled poi e3 smiles': 'Disabled compound, E3, and POI info\\n(only cell information left)',\n",
"        'disabled poi e3 cell': 'Disabled cell, E3, and POI info\\n(only PROTAC information left)',\n",
"    }\n",
"\n",
"    def _to_long(frame):\n",
"        # One row per (setting, metric) pair with the value in 'score'\n",
"        return frame.melt(id_vars=['disabled_embeddings'], value_vars=metrics_to_show, var_name='metric', value_name='score')\n",
"\n",
"    os.makedirs('plots', exist_ok=True)  # savefig does not create missing directories\n",
"\n",
"    for group in report['split_type'].unique():\n",
"        print('-' * 80)\n",
"        print(f'Plotting ablation study for {group} CV split')\n",
"        print('-' * 80)\n",
"        group_report = report[report['split_type'] == group]\n",
"\n",
"        # Baseline: rows where nothing was disabled\n",
"        baseline = group_report[group_report['disabled_embeddings'].isna()].copy()\n",
"        baseline['disabled_embeddings'] = 'all embeddings enabled'\n",
"        baseline = _to_long(baseline)\n",
"\n",
"        ablation_df = pd.concat([\n",
"            _to_long(group_report[group_report['disabled_embeddings'] == disabled_embeddings])\n",
"            for disabled_embeddings in ablation_study_combinations\n",
"        ])\n",
"\n",
"        # Dummy reference: per row, the larger of the two class percentages\n",
"        dummy_df = pd.DataFrame({\n",
"            'score': group_report[['test_active_perc', 'test_inactive_perc']].max(axis=1),\n",
"        })\n",
"        dummy_df['metric'] = 'test_acc'\n",
"        dummy_df['disabled_embeddings'] = 'dummy'\n",
"\n",
"        final_df = pd.concat([dummy_df, baseline, ablation_df])\n",
"        final_df['metric'] = final_df['metric'].map(metric_labels)\n",
"        final_df['disabled_embeddings'] = final_df['disabled_embeddings'].map(setting_labels)\n",
"\n",
"        fig, ax = plt.subplots(figsize=(3, 5))\n",
"\n",
"        # Request exactly one color per hue level; passing the whole palette\n",
"        # makes seaborn warn that the list has more values than needed\n",
"        n_hue_levels = final_df['metric'].nunique()\n",
"        sns.barplot(\n",
"            data=final_df,\n",
"            y='disabled_embeddings',\n",
"            x='score',\n",
"            hue='metric',\n",
"            ax=ax,\n",
"            errorbar=('sd', 1),\n",
"            palette=sns.color_palette(palette, n_hue_levels),\n",
"            saturation=1,\n",
"        )\n",
"\n",
"        ax.grid(axis='x', alpha=0.5)\n",
"        ax.tick_params(axis='y', rotation=0)\n",
"        ax.set_xlim(0, 1.0)\n",
"        ax.xaxis.set_major_formatter(plt.matplotlib.ticker.PercentFormatter(1, decimals=0))\n",
"        ax.set_ylabel('')\n",
"        ax.set_xlabel('')\n",
"\n",
"        # Plot the legend below the x-axis, outside the plot\n",
"        ax.legend(loc='upper center', bbox_to_anchor=(0.02, -0.1))\n",
"\n",
"        # For each bar, add its value (as percentage) inside the bar\n",
"        for p in ax.patches:\n",
"            # ax.patches can hold an extra patch that is not in the dataframe\n",
"            # (presumably a zero-sized legend proxy added by seaborn, depending\n",
"            # on its version). Skip zero-sized patches rather than assuming\n",
"            # the extra one is always the last.\n",
"            if p.get_width() == 0 and p.get_height() == 0:\n",
"                continue\n",
"            value = '{:.1f}%'.format(100 * p.get_width())\n",
"            y = p.get_y() + p.get_height() / 2\n",
"            x = 0.2\n",
"            ax.annotate(value, (x, y), ha='center', va='center', color='black', fontsize=10, alpha=0.8)\n",
"\n",
"        fig.savefig(f'plots/{title}{group}.pdf', bbox_inches='tight')\n",
"        # Close explicitly: one figure is created per split type\n",
"        plt.close(fig)\n",
"\n",
"# Run the ablation plots for each experiment prefix; the baseline (empty\n",
"# prefix) is saved under the 'pytorch_' name.\n",
"for experiment in ['', 'cellsonehot_', 'aminoacidcnt_']:\n",
"    test_report = reports[f'{experiment}test']\n",
"    # Mark the test rows as having no disabled embeddings\n",
"    test_report['disabled_embeddings'] = pd.NA\n",
"    experiment_name = experiment or 'pytorch_'\n",
"    combined_report = pd.concat([reports[f'{experiment}ablation'], test_report])\n",
"    plot_ablation_study(combined_report, title=f'{experiment_name}ablation_study_')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Others"
]
},
{
"cell_type": "code",
"execution_count": 166,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/cephyr/users/ribes/Alvis/PROTAC-Degradation-Predictor\n"
]
}
],
"source": [
"!pwd"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
Compound ID
\n",
"
Uniprot
\n",
"
Smiles
\n",
"
E3 Ligase
\n",
"
InChI
\n",
"
InChI Key
\n",
"
Molecular Weight
\n",
"
Heavy Atom Count
\n",
"
Ring Count
\n",
"
Rotatable Bond Count
\n",
"
...
\n",
"
Name
\n",
"
Assay (DC50/Dmax)
\n",
"
Exact Mass
\n",
"
XLogP3
\n",
"
Target (Parsed)
\n",
"
POI Sequence
\n",
"
E3 Ligase Uniprot
\n",
"
E3 Ligase Sequence
\n",
"
Cell Line Identifier
\n",
"
Active - OR
\n",
"
\n",
" \n",
" \n",
"
\n",
"
0
\n",
"
1
\n",
"
Q07817
\n",
"
Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...
\n",
"
VHL
\n",
"
InChI=1S/C73H88ClF3N10O10S4/c1-47(49-13-15-51(...
\n",
"
SXPDUCVNMGMWBJ-FMZBIETASA-N
\n",
"
1486.282
\n",
"
101
\n",
"
10
\n",
"
24
\n",
"
...
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
MOLT-4
\n",
"
NaN
\n",
"
\n",
"
\n",
"
1
\n",
"
2
\n",
"
Q07817
\n",
"
Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...
\n",
"
VHL
\n",
"
InChI=1S/C74H90ClF3N10O10S4/c1-48(50-13-15-52(...
\n",
"
HQKUMELJMUNTTF-NMKDNUEVSA-N
\n",
"
1500.309
\n",
"
102
\n",
"
10
\n",
"
25
\n",
"
...
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
MOLT-4
\n",
"
NaN
\n",
"
\n",
"
\n",
"
2
\n",
"
3
\n",
"
Q07817
\n",
"
Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...
\n",
"
VHL
\n",
"
InChI=1S/C75H92ClF3N10O10S4/c1-49(51-16-18-53(...
\n",
"
ATQCEJKUPSBDMA-QARNUTPLSA-N
\n",
"
1514.336
\n",
"
103
\n",
"
10
\n",
"
26
\n",
"
...
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
MOLT-4
\n",
"
NaN
\n",
"
\n",
"
\n",
"
3
\n",
"
4
\n",
"
Q07817
\n",
"
Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...
\n",
"
VHL
\n",
"
InChI=1S/C76H94ClF3N10O10S4/c1-50(52-17-19-54(...
\n",
"
FNKQAGMHNFFSEI-DTTPTBRMSA-N
\n",
"
1528.363
\n",
"
104
\n",
"
10
\n",
"
27
\n",
"
...
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
MOLT-4
\n",
"
NaN
\n",
"
\n",
"
\n",
"
4
\n",
"
5
\n",
"
Q07817
\n",
"
Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...
\n",
"
VHL
\n",
"
InChI=1S/C77H96ClF3N10O10S4/c1-51(53-18-20-55(...
\n",
"
PXVFFBGSTYQHRO-REQIQPEASA-N
\n",
"
1542.390
\n",
"
105
\n",
"
10
\n",
"
28
\n",
"
...
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
MOLT-4
\n",
"
True
\n",
"
\n",
"
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
\n",
"
\n",
"
2136
\n",
"
2342
\n",
"
O60885
\n",
"
Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...
\n",
"
VHL
\n",
"
InChI=1S/C50H61ClN8O8S2/c1-29-31(3)69-49-42(29...
\n",
"
VRVWHAZIBGEPEK-DPSJZEHMSA-N
\n",
"
1001.673
\n",
"
69
\n",
"
7
\n",
"
20
\n",
"
...
\n",
"
NaN
\n",
"
Degradation of BRD4 long in HEK293 cells after...
\n",
"
1000.374231
\n",
"
6.76
\n",
"
BRD4 long
\n",
"
MSAESGPGTRLRNLPVMGDGLETSQMSTTQAQAQPQPANAASTNPP...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
HEK293
\n",
"
True
\n",
"
\n",
"
\n",
"
2137
\n",
"
2887
\n",
"
Q05397
\n",
"
CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCOCCOCCOCCO...
\n",
"
VHL
\n",
"
InChI=1S/C58H75F3N10O10S/c1-37(39-12-14-40(15-...
\n",
"
FOOHAGZPIHCYKX-ZSFXBAAMSA-N
\n",
"
1161.359
\n",
"
82
\n",
"
7
\n",
"
27
\n",
"
...
\n",
"
NaN
\n",
"
Degradation of FAK in A549 cells after 24 h tr...
\n",
"
1160.534044
\n",
"
6.81
\n",
"
FAK
\n",
"
MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
A549 Cas9
\n",
"
False
\n",
"
\n",
"
\n",
"
2138
\n",
"
2889
\n",
"
Q05397
\n",
"
CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCOCCOCC(=O)...
\n",
"
VHL
\n",
"
InChI=1S/C54H67F3N10O8S/c1-33(35-12-14-36(15-1...
\n",
"
RDCVMTUYWQXPEC-FSHOLZCKSA-N
\n",
"
1073.253
\n",
"
76
\n",
"
7
\n",
"
21
\n",
"
...
\n",
"
NaN
\n",
"
Degradation of FAK in A549 cells after 24 h tr...
\n",
"
1072.481615
\n",
"
7.11
\n",
"
FAK
\n",
"
MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
A549 Cas9
\n",
"
False
\n",
"
\n",
"
\n",
"
2139
\n",
"
2890
\n",
"
Q05397
\n",
"
CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCOCC(=O)N[C...
\n",
"
VHL
\n",
"
InChI=1S/C52H63F3N10O7S/c1-31(33-12-14-34(15-1...
\n",
"
SLSLLSIRBMAERC-MGVZSLQJSA-N
\n",
"
1029.200
\n",
"
73
\n",
"
7
\n",
"
18
\n",
"
...
\n",
"
NaN
\n",
"
Degradation of FAK in A549 cells after 24 h tr...
\n",
"
1028.455400
\n",
"
7.26
\n",
"
FAK
\n",
"
MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
A549 Cas9
\n",
"
True
\n",
"
\n",
"
\n",
"
2140
\n",
"
2891
\n",
"
Q05397
\n",
"
CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCC(=O)N[C@H...
\n",
"
VHL
\n",
"
InChI=1S/C51H61F3N10O6S/c1-30(32-12-14-33(15-1...
\n",
"
ASRIXACKPXMNKY-FCFVTTBASA-N
\n",
"
999.174
\n",
"
71
\n",
"
7
\n",
"
16
\n",
"
...
\n",
"
NaN
\n",
"
Degradation of FAK in A549 cells after 24 h tr...
\n",
"
998.444835
\n",
"
7.31
\n",
"
FAK
\n",
"
MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
A549 Cas9
\n",
"
True
\n",
"
\n",
" \n",
"
\n",
"
2141 rows × 35 columns
\n",
"
"
],
"text/plain": [
" Compound ID Uniprot Smiles \\\n",
"0 1 Q07817 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"1 2 Q07817 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"2 3 Q07817 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"3 4 Q07817 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"4 5 Q07817 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"... ... ... ... \n",
"2136 2342 O60885 Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O... \n",
"2137 2887 Q05397 CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCOCCOCCOCCO... \n",
"2138 2889 Q05397 CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCOCCOCC(=O)... \n",
"2139 2890 Q05397 CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCOCC(=O)N[C... \n",
"2140 2891 Q05397 CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCC(=O)N[C@H... \n",
"\n",
" E3 Ligase InChI \\\n",
"0 VHL InChI=1S/C73H88ClF3N10O10S4/c1-47(49-13-15-51(... \n",
"1 VHL InChI=1S/C74H90ClF3N10O10S4/c1-48(50-13-15-52(... \n",
"2 VHL InChI=1S/C75H92ClF3N10O10S4/c1-49(51-16-18-53(... \n",
"3 VHL InChI=1S/C76H94ClF3N10O10S4/c1-50(52-17-19-54(... \n",
"4 VHL InChI=1S/C77H96ClF3N10O10S4/c1-51(53-18-20-55(... \n",
"... ... ... \n",
"2136 VHL InChI=1S/C50H61ClN8O8S2/c1-29-31(3)69-49-42(29... \n",
"2137 VHL InChI=1S/C58H75F3N10O10S/c1-37(39-12-14-40(15-... \n",
"2138 VHL InChI=1S/C54H67F3N10O8S/c1-33(35-12-14-36(15-1... \n",
"2139 VHL InChI=1S/C52H63F3N10O7S/c1-31(33-12-14-34(15-1... \n",
"2140 VHL InChI=1S/C51H61F3N10O6S/c1-30(32-12-14-33(15-1... \n",
"\n",
" InChI Key Molecular Weight Heavy Atom Count \\\n",
"0 SXPDUCVNMGMWBJ-FMZBIETASA-N 1486.282 101 \n",
"1 HQKUMELJMUNTTF-NMKDNUEVSA-N 1500.309 102 \n",
"2 ATQCEJKUPSBDMA-QARNUTPLSA-N 1514.336 103 \n",
"3 FNKQAGMHNFFSEI-DTTPTBRMSA-N 1528.363 104 \n",
"4 PXVFFBGSTYQHRO-REQIQPEASA-N 1542.390 105 \n",
"... ... ... ... \n",
"2136 VRVWHAZIBGEPEK-DPSJZEHMSA-N 1001.673 69 \n",
"2137 FOOHAGZPIHCYKX-ZSFXBAAMSA-N 1161.359 82 \n",
"2138 RDCVMTUYWQXPEC-FSHOLZCKSA-N 1073.253 76 \n",
"2139 SLSLLSIRBMAERC-MGVZSLQJSA-N 1029.200 73 \n",
"2140 ASRIXACKPXMNKY-FCFVTTBASA-N 999.174 71 \n",
"\n",
" Ring Count Rotatable Bond Count ... Name \\\n",
"0 10 24 ... NaN \n",
"1 10 25 ... NaN \n",
"2 10 26 ... NaN \n",
"3 10 27 ... NaN \n",
"4 10 28 ... NaN \n",
"... ... ... ... ... \n",
"2136 7 20 ... NaN \n",
"2137 7 27 ... NaN \n",
"2138 7 21 ... NaN \n",
"2139 7 18 ... NaN \n",
"2140 7 16 ... NaN \n",
"\n",
" Assay (DC50/Dmax) Exact Mass XLogP3 \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
"... ... ... ... \n",
"2136 Degradation of BRD4 long in HEK293 cells after... 1000.374231 6.76 \n",
"2137 Degradation of FAK in A549 cells after 24 h tr... 1160.534044 6.81 \n",
"2138 Degradation of FAK in A549 cells after 24 h tr... 1072.481615 7.11 \n",
"2139 Degradation of FAK in A549 cells after 24 h tr... 1028.455400 7.26 \n",
"2140 Degradation of FAK in A549 cells after 24 h tr... 998.444835 7.31 \n",
"\n",
" Target (Parsed) POI Sequence \\\n",
"0 NaN MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... \n",
"1 NaN MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... \n",
"2 NaN MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... \n",
"3 NaN MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... \n",
"4 NaN MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... \n",
"... ... ... \n",
"2136 BRD4 long MSAESGPGTRLRNLPVMGDGLETSQMSTTQAQAQPQPANAASTNPP... \n",
"2137 FAK MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN... \n",
"2138 FAK MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN... \n",
"2139 FAK MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN... \n",
"2140 FAK MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN... \n",
"\n",
" E3 Ligase Uniprot E3 Ligase Sequence \\\n",
"0 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"1 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"2 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"3 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"4 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"... ... ... \n",
"2136 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"2137 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"2138 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"2139 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"2140 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"\n",
" Cell Line Identifier Active - OR \n",
"0 MOLT-4 NaN \n",
"1 MOLT-4 NaN \n",
"2 MOLT-4 NaN \n",
"3 MOLT-4 NaN \n",
"4 MOLT-4 True \n",
"... ... ... \n",
"2136 HEK293 True \n",
"2137 A549 Cas9 False \n",
"2138 A549 Cas9 False \n",
"2139 A549 Cas9 True \n",
"2140 A549 Cas9 True \n",
"\n",
"[2141 rows x 35 columns]"
]
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import sys\n",
"import protac_degradation_predictor as pdp\n",
"\n",
"protac_df = pdp.load_curated_dataset()\n",
"protac_df"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from rdkit import Chem\n",
"\n",
"def canon_smiles(smi):\n",
"    \"\"\"Return the RDKit canonical SMILES of `smi`, or None if it cannot be parsed.\"\"\"\n",
"    parsed = Chem.MolFromSmiles(smi)\n",
"    return Chem.MolToSmiles(parsed) if parsed is not None else None\n",
"\n",
"# Store the canonical form of every SMILES string in a new column\n",
"protac_df['canon_smiles'] = protac_df['Smiles'].apply(canon_smiles)\n",
"# True only if every stored SMILES already equals its canonical form\n",
"protac_df['canon_smiles'].equals(protac_df['Smiles'])"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
Compound ID
\n",
"
Uniprot
\n",
"
Smiles
\n",
"
E3 Ligase
\n",
"
InChI
\n",
"
InChI Key
\n",
"
Molecular Weight
\n",
"
Heavy Atom Count
\n",
"
Ring Count
\n",
"
Rotatable Bond Count
\n",
"
...
\n",
"
Name
\n",
"
Assay (DC50/Dmax)
\n",
"
Exact Mass
\n",
"
XLogP3
\n",
"
Target (Parsed)
\n",
"
POI Sequence
\n",
"
E3 Ligase Uniprot
\n",
"
E3 Ligase Sequence
\n",
"
Cell Line Identifier
\n",
"
Active - OR
\n",
"
\n",
" \n",
" \n",
"
\n",
"
0
\n",
"
1
\n",
"
Q07817
\n",
"
Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...
\n",
"
VHL
\n",
"
InChI=1S/C73H88ClF3N10O10S4/c1-47(49-13-15-51(...
\n",
"
SXPDUCVNMGMWBJ-FMZBIETASA-N
\n",
"
1486.282
\n",
"
101
\n",
"
10
\n",
"
24
\n",
"
...
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
MOLT-4
\n",
"
NaN
\n",
"
\n",
"
\n",
"
1
\n",
"
2
\n",
"
Q07817
\n",
"
Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...
\n",
"
VHL
\n",
"
InChI=1S/C74H90ClF3N10O10S4/c1-48(50-13-15-52(...
\n",
"
HQKUMELJMUNTTF-NMKDNUEVSA-N
\n",
"
1500.309
\n",
"
102
\n",
"
10
\n",
"
25
\n",
"
...
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
MOLT-4
\n",
"
NaN
\n",
"
\n",
"
\n",
"
2
\n",
"
3
\n",
"
Q07817
\n",
"
Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...
\n",
"
VHL
\n",
"
InChI=1S/C75H92ClF3N10O10S4/c1-49(51-16-18-53(...
\n",
"
ATQCEJKUPSBDMA-QARNUTPLSA-N
\n",
"
1514.336
\n",
"
103
\n",
"
10
\n",
"
26
\n",
"
...
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
MOLT-4
\n",
"
NaN
\n",
"
\n",
"
\n",
"
3
\n",
"
4
\n",
"
Q07817
\n",
"
Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...
\n",
"
VHL
\n",
"
InChI=1S/C76H94ClF3N10O10S4/c1-50(52-17-19-54(...
\n",
"
FNKQAGMHNFFSEI-DTTPTBRMSA-N
\n",
"
1528.363
\n",
"
104
\n",
"
10
\n",
"
27
\n",
"
...
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
MOLT-4
\n",
"
NaN
\n",
"
\n",
"
\n",
"
4
\n",
"
5
\n",
"
Q07817
\n",
"
Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...
\n",
"
VHL
\n",
"
InChI=1S/C77H96ClF3N10O10S4/c1-51(53-18-20-55(...
\n",
"
PXVFFBGSTYQHRO-REQIQPEASA-N
\n",
"
1542.390
\n",
"
105
\n",
"
10
\n",
"
28
\n",
"
...
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
MOLT-4
\n",
"
True
\n",
"
\n",
"
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
\n",
"
\n",
"
2136
\n",
"
2342
\n",
"
O60885
\n",
"
Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...
\n",
"
VHL
\n",
"
InChI=1S/C50H61ClN8O8S2/c1-29-31(3)69-49-42(29...
\n",
"
VRVWHAZIBGEPEK-DPSJZEHMSA-N
\n",
"
1001.673
\n",
"
69
\n",
"
7
\n",
"
20
\n",
"
...
\n",
"
NaN
\n",
"
Degradation of BRD4 long in HEK293 cells after...
\n",
"
1000.374231
\n",
"
6.76
\n",
"
BRD4 long
\n",
"
MSAESGPGTRLRNLPVMGDGLETSQMSTTQAQAQPQPANAASTNPP...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
HEK293
\n",
"
True
\n",
"
\n",
"
\n",
"
2137
\n",
"
2887
\n",
"
Q05397
\n",
"
CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCOCCOCCOCCO...
\n",
"
VHL
\n",
"
InChI=1S/C58H75F3N10O10S/c1-37(39-12-14-40(15-...
\n",
"
FOOHAGZPIHCYKX-ZSFXBAAMSA-N
\n",
"
1161.359
\n",
"
82
\n",
"
7
\n",
"
27
\n",
"
...
\n",
"
NaN
\n",
"
Degradation of FAK in A549 cells after 24 h tr...
\n",
"
1160.534044
\n",
"
6.81
\n",
"
FAK
\n",
"
MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
A549 Cas9
\n",
"
False
\n",
"
\n",
"
\n",
"
2138
\n",
"
2889
\n",
"
Q05397
\n",
"
CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCOCCOCC(=O)...
\n",
"
VHL
\n",
"
InChI=1S/C54H67F3N10O8S/c1-33(35-12-14-36(15-1...
\n",
"
RDCVMTUYWQXPEC-FSHOLZCKSA-N
\n",
"
1073.253
\n",
"
76
\n",
"
7
\n",
"
21
\n",
"
...
\n",
"
NaN
\n",
"
Degradation of FAK in A549 cells after 24 h tr...
\n",
"
1072.481615
\n",
"
7.11
\n",
"
FAK
\n",
"
MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
A549 Cas9
\n",
"
False
\n",
"
\n",
"
\n",
"
2139
\n",
"
2890
\n",
"
Q05397
\n",
"
CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCOCC(=O)N[C...
\n",
"
VHL
\n",
"
InChI=1S/C52H63F3N10O7S/c1-31(33-12-14-34(15-1...
\n",
"
SLSLLSIRBMAERC-MGVZSLQJSA-N
\n",
"
1029.200
\n",
"
73
\n",
"
7
\n",
"
18
\n",
"
...
\n",
"
NaN
\n",
"
Degradation of FAK in A549 cells after 24 h tr...
\n",
"
1028.455400
\n",
"
7.26
\n",
"
FAK
\n",
"
MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
A549 Cas9
\n",
"
True
\n",
"
\n",
"
\n",
"
2140
\n",
"
2891
\n",
"
Q05397
\n",
"
CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCC(=O)N[C@H...
\n",
"
VHL
\n",
"
InChI=1S/C51H61F3N10O6S/c1-30(32-12-14-33(15-1...
\n",
"
ASRIXACKPXMNKY-FCFVTTBASA-N
\n",
"
999.174
\n",
"
71
\n",
"
7
\n",
"
16
\n",
"
...
\n",
"
NaN
\n",
"
Degradation of FAK in A549 cells after 24 h tr...
\n",
"
998.444835
\n",
"
7.31
\n",
"
FAK
\n",
"
MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
A549 Cas9
\n",
"
True
\n",
"
\n",
" \n",
"
\n",
"
2141 rows × 35 columns
\n",
"
"
],
"text/plain": [
" Compound ID Uniprot Smiles \\\n",
"0 1 Q07817 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"1 2 Q07817 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"2 3 Q07817 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"3 4 Q07817 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"4 5 Q07817 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"... ... ... ... \n",
"2136 2342 O60885 Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O... \n",
"2137 2887 Q05397 CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCOCCOCCOCCO... \n",
"2138 2889 Q05397 CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCOCCOCC(=O)... \n",
"2139 2890 Q05397 CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCOCC(=O)N[C... \n",
"2140 2891 Q05397 CNC(=O)c1ccccc1Nc1cc(Nc2ccc(N3CCN(CCC(=O)N[C@H... \n",
"\n",
" E3 Ligase InChI \\\n",
"0 VHL InChI=1S/C73H88ClF3N10O10S4/c1-47(49-13-15-51(... \n",
"1 VHL InChI=1S/C74H90ClF3N10O10S4/c1-48(50-13-15-52(... \n",
"2 VHL InChI=1S/C75H92ClF3N10O10S4/c1-49(51-16-18-53(... \n",
"3 VHL InChI=1S/C76H94ClF3N10O10S4/c1-50(52-17-19-54(... \n",
"4 VHL InChI=1S/C77H96ClF3N10O10S4/c1-51(53-18-20-55(... \n",
"... ... ... \n",
"2136 VHL InChI=1S/C50H61ClN8O8S2/c1-29-31(3)69-49-42(29... \n",
"2137 VHL InChI=1S/C58H75F3N10O10S/c1-37(39-12-14-40(15-... \n",
"2138 VHL InChI=1S/C54H67F3N10O8S/c1-33(35-12-14-36(15-1... \n",
"2139 VHL InChI=1S/C52H63F3N10O7S/c1-31(33-12-14-34(15-1... \n",
"2140 VHL InChI=1S/C51H61F3N10O6S/c1-30(32-12-14-33(15-1... \n",
"\n",
" InChI Key Molecular Weight Heavy Atom Count \\\n",
"0 SXPDUCVNMGMWBJ-FMZBIETASA-N 1486.282 101 \n",
"1 HQKUMELJMUNTTF-NMKDNUEVSA-N 1500.309 102 \n",
"2 ATQCEJKUPSBDMA-QARNUTPLSA-N 1514.336 103 \n",
"3 FNKQAGMHNFFSEI-DTTPTBRMSA-N 1528.363 104 \n",
"4 PXVFFBGSTYQHRO-REQIQPEASA-N 1542.390 105 \n",
"... ... ... ... \n",
"2136 VRVWHAZIBGEPEK-DPSJZEHMSA-N 1001.673 69 \n",
"2137 FOOHAGZPIHCYKX-ZSFXBAAMSA-N 1161.359 82 \n",
"2138 RDCVMTUYWQXPEC-FSHOLZCKSA-N 1073.253 76 \n",
"2139 SLSLLSIRBMAERC-MGVZSLQJSA-N 1029.200 73 \n",
"2140 ASRIXACKPXMNKY-FCFVTTBASA-N 999.174 71 \n",
"\n",
" Ring Count Rotatable Bond Count ... Name \\\n",
"0 10 24 ... NaN \n",
"1 10 25 ... NaN \n",
"2 10 26 ... NaN \n",
"3 10 27 ... NaN \n",
"4 10 28 ... NaN \n",
"... ... ... ... ... \n",
"2136 7 20 ... NaN \n",
"2137 7 27 ... NaN \n",
"2138 7 21 ... NaN \n",
"2139 7 18 ... NaN \n",
"2140 7 16 ... NaN \n",
"\n",
" Assay (DC50/Dmax) Exact Mass XLogP3 \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
"... ... ... ... \n",
"2136 Degradation of BRD4 long in HEK293 cells after... 1000.374231 6.76 \n",
"2137 Degradation of FAK in A549 cells after 24 h tr... 1160.534044 6.81 \n",
"2138 Degradation of FAK in A549 cells after 24 h tr... 1072.481615 7.11 \n",
"2139 Degradation of FAK in A549 cells after 24 h tr... 1028.455400 7.26 \n",
"2140 Degradation of FAK in A549 cells after 24 h tr... 998.444835 7.31 \n",
"\n",
" Target (Parsed) POI Sequence \\\n",
"0 NaN MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... \n",
"1 NaN MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... \n",
"2 NaN MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... \n",
"3 NaN MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... \n",
"4 NaN MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... \n",
"... ... ... \n",
"2136 BRD4 long MSAESGPGTRLRNLPVMGDGLETSQMSTTQAQAQPQPANAASTNPP... \n",
"2137 FAK MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN... \n",
"2138 FAK MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN... \n",
"2139 FAK MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN... \n",
"2140 FAK MAAAYLDPNLNHTPNSSTKTHLGTGMERSPGAMERVLKVFHYFESN... \n",
"\n",
" E3 Ligase Uniprot E3 Ligase Sequence \\\n",
"0 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"1 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"2 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"3 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"4 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"... ... ... \n",
"2136 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"2137 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"2138 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"2139 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"2140 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"\n",
" Cell Line Identifier Active - OR \n",
"0 MOLT-4 NaN \n",
"1 MOLT-4 NaN \n",
"2 MOLT-4 NaN \n",
"3 MOLT-4 NaN \n",
"4 MOLT-4 True \n",
"... ... ... \n",
"2136 HEK293 True \n",
"2137 A549 Cas9 False \n",
"2138 A549 Cas9 False \n",
"2139 A549 Cas9 True \n",
"2140 A549 Cas9 True \n",
"\n",
"[2141 rows x 35 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
Compound ID
\n",
"
Uniprot
\n",
"
Smiles
\n",
"
E3 Ligase
\n",
"
InChI
\n",
"
InChI Key
\n",
"
Molecular Weight
\n",
"
Heavy Atom Count
\n",
"
Ring Count
\n",
"
Rotatable Bond Count
\n",
"
...
\n",
"
Name
\n",
"
Assay (DC50/Dmax)
\n",
"
Exact Mass
\n",
"
XLogP3
\n",
"
Target (Parsed)
\n",
"
POI Sequence
\n",
"
E3 Ligase Uniprot
\n",
"
E3 Ligase Sequence
\n",
"
Cell Line Identifier
\n",
"
Active - OR
\n",
"
\n",
" \n",
" \n",
"
\n",
"
0
\n",
"
1
\n",
"
Q07817
\n",
"
Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...
\n",
"
VHL
\n",
"
InChI=1S/C73H88ClF3N10O10S4/c1-47(49-13-15-51(...
\n",
"
SXPDUCVNMGMWBJ-FMZBIETASA-N
\n",
"
1486.282
\n",
"
101.0
\n",
"
10.0
\n",
"
24.0
\n",
"
...
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
MOLT-4
\n",
"
NaN
\n",
"
\n",
"
\n",
"
1
\n",
"
2
\n",
"
Q07817
\n",
"
Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...
\n",
"
VHL
\n",
"
InChI=1S/C74H90ClF3N10O10S4/c1-48(50-13-15-52(...
\n",
"
HQKUMELJMUNTTF-NMKDNUEVSA-N
\n",
"
1500.309
\n",
"
102.0
\n",
"
10.0
\n",
"
25.0
\n",
"
...
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
MOLT-4
\n",
"
NaN
\n",
"
\n",
"
\n",
"
2
\n",
"
3
\n",
"
Q07817
\n",
"
Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...
\n",
"
VHL
\n",
"
InChI=1S/C75H92ClF3N10O10S4/c1-49(51-16-18-53(...
\n",
"
ATQCEJKUPSBDMA-QARNUTPLSA-N
\n",
"
1514.336
\n",
"
103.0
\n",
"
10.0
\n",
"
26.0
\n",
"
...
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
MOLT-4
\n",
"
NaN
\n",
"
\n",
"
\n",
"
3
\n",
"
4
\n",
"
Q07817
\n",
"
Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...
\n",
"
VHL
\n",
"
InChI=1S/C76H94ClF3N10O10S4/c1-50(52-17-19-54(...
\n",
"
FNKQAGMHNFFSEI-DTTPTBRMSA-N
\n",
"
1528.363
\n",
"
104.0
\n",
"
10.0
\n",
"
27.0
\n",
"
...
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
MOLT-4
\n",
"
NaN
\n",
"
\n",
"
\n",
"
4
\n",
"
6
\n",
"
Q07817
\n",
"
Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...
\n",
"
VHL
\n",
"
InChI=1S/C78H98ClF3N10O10S4/c1-52(54-19-21-56(...
\n",
"
DKBAKHBUQPFQDO-PXKQGBTKSA-N
\n",
"
1556.417
\n",
"
106.0
\n",
"
10.0
\n",
"
29.0
\n",
"
...
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
NaN
\n",
"
MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME...
\n",
"
P40337
\n",
"
MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE...
\n",
"
MOLT-4
\n",
"
NaN
\n",
"
\n",
"
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
\n",
"
\n",
"
1896
\n",
"
384
\n",
"
Q07820
\n",
"
O=C(CCCCC(=O)NCCN1C(=O)c2cccc3c(Sc4ccc(Br)cc4)...
\n",
"
CRBN
\n",
"
InChI=1S/C45H45BrN6O8S/c46-27-15-17-28(18-16-2...
\n",
"
BORXNUWYWZOREQ-UHFFFAOYSA-N
\n",
"
909.860
\n",
"
61.0
\n",
"
7.0
\n",
"
19.0
\n",
"
...
\n",
"
NaN
\n",
"
NaN
\n",
"
908.220296
\n",
"
5.98
\n",
"
NaN
\n",
"
MFGLKRNAVIGLNLYCGGAGLGAGSGGATRPGGRLLATEKEASARR...
\n",
"
Q96SW2
\n",
"
MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI...
\n",
"
HeLa
\n",
"
NaN
\n",
"
\n",
"
\n",
"
1897
\n",
"
910
\n",
"
Q9UBN7
\n",
"
O=C(CCCCCCC(=O)N/N=C/c1ccc(OCCOCCOCCn2cc(CNc3c...
\n",
"
CRBN
\n",
"
InChI=1S/C37H45N9O10/c47-31-15-14-30(35(50)40-...
\n",
"
MHILTYZXXFOWJH-WVKHYPTHSA-N
\n",
"
775.820
\n",
"
56.0
\n",
"
5.0
\n",
"
23.0
\n",
"
...
\n",
"
NaN
\n",
"
NaN
\n",
"
775.328939
\n",
"
1.00
\n",
"
NaN
\n",
"
MTSTGQDSTTTRQRRSRQNPQSPPQDSSVTSKRNIKKGAVPRSIPN...
\n",
"
Q96SW2
\n",
"
MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI...
\n",
"
MM1.S
\n",
"
NaN
\n",
"
\n",
"
\n",
"
1898
\n",
"
2544
\n",
"
O60760
\n",
"
O=C1CCC(N2C(=O)c3cccc(N4CCN(C(=O)C5CCN(c6ccc(N...
\n",
"
CRBN
\n",
"
InChI=1S/C40H38N8O7/c49-33-14-13-32(36(51)44-3...
\n",
"
KQNXUQJGOJWQGL-UHFFFAOYSA-N
\n",
"
742.793
\n",
"
55.0
\n",
"
8.0
\n",
"
8.0
\n",
"
...
\n",
"
PROTAC(H-PGDS)-7
\n",
"
Degradation of HPGDS in KU812 cells after 6/24...
\n",
"
742.286346
\n",
"
2.77
\n",
"
HPGDS
\n",
"
MPNYKLTYFNMRGRAEIIRYIFAYLDIQYEDHRIEQADWPEIKSTL...
\n",
"
Q96SW2
\n",
"
MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI...
\n",
"
Ku812
\n",
"
True
\n",
"
\n",
"
\n",
"
1899
\n",
"
1214
\n",
"
P14174
\n",
"
O=C1CCC(N2C(=O)c3cccc(NCCCCCCCC(=O)Nc4ccc(N5Cc...
\n",
"
CRBN
\n",
"
InChI=1S/C35H35N5O8/c41-24-15-10-21-20-39(35(4...
\n",
"
HAHDZDUOFHMMEA-UHFFFAOYSA-N
\n",
"
653.692
\n",
"
48.0
\n",
"
6.0
\n",
"
12.0
\n",
"
...
\n",
"
NaN
\n",
"
NaN
\n",
"
653.248563
\n",
"
4.16
\n",
"
NaN
\n",
"
MPMFIVNTNVPRASVPDGFLSELTQQLAQATGKPPQYIAVHVVPDQ...
\n",
"
Q96SW2
\n",
"
MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI...
\n",
"
A549 Cas9
\n",
"
False
\n",
"
\n",
"
\n",
"
1900
\n",
"
136
\n",
"
P00533
\n",
"
O=C1CCC(N2Cc3c(NC(=O)CCCCCCCN4CCN(c5ccc(Nc6cc7...
\n",
"
CRBN
\n",
"
InChI=1S/C49H57FN12O5/c50-33-10-12-34(13-11-33...
\n",
"
ZWLPWTDAYPOQGR-UHFFFAOYSA-N
\n",
"
913.072
\n",
"
67.0
\n",
"
9.0
\n",
"
17.0
\n",
"
...
\n",
"
NaN
\n",
"
NaN
\n",
"
912.455891
\n",
"
5.26
\n",
"
NaN
\n",
"
MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...
\n",
"
Q96SW2
\n",
"
MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI...
\n",
"
HCC827
\n",
"
True
\n",
"
\n",
" \n",
"
\n",
"
1901 rows × 35 columns
\n",
"
"
],
"text/plain": [
" Compound ID Uniprot Smiles \\\n",
"0 1 Q07817 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"1 2 Q07817 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"2 3 Q07817 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"3 4 Q07817 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"4 6 Q07817 Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)... \n",
"... ... ... ... \n",
"1896 384 Q07820 O=C(CCCCC(=O)NCCN1C(=O)c2cccc3c(Sc4ccc(Br)cc4)... \n",
"1897 910 Q9UBN7 O=C(CCCCCCC(=O)N/N=C/c1ccc(OCCOCCOCCn2cc(CNc3c... \n",
"1898 2544 O60760 O=C1CCC(N2C(=O)c3cccc(N4CCN(C(=O)C5CCN(c6ccc(N... \n",
"1899 1214 P14174 O=C1CCC(N2C(=O)c3cccc(NCCCCCCCC(=O)Nc4ccc(N5Cc... \n",
"1900 136 P00533 O=C1CCC(N2Cc3c(NC(=O)CCCCCCCN4CCN(c5ccc(Nc6cc7... \n",
"\n",
" E3 Ligase InChI \\\n",
"0 VHL InChI=1S/C73H88ClF3N10O10S4/c1-47(49-13-15-51(... \n",
"1 VHL InChI=1S/C74H90ClF3N10O10S4/c1-48(50-13-15-52(... \n",
"2 VHL InChI=1S/C75H92ClF3N10O10S4/c1-49(51-16-18-53(... \n",
"3 VHL InChI=1S/C76H94ClF3N10O10S4/c1-50(52-17-19-54(... \n",
"4 VHL InChI=1S/C78H98ClF3N10O10S4/c1-52(54-19-21-56(... \n",
"... ... ... \n",
"1896 CRBN InChI=1S/C45H45BrN6O8S/c46-27-15-17-28(18-16-2... \n",
"1897 CRBN InChI=1S/C37H45N9O10/c47-31-15-14-30(35(50)40-... \n",
"1898 CRBN InChI=1S/C40H38N8O7/c49-33-14-13-32(36(51)44-3... \n",
"1899 CRBN InChI=1S/C35H35N5O8/c41-24-15-10-21-20-39(35(4... \n",
"1900 CRBN InChI=1S/C49H57FN12O5/c50-33-10-12-34(13-11-33... \n",
"\n",
" InChI Key Molecular Weight Heavy Atom Count \\\n",
"0 SXPDUCVNMGMWBJ-FMZBIETASA-N 1486.282 101.0 \n",
"1 HQKUMELJMUNTTF-NMKDNUEVSA-N 1500.309 102.0 \n",
"2 ATQCEJKUPSBDMA-QARNUTPLSA-N 1514.336 103.0 \n",
"3 FNKQAGMHNFFSEI-DTTPTBRMSA-N 1528.363 104.0 \n",
"4 DKBAKHBUQPFQDO-PXKQGBTKSA-N 1556.417 106.0 \n",
"... ... ... ... \n",
"1896 BORXNUWYWZOREQ-UHFFFAOYSA-N 909.860 61.0 \n",
"1897 MHILTYZXXFOWJH-WVKHYPTHSA-N 775.820 56.0 \n",
"1898 KQNXUQJGOJWQGL-UHFFFAOYSA-N 742.793 55.0 \n",
"1899 HAHDZDUOFHMMEA-UHFFFAOYSA-N 653.692 48.0 \n",
"1900 ZWLPWTDAYPOQGR-UHFFFAOYSA-N 913.072 67.0 \n",
"\n",
" Ring Count Rotatable Bond Count ... Name \\\n",
"0 10.0 24.0 ... NaN \n",
"1 10.0 25.0 ... NaN \n",
"2 10.0 26.0 ... NaN \n",
"3 10.0 27.0 ... NaN \n",
"4 10.0 29.0 ... NaN \n",
"... ... ... ... ... \n",
"1896 7.0 19.0 ... NaN \n",
"1897 5.0 23.0 ... NaN \n",
"1898 8.0 8.0 ... PROTAC(H-PGDS)-7 \n",
"1899 6.0 12.0 ... NaN \n",
"1900 9.0 17.0 ... NaN \n",
"\n",
" Assay (DC50/Dmax) Exact Mass XLogP3 \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
"... ... ... ... \n",
"1896 NaN 908.220296 5.98 \n",
"1897 NaN 775.328939 1.00 \n",
"1898 Degradation of HPGDS in KU812 cells after 6/24... 742.286346 2.77 \n",
"1899 NaN 653.248563 4.16 \n",
"1900 NaN 912.455891 5.26 \n",
"\n",
" Target (Parsed) POI Sequence \\\n",
"0 NaN MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... \n",
"1 NaN MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... \n",
"2 NaN MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... \n",
"3 NaN MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... \n",
"4 NaN MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME... \n",
"... ... ... \n",
"1896 NaN MFGLKRNAVIGLNLYCGGAGLGAGSGGATRPGGRLLATEKEASARR... \n",
"1897 NaN MTSTGQDSTTTRQRRSRQNPQSPPQDSSVTSKRNIKKGAVPRSIPN... \n",
"1898 HPGDS MPNYKLTYFNMRGRAEIIRYIFAYLDIQYEDHRIEQADWPEIKSTL... \n",
"1899 NaN MPMFIVNTNVPRASVPDGFLSELTQQLAQATGKPPQYIAVHVVPDQ... \n",
"1900 NaN MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED... \n",
"\n",
" E3 Ligase Uniprot E3 Ligase Sequence \\\n",
"0 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"1 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"2 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"3 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"4 P40337 MPRRAENWDEAEVGAEEAGVEEYGPEEDGGEESGAEESGPEESGPE... \n",
"... ... ... \n",
"1896 Q96SW2 MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI... \n",
"1897 Q96SW2 MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI... \n",
"1898 Q96SW2 MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI... \n",
"1899 Q96SW2 MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI... \n",
"1900 Q96SW2 MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNI... \n",
"\n",
" Cell Line Identifier Active - OR \n",
"0 MOLT-4 NaN \n",
"1 MOLT-4 NaN \n",
"2 MOLT-4 NaN \n",
"3 MOLT-4 NaN \n",
"4 MOLT-4 NaN \n",
"... ... ... \n",
"1896 HeLa NaN \n",
"1897 MM1.S NaN \n",
"1898 Ku812 True \n",
"1899 A549 Cas9 False \n",
"1900 HCC827 True \n",
"\n",
"[1901 rows x 35 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import numpy as np\n",
"\n",
"# Remove duplicates with a custom function\n",
"def merge_numerical_cols(group):\n",
"    \"\"\"Collapse one group of duplicated entries into a single-row DataFrame.\n",
"\n",
"    Every numerical column except 'Compound ID' is replaced by the geometric\n",
"    mean of its non-missing values; a column without any value is left as is.\n",
"    The rows are then de-duplicated on the key and class columns, which must\n",
"    leave exactly one row.\n",
"\n",
"    Args:\n",
"        group: DataFrame whose rows share the same key columns. It is not\n",
"            modified; the merge is carried out on a copy.\n",
"\n",
"    Returns:\n",
"        A one-row DataFrame with a fresh 0-based index.\n",
"\n",
"    Raises:\n",
"        AssertionError: If the de-duplication does not leave exactly one row.\n",
"    \"\"\"\n",
"    key_cols = [\n",
"        'Smiles',\n",
"        'Uniprot',\n",
"        'E3 Ligase Uniprot',\n",
"        'Cell Line Identifier',\n",
"    ]\n",
"    class_cols = ['DC50 (nM)', 'Dmax (%)']\n",
"    # Work on a copy: writing into `group` would modify the object owned by\n",
"    # the caller (groupby.apply does not support functions that mutate it).\n",
"    merged = group.copy()\n",
"    for col in merged.select_dtypes(include=[np.number]).columns:\n",
"        if col == 'Compound ID':\n",
"            # Identifier, not a measurement: keep the original values\n",
"            continue\n",
"        values = merged[col].dropna()\n",
"        if values.empty:\n",
"            continue\n",
"        # Multiply in floating point so that the product of an integer column\n",
"        # cannot silently wrap around int64.\n",
"        merged[col] = np.prod(values.to_numpy(dtype=float)) ** (1 / len(values))\n",
"\n",
"    row = merged.drop_duplicates(subset=key_cols + class_cols).reset_index(drop=True)\n",
"\n",
"    assert len(row) == 1, f'Expected exactly one merged row, got {len(row)}'\n",
"\n",
"    return row\n",
"\n",
"\n",
"def remove_duplicates(df):\n",
"    \"\"\"Merge entries that share the same key columns into one row each.\n",
"\n",
"    Rows are duplicates when they agree on 'Smiles', 'Uniprot',\n",
"    'E3 Ligase Uniprot' and 'Cell Line Identifier' (missing values count as\n",
"    equal). Each set of duplicates is collapsed by merge_numerical_cols, which\n",
"    replaces the numerical columns of the set by their geometric mean.\n",
"\n",
"    Args:\n",
"        df: DataFrame containing the key columns.\n",
"\n",
"    Returns:\n",
"        A new DataFrame with a fresh index: the rows that had no duplicate,\n",
"        followed by one merged row per duplicated key combination.\n",
"    \"\"\"\n",
"    key_cols = [\n",
"        'Smiles',\n",
"        'Uniprot',\n",
"        'E3 Ligase Uniprot',\n",
"        'Cell Line Identifier',\n",
"    ]\n",
"    # Flag every row whose key columns occur more than once\n",
"    is_duplicated = df.duplicated(subset=key_cols, keep=False)\n",
"    if not is_duplicated.any():\n",
"        # Nothing to merge\n",
"        return df.reset_index(drop=True)\n",
"\n",
"    # dropna=False: duplicated() treats missing keys as equal, whereas groupby\n",
"    # drops them by default, so rows with a missing key would be removed below\n",
"    # without ever being merged back.\n",
"    merged = df[is_duplicated].groupby(key_cols, dropna=False).apply(merge_numerical_cols)\n",
"    # NOTE: Reset index to remove the multi-index created by the group keys\n",
"    merged = merged.reset_index(drop=True)\n",
"\n",
"    # Unique rows first, then the merged ones\n",
"    return pd.concat([df[~is_duplicated], merged], ignore_index=True)\n",
"\n",
"\n",
"# Show the dataset as it is, then the frame returned by remove_duplicates for\n",
"# comparison; the result is only displayed, protac_df is not reassigned here.\n",
"display(protac_df)\n",
"display(remove_duplicates(protac_df))"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
"# Thresholds handed to pdp.is_active\n",
"pDC50_threshold = 6.0\n",
"Dmax_threshold = 0.6\n",
"\n",
"# Label every entry from its DC50 and Dmax measurements\n",
"protac_df['Active'] = protac_df.apply(\n",
"    lambda row: pdp.is_active(\n",
"        row['DC50 (nM)'],\n",
"        row['Dmax (%)'],\n",
"        pDC50_threshold=pDC50_threshold,\n",
"        Dmax_threshold=Dmax_threshold,\n",
"    ),\n",
"    axis=1,\n",
")\n",
"# Rewrite the 'Iap' spelling as 'IAP' in the E3 ligase names\n",
"protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
Smiles
\n",
"
Uniprot
\n",
"
Cell Line Identifier
\n",
"
E3 Ligase Uniprot
\n",
"
Active
\n",
"
Database
\n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Smiles, Uniprot, Cell Line Identifier, E3 Ligase Uniprot, Active, Database]\n",
"Index: []"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Columns that must all match for two entries to count as copies of each other\n",
"dup_cols = ['Smiles', 'Uniprot', 'Cell Line Identifier', 'E3 Ligase Uniprot', 'Active']\n",
"\n",
"# Keep the entries with no missing value in those columns, plus the 'Database' column\n",
"tmp = protac_df.dropna(subset=dup_cols)[dup_cols + ['Database']]\n",
"\n",
"# Entries occurring more than once, sorted so that copies sit next to each other\n",
"duplicates = tmp[tmp.duplicated(subset=dup_cols, keep=False)].sort_values(dup_cols)\n",
"duplicates.to_csv('duplicates.csv', index=False)\n",
"duplicates"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"