In [2]:
# from IPython.display import display_html

import logging
import warnings
import re
import os
import numpy as np
import pandas as pd
import pickle
import pickle
import requests
import matplotlib.pyplot as plt
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import AllChem
from typing import Literal, Union, List, Dict, Any, Callable
from collections import defaultdict
from tqdm.auto import tqdm
from rdkit import RDLogger

RDLogger.DisableLog('rdApp.*')

In [3]:
def set_global_logging_level(level=logging.ERROR, prefices=[""]):
    """
    Override logging levels of different modules based on their name as a prefix.
    It needs to be invoked after the modules have been loaded so that their loggers have been initialized.

    Args:
        - level: desired level. e.g. logging.INFO. Optional. Default is logging.ERROR
        - prefices: list of one or more str prefices to match (e.g. ["transformers", "torch"]). Optional.
          Default is `[""]` to match all active loggers.
          The match is a case-sensitive `module_name.startswith(prefix)`
    """
    prefix_re = re.compile(fr'^(?:{ "|".join(prefices) })')
    for name in logging.root.manager.loggerDict:
        if re.match(prefix_re, name):
            logging.getLogger(name).setLevel(level)


# Filter out annoying Pytorch Lightning printouts
warnings.filterwarnings('ignore')
warnings.filterwarnings(
    'ignore', '.*Covariance of the parameters could not be estimated.*')
warnings.filterwarnings(
    'ignore', '.*You seem to be using the pipelines sequentially on GPU.*')

In [4]:
# data_dir = os.path.join(os.getcwd(), '..', 'data')
data_dir = os.path.join(os.getcwd(), 'data')
dirs_to_make = [
    data_dir,
    # os.path.join(data_dir, 'raw'),
    # os.path.join(data_dir, 'processed'),
]
for d in dirs_to_make:
    if not os.path.exists(d):
        os.makedirs(d)

In [5]:
protacdb_file = os.path.join(data_dir, 'PROTAC-DB.csv')
protac_df = pd.read_csv(protacdb_file).reset_index(drop=True)

protacdb_file = os.path.join(data_dir, 'PROTAC-DB-v2.csv')
protac_v2_df = pd.read_csv(protacdb_file).reset_index(drop=True)

print(f'Loaded protac.csv')

old2new = {
    'E3 ligase': 'E3 Ligase',
}
protac_df = protac_df.rename(columns=old2new)
protac_v2_df = protac_v2_df.rename(columns=old2new)

Loaded protac.csv


In [6]:
len(protac_v2_df), len(protac_df)

(9380, 5388)

In [7]:
def print_dmax_dc_info(df):
    num_all_notna = len(df.dropna(subset=['Assay (DC50/Dmax)', 'DC50 (nM)', 'Dmax (%)']).dropna(how='all').drop_duplicates())
    num_assay_notna = len(df.dropna(subset=['Assay (DC50/Dmax)']).dropna(how='all').drop_duplicates())
    num_both_notna = len(df.dropna(subset=['DC50 (nM)', 'Dmax (%)']).dropna(how='all').drop_duplicates())
    num_dmax_notna = len(df.dropna(subset=['Dmax (%)']).dropna(how='all').drop_duplicates())
    num_dc50_notna = len(df.dropna(subset=['DC50 (nM)']).dropna(how='all').drop_duplicates())
    num_degr_notna = len(df.dropna(subset=['Percent degradation (%)']).dropna(how='all').drop_duplicates())
    print(f'Number of rows with all 3: {num_all_notna}')
    print(f'Number of rows with Assay: {num_assay_notna}')
    print(f'Number of rows with both DC50 and Dmax: {num_both_notna}')
    print(f'Number of rows with DC50: {num_dc50_notna}')
    print(f'Number of rows with Dmax: {num_dmax_notna}')
    print(f'Number of rows with Percent degradation: {num_degr_notna}')

print('PROTAC-DB')
print_dmax_dc_info(protac_df)
print('')
print('PROTAC-DB-v2')
print_dmax_dc_info(protac_v2_df)

PROTAC-DB
Number of rows with all 3: 344
Number of rows with Assay: 1008
Number of rows with both DC50 and Dmax: 344
Number of rows with DC50: 905
Number of rows with Dmax: 726
Number of rows with Percent degradation: 362

PROTAC-DB-v2
Number of rows with all 3: 909
Number of rows with Assay: 1892
Number of rows with both DC50 and Dmax: 909
Number of rows with DC50: 1762
Number of rows with Dmax: 1317
Number of rows with Percent degradation: 1422


In [8]:
def clean_string(s: str) -> str:
    """ Clean a string by removing <, >, =, NaN, and ranges like 100-200.
    Args:
        s(str): string to clean
    Returns:
        str: cleaned string
    """
    if pd.isnull(s) or s in {'nan', 'n/a', 'NaN', ''}:
        return np.nan
    if 'N.D.' in s:
        return '0'
    s = s.strip('(WB)').strip()
    # # Combine regex operations for efficiency
    # s = re.sub(r'[<=>]|NaN|[\d]+[-~]', '', s)  # Remove <, >, =, NaN, and ranges like 100-200
    # Remove <, >, =, NaN
    s = re.sub(r'[<=>]|NaN', '', s)
    # Replace ranges like 100-200 or 1~3 with the left-most value in the range
    s = re.sub(r'\b(\d+)[-~]\d+\b', r'\1', s)
    # Replace (n/a) with nan
    s = s.replace('(n/a)', 'nan')
    s = re.sub(r'[~<=>% ]', '', s)  # Remove ~, <, >, =, % and spaces
    return s


def split_clean_str(s: str, return_floats: bool = False) -> Union[List[str], List[float]]:
    """ Split a string by '/' and clean each part.
    Args:
        s(str): string to split
        return_floats(bool): whether to return floats or strings
    Returns:
        list: list of cleaned strings or floats
    """
    if pd.isnull(s) or s in {'nan', 'n/a', 'NaN', ''}:
        return np.nan
    cleaned_values = [clean_string(part.strip())
                      for part in s.replace('(n/a)', 'nan').split('/')]
    return [float(value) if return_floats else value for value in cleaned_values]


print(split_clean_str('-100-200/-5/(n/a)/<=90.317/>1000/NaN', return_floats=True))
print(split_clean_str('N.D.', return_floats=True))
print(split_clean_str('96/73 (WB)', return_floats=True))
print(split_clean_str('1.0~3/3.14', return_floats=True))

[-100.0, -5.0, nan, 90.317, 1000.0, nan]
[0.0]
[96.0, 73.0]
[1.0, 3.14]


In [9]:
def get_assay_texts(df: pd.DataFrame, assay_column: str) -> List[str]:
    tmp = df[assay_column].dropna()
    if tmp.empty:
        return []
    return tmp.unique().tolist()


def clean_assay_text(assay):
    tmp = assay.replace('/', ' and ')
    tmp = tmp.replace('BRD4 BD1 and 2', 'BRD4 BD1 and BRD4 BD2')
    tmp = tmp.replace('(Ba and F3 WT)', '(Ba/F3 WT)')
    tmp = tmp.replace('(EGFR L858R and T790M)', '(EGFR L858R/T790M)')
    return tmp


assays = {}
for c in protac_df.columns:
    if 'Assay' in c:
        assays[c] = get_assay_texts(protac_df, c)
texts = list(set([x for y in assays.values() for x in y]))
print(len(texts))
print(sum([len(x) for x in assays.values()]))

813
848


In [10]:
def extract_dc50_info(sentence):
    # Regex patterns for proteins/genes, cell types, and treatment hours
    protein_regex = r"Degradation of total\s(.+?)\s(in|after|using|proteins)"
    cell_regex = r"in\s([A-Za-z0-9-/.;\(\)\s\+]+)\scells"
    treatment_regex = r"after\s(\d+/?\d*?/?\d*?\s?h)"

    # Extracting protein information
    if 'total' in sentence.lower():
        protein_match = re.search(protein_regex, sentence)
        proteins = protein_match.group(1).split(' and ') if protein_match else [
            re.search(r"Degradation of\s([A-Za-z0-9-]+)", sentence).group(1)]
    else:
        if ' in ' in sentence.lower():
            proteins = sentence.split(' in ')[0].split('Degradation of ')[-1]
            proteins = proteins.split('/') if '/' in proteins else [proteins]
        else:
            protein_match = re.search(protein_regex, sentence)
            proteins = protein_match.group(1).split(
                '/') if protein_match else [re.search(r"Degradation of\s([A-Za-z0-9-\/]+)", sentence).group(1)]
    # Handle special cases...
    if 'BRD4 short/long' in sentence:
        proteins = ['BRD4 short', 'BRD4 long']
    if 'BRD4 BD1/2' in sentence:
        proteins = ['BRD4 BD1', 'BRD4 BD2']
    elif 'BRD4 BD1' in sentence:
        proteins = ['BRD4 BD1']
    if 'EGFR L858R/T790M' in sentence:
        proteins = ['EGFR L858R/T790M']
    if 'EGFR del19/T790M/C797S' in sentence:
        proteins = ['EGFR del19/T790M/C797S']

    # Extracting cell types
    cell_match = re.search(cell_regex, sentence)
    cells = cell_match.group(1).split('/') if cell_match else np.nan
    # Handle special cases...
    if 'Ba/F3' in sentence:
        # Replace any occurences that contain 'Ba' or 'F3' with 'Ba/F3' and remove duplicates while preserving the order in the other cells
        cells = ['Ba/F3' if 'Ba' in c or 'F3' in c else c for c in cells]
        cells.pop(cells.index('Ba/F3'))
    if 'ER-positive breast cancer cell lines' in sentence:
        cells = ['ER-positive breast cancer cell lines']
    if 'LNCaP (AR T878A)' in sentence:
        cells = ['LNCaP']
    if 'in A152T neurons' in sentence:
        cells = ['A152T neurons']
    if 'of Rpn13 in MM.1S after' in sentence:
        cells = ['MM.1S']
    if 'Primary Cardiomyocytes' in sentence:
        cells = ['Primary Cardiomyocytes']
    if ' HDAC6 in MM1S after' in sentence:
        cells = ['MM.1S']

    # Extracting treatment hours
    treatment_hours_match = re.search(treatment_regex, sentence)
    if treatment_hours_match:
        treatment_hours = treatment_hours_match.group(1).strip('h')
        treatment_hours = split_clean_str(treatment_hours, return_floats=True)
    else:
        treatment_hours = np.nan

    return {
        'Target (Parsed)': proteins,
        'Cell Type': cells,
        'Treatment Time (h)': treatment_hours,
    }


corner_cases = [
    # 'Degradation of BRD4',
    # 'Degradation of BRD4 short/long in HeLa cells after 24 h treatment',
    # 'Degradation of BRD4 BD1 assessed by EGFP/mCherry reporter assay',
    # 'Degradation of BRD4 BD1/2 assessed by EGFP/mCherry reporter assay',
    # 'Degradation of WT/Exon 20 Ins EGFR in OVCAR8/HeLa cells after 24 h treatment',
    # 'Degradation of TPM3-TRKA/TRKA in KM12/HEL cells after 6 h treatment',
    # 'Degradation of Exon 19 del/L858R EGFR in HCC827/H3255 cells after 24 h treatment',
    # 'Degradation of NPM-ALK/EML4-ALK in SU-DHL-1/NCI-H2228 cells after 16 h treatment',
    # 'Degradation of BCR-ABL T315I in Ba/F3 cells after 24 h treatment',
    # 'Degradation of BCR-ABL T315I in MOL/(Ba/F3)/R4;11 cells after 24 h treatment',
    # 'Degradation of ALK in H3122/Karpas 299/Kelly cells 16 h treatment',
    'Degradation of AR in LNCaP/VCaP AR+ cells after 6 h treatment',
    'Degradation of BRD4 BD1/2 assessed by EGFP/mCherry reporter assay',
    'Degradation of BRD4 BD1 assessed by EGFP/mCherry reporter assay',
    'Degradation of PARP1 in Primary Cardiomyocytes after 24 h treatment',
    'Degradation of HDAC6 in MM1S after 6 h treatment by in-cell ELISA analysis',
    'Degradation of total tau/P-tau in A152T neurons after 24 h treatment',
    'Degradation of Rpn13 in MM.1S after 16 h treatment',
    'Degradation of HDAC6 in MM1S after 6 h treatment by in-cell ELISA analysis',
]

# for assay in assays["Assay (DC50/Dmax)"][-5:] + corner_cases:
#     if len(assay) < 5:
#         continue
#     print(assay)
#     extracted_info = extract_dc50_info(assay)
#     proteins, cells, treatment_hours = extracted_info[
#         'Target (Parsed)'], extracted_info['Cell Type'], extracted_info['Treatment Time (h)']
#     print(proteins, "|", cells, "|", treatment_hours)
#     print('-' * 80)

In [11]:
def get_dc50_dmax_df(df):
    param_cols = ['DC50 (nM)', 'Dmax (%)']
    dc50_dmax_df = df.dropna(subset=param_cols + ["Assay (DC50/Dmax)"], how='all')
    dc50_dmax_df = dc50_dmax_df[dc50_dmax_df["Assay (DC50/Dmax)"].notnull()]
    return dc50_dmax_df.drop_duplicates()

The 'Dmax (%)' column in PROTAC-DB-v2 has two entries which are _dates_ (you never stop surprising me, PROTAC-DB). Convert them to NaNs.

In [12]:
# If any entry in the 'Dmax (%)' column contains the character ':', then it is a
# date and it needs to be set to NaN
def clean_dmax(df):
    df['Dmax (%)'] = df['Dmax (%)'].apply(lambda x: np.nan if ':' in str(x) else x)
    return df

In [13]:
dfs = {}

for name, df in [('protac-db', protac_df), ('protac-db-v2', protac_v2_df)]:
    dc50_dmax_df = get_dc50_dmax_df(clean_dmax(df))

    parsed_table = []
    for i, row in tqdm(dc50_dmax_df.iterrows(), total=len(dc50_dmax_df), desc='Extracting DC50/Dmax info'):
        assay = row["Assay (DC50/Dmax)"]
        if len(assay) < 5:
            continue
        extracted_info = extract_dc50_info(assay)
        extracted_info['DC50 (nM)'] = split_clean_str(
            row['DC50 (nM)'], return_floats=True)
        extracted_info['Dmax (%)'] = split_clean_str(
            row['Dmax (%)'], return_floats=True)

        # Get the max len of each list in the extracted info
        max_len = max([len(v)
                    for v in extracted_info.values() if isinstance(v, list)])
        for i in range(max_len):
            row_tmp = row.copy().to_dict()
            row_tmp.update({k: v[i % len(v)] if isinstance(v, list)
                        else v for k, v in extracted_info.items()})
            parsed_table.append(row_tmp)

    parsed_table = pd.DataFrame(parsed_table)
    display(parsed_table.head())
    print(f'Parsed table len: {len(parsed_table)}')
    dfs[name] = parsed_table

Extracting DC50/Dmax info:   0%|          | 0/1008 [00:00<?, ?it/s]

Unnamed: 0,Compound ID,Uniprot,Target,E3 Ligase,PDB,Name,Smiles,DC50 (nM),Dmax (%),Assay (DC50/Dmax),...,Hydrogen Bond Acceptor Count,Hydrogen Bond Donor Count,Rotatable Bond Count,Topological Polar Surface Area,Molecular Formula,InChI,InChI Key,Target (Parsed),Cell Type,Treatment Time (h)
0,11,Q9H8M2,BRD9,VHL,,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,560.0,80.0,Degradation of BRD9 in HeLa cells after 4 h tr...,...,16,3,22,199.15,C54H69FN8O10S,InChI=1S/C54H69FN8O10S/c1-34-47(74-33-58-34)35...,MXAKQOVZPDLCDK-UDVNCTHFSA-N,BRD9,HeLa,4.0
1,22,Q9H8M2,BRD9,VHL,,VZ185,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,1.76,95.0,Degradation of BRD9 in RI-1 cells after 8 h tr...,...,14,3,19,180.69,C53H67FN8O8S,InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...,ZAGCLFXBHOXXEN-JPTLTNPLSA-N,BRD9,RI-1,8.0
2,22,Q9H8M2,BRD9,VHL,,VZ185,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,4.0,,Degradation of HiBiT-BRD9 in HEK293 cells afte...,...,14,3,19,180.69,C53H67FN8O8S,InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...,ZAGCLFXBHOXXEN-JPTLTNPLSA-N,HiBiT-BRD9,HEK293,24.0
3,22,Q9H8M2,BRD9,VHL,,VZ185,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,2.0,,Degradation of BRD9 in EOL-1/A-204 cells after...,...,14,3,19,180.69,C53H67FN8O8S,InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...,ZAGCLFXBHOXXEN-JPTLTNPLSA-N,BRD9,EOL-1,18.0
4,22,Q9H8M2,BRD9,VHL,,VZ185,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,8.0,,Degradation of BRD9 in EOL-1/A-204 cells after...,...,14,3,19,180.69,C53H67FN8O8S,InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...,ZAGCLFXBHOXXEN-JPTLTNPLSA-N,BRD9,A-204,18.0


Parsed table len: 1205


Extracting DC50/Dmax info:   0%|          | 0/1892 [00:00<?, ?it/s]

Unnamed: 0,Compound ID,Uniprot,Target,E3 Ligase,PDB,Name,Smiles,DC50 (nM),Dmax (%),Assay (DC50/Dmax),...,Hydrogen Bond Acceptor Count,Hydrogen Bond Donor Count,Rotatable Bond Count,Topological Polar Surface Area,Molecular Formula,InChI,InChI Key,Target (Parsed),Cell Type,Treatment Time (h)
0,11,Q9H8M2,BRD9,VHL,,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,560.0,80.0,Degradation of BRD9 in HeLa cells after 4 h tr...,...,16,3,22,199.15,C54H69FN8O10S,InChI=1S/C54H69FN8O10S/c1-34-47(74-33-58-34)35...,MXAKQOVZPDLCDK-UDVNCTHFSA-N,BRD9,HeLa,4.0
1,22,Q9H8M2,BRD9,VHL,,VZ185,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,1.76,95.0,Degradation of BRD9 in RI-1 cells after 8 h tr...,...,14,3,19,180.69,C53H67FN8O8S,InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...,ZAGCLFXBHOXXEN-JPTLTNPLSA-N,BRD9,RI-1,8.0
2,22,Q9H8M2,BRD9,VHL,,VZ185,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,4.0,,Degradation of HiBiT-BRD9 in HEK293 cells afte...,...,14,3,19,180.69,C53H67FN8O8S,InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...,ZAGCLFXBHOXXEN-JPTLTNPLSA-N,HiBiT-BRD9,HEK293,24.0
3,22,Q9H8M2,BRD9,VHL,,VZ185,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,2.0,,Degradation of BRD9 in EOL-1/A-204 cells after...,...,14,3,19,180.69,C53H67FN8O8S,InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...,ZAGCLFXBHOXXEN-JPTLTNPLSA-N,BRD9,EOL-1,18.0
4,22,Q9H8M2,BRD9,VHL,,VZ185,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,8.0,,Degradation of BRD9 in EOL-1/A-204 cells after...,...,14,3,19,180.69,C53H67FN8O8S,InChI=1S/C53H67FN8O8S/c1-33-46(71-32-57-33)34-...,ZAGCLFXBHOXXEN-JPTLTNPLSA-N,BRD9,A-204,18.0


Parsed table len: 2264


In [14]:
def canonize_smiles(smi):
    return Chem.MolToSmiles(Chem.MolFromSmiles(smi))

dfs['protac-db']['Smiles'] = dfs['protac-db']['Smiles'].apply(canonize_smiles)
dfs['protac-db-v2']['Smiles'] = dfs['protac-db-v2']['Smiles'].apply(canonize_smiles)

In [27]:
# Get the number of entries in both dfs
print(f'Number of entries in protac-db: {len(dfs["protac-db"])}')
print(f'Number of entries in protac-db-v2: {len(dfs["protac-db-v2"])}')
# Get the number of entries shared between the two dfs
predict_cols = ["Smiles", "DC50 (nM)", "Dmax (%)", "E3 Ligase", "Uniprot", "Cell Type"]
print(f'Number of shared entries: {len(dfs["protac-db"].merge(dfs["protac-db-v2"], on=predict_cols, how="inner"))}')
# Get the number of total entries without duplicates
print(f'Number of total entries: {len(dfs["protac-db"].append(dfs["protac-db-v2"]).drop_duplicates(subset=predict_cols))}')

Number of entries in protac-db: 1205
Number of entries in protac-db-v2: 2264
Number of shared entries: 1249
Number of total entries: 2232
