# Double Check Cell Knockouts

Double check that there are no E3 ligase knockouts, like CRBN-/- cell lines, in the dataset.

NOTE: This notebook requires the file `cellosaurus.txt`, downloaded from the Cellosaurus database, to be stored in the `data` directory.

In [1]:
import pandas as pd
import pickle

In [4]:
# Read the pickled mapping from cell-line synonym to cell-line identifier.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('../data/processed/cell2identifier.pkl', mode='rb') as pickle_file:
    cell2identifier = pickle.load(pickle_file)

cell2identifier

{'MOLT-4': 'MOLT-4',
 'H1975': 'H1975/WR',
 'PBMCs': 'PH1-PBMCs-hiPSC4F1',
 'THP-1': 'THP-1',
 'K562': 'EGFP-K562',
 'MCF-7': 'MCF-7',
 'Hs578t': 'Hs578T-Dox',
 'THP': 'THP-1',
 'HCC827': 'HCC827',
 'MOLM14': 'MOLM-14',
 'HeLa': 'HeLa',
 'HCT116': 'HCT116-53BPI(+/-)',
 'Human THP-1 monocytes': 'THP-1',
 'MCF-7 breast cancer cells': 'MCF-7',
 'NAMALWA': 'Namalwa',
 'K562 CML': 'K562/GM-CSF',
 'MOLM-13': 'MOLM-13',
 'MOLT4': 'MOLT4/P',
 'OPM2': 'OPM-2',
 'MV-4-11': 'MV4-11',
 'MDA-MB-468': 'MDA-MB-468',
 'PC-3': 'PC-3',
 'Jurkat': 'Jurkat',
 'LNCaP': 'LNCaP',
 'MCF7': 'MCF7 AREc32',
 '22Rv1': '22Rv1',
 'AML cells': 'OCI-AML-1',
 'H661': 'NCI-H661',
 'Hela': 'HeLa',
 'VCaP': 'VCaP',
 'THP1': 'PSC-THP1',
 'PC3': 'HNC PC3',
 'hela': 'HeLa',
 'Karpas422': 'Karpas-422',
 'U251': 'U251-TR3',
 'MM1S': 'MM1.S',
 'Ramos': 'Ramos',
 'MDA-MB-231': 'MDA-MB-231',
 'LnCap': 'LNCaP',
 'HEK293': 'HEK293',
 'MiaPaCa2': 'MIA PaCa-2',
 'SR': 'SR',
 'OVCAR8': 'OVCAR-8',
 'Hella': 'IH-1',
 'RS4;11': 'RS4;11'

In [5]:
# Load the PROTAC degradation dataset and gather the distinct values of its
# 'Cell Line Identifier' column (the result may contain NaN for rows without one).
protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
cell_lines = protac_df.loc[:, 'Cell Line Identifier'].unique()
cell_lines

array(['MOLT-4', 'H1975/WR', 'PH1-PBMCs-hiPSC4F1', 'THP-1', 'EGFP-K562',
       'MCF-7', 'Hs578T-Dox', 'HCC827', 'MOLM-14', nan, 'HeLa',
       'HCT116-53BPI(+/-)', 'Namalwa', 'K562/GM-CSF', 'MOLM-13',
       'MOLT4/P', 'OPM-2', 'MV4-11', 'MDA-MB-468', 'PC-3', 'Jurkat',
       'LNCaP', 'MCF7 AREc32', '22Rv1', 'OCI-AML-1', 'NCI-H661', 'VCaP',
       'PSC-THP1', 'HNC PC3', 'Karpas-422', 'U251-TR3', 'MM1.S', 'Ramos',
       'MDA-MB-231', 'HEK293', 'MIA PaCa-2', 'SR', 'OVCAR-8', 'IH-1',
       'RS4;11', 'HEK293T', 'A375-C5', 'KYSE-520', 'NCI-H2030', 'SW1573',
       'SU-DHL-1', 'PA1', 'HT1080 EGFP', 'HepG2 hALR', 'MyLa 1929',
       'A549 Cas9', 'Huh7 IFITM2-/-', 'Ri-1', 'EoL-1', 'A-204',
       'T47D Ad12', 'VCaP AR-V7/pHag', 'NCI-H838', 'NCI-H2228',
       'HBL-1 [Human AIDS-related non-Hodgkin lymphoma]', 'Mino',
       'NCI-H3255', 'XLA-07', 'LNCaP95 clone A7', 'CA46', 'SUM149-Luc',
       'HD-MB03', 'MOLM-16', '293T FOXP3', 'NCI-H3122', 'Karpas-299',
       'Kelly', 'A431 siYAP', 'HCC

Each entry in Cellosaurus looks like the following:

```
ID   #W7079
AC   CVCL_E549
SY   #W7079 REM; REMUS; W7079
DR   dbMHC; 48440
DR   ECACC; 94022552
DR   IHW; IHW09223
DR   IPD-IMGT/HLA; 11548
DR   Wikidata; Q54480892
RX   PubMed=30844424;
CC   Part of: 12th International Histocompatibility Workshop (12IHW) cell line panel.
CC   Population: Caucasian.
CC   HLA typing: A*02:01,02:06; B*40:01:02:01,67:01:01; C*03,07; DPB1*04:01,05:01 (IPD-IMGT/HLA=11548).
CC   Transformant: NCBI_TaxID; 10376; Epstein-Barr virus (EBV).
CC   Derived from site: In situ; Peripheral blood; UBERON=UBERON_0000178.
CC   Cell type: B-cell; CL=CL_0000236.
OX   NCBI_TaxID=9606; ! Homo sapiens (Human)
SX   Sex unspecified
AG   Age unspecified
CA   Transformed cell line
DT   Created: 22-10-12; Last updated: 30-01-24; Version: 17
//
```

In [None]:
# Load the whole Cellosaurus flat file into a single string.
#
# The other data files in this notebook are read relative to '../data', so the
# original 'data/cellosaurus.txt' location is tried first (unchanged behaviour
# when it exists) and '../data/cellosaurus.txt' is used as a fallback.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale (assumes the Cellosaurus dump is UTF-8 — TODO confirm).
CELLOSAURUS_CANDIDATE_PATHS = ('data/cellosaurus.txt', '../data/cellosaurus.txt')

for cellosaurus_path in CELLOSAURUS_CANDIDATE_PATHS:
    try:
        with open(cellosaurus_path, 'r', encoding='utf-8') as f:
            cellosaurus = f.read()
    except FileNotFoundError:
        continue
    break
else:
    raise FileNotFoundError(
        'cellosaurus.txt not found; looked in: '
        + ', '.join(CELLOSAURUS_CANDIDATE_PATHS)
    )

In [14]:
# Index every Cellosaurus record by the exact text of its 'ID' line.
# Records are separated by a line containing only '//'. The file is split once
# here instead of once per cell line, and the lookup below is an exact match:
# a substring test such as "'ID   SR' in entry" would also hit any record whose
# identifier merely starts with 'SR' and could return the wrong record.
ID_PREFIX = 'ID   '
cellosaurus_by_id = {}
for entry in cellosaurus.split('//\n'):
    for line in entry.splitlines():
        if line.startswith(ID_PREFIX):
            # Keep the first record seen for an identifier.
            cellosaurus_by_id.setdefault(line[len(ID_PREFIX):].strip(), entry)
            break

# Map each dataset cell line to its Cellosaurus record (None when there is no
# record with exactly that identifier) and print the records that mention
# 'E3' or 'ligase' so they can be inspected manually.
cell2cellosaurus = {}
for cell_line in cell_lines:
    cell2cellosaurus[cell_line] = None
    if pd.isna(cell_line):
        # Rows without a cell line identifier cannot be looked up.
        continue

    entry = cellosaurus_by_id.get(str(cell_line).strip())
    if entry is None:
        continue
    cell2cellosaurus[cell_line] = entry

    if 'E3' in entry or 'ligase' in entry.lower():
        print(cell_line)
        print(entry)
        print('-' * 80)

In [22]:
# Print the Cellosaurus record of every cell line whose name or record contains
# a zygosity marker ('-/-', '+/-' or '+/+'), which hints at an engineered
# knockout / knock-in line.
ZYGOSITY_MARKERS = ('-/-', '+/-', '+/+')

for cell_line, cellosaurus_entry in cell2cellosaurus.items():
    if pd.isna(cell_line):
        continue
    # The record is None when no Cellosaurus entry was found for the cell line;
    # search an empty string in that case instead of raising a TypeError.
    entry_text = cellosaurus_entry if cellosaurus_entry is not None else ''
    if any(marker in cell_line or marker in entry_text for marker in ZYGOSITY_MARKERS):
        if cellosaurus_entry is None:
            print(f'{cell_line} (no Cellosaurus entry found)')
        else:
            print(cellosaurus_entry)
        print('-' * 80)


ID   HCT116-53BPI(+/-)
AC   CVCL_1R00
DR   cancercelllines; CVCL_1R00
DR   RCB; RCB2996
DR   Wikidata; Q54882028
CC   Population: Caucasian.
CC   Knockout cell: Method=Homologous recombination; HGNC; 11999; TP53BP1 (Note=1 of 2 alleles).
CC   Sequence variation: Mutation; HGNC; 173; ACVR2A; Simple; p.Lys437Argfs*5 (c.1310delA); dbSNP=rs764719749; Zygosity=Homozygous (from parent cell line).
CC   Sequence variation: Mutation; HGNC; 1101; BRCA2; Simple; p.Ile2675Aspfs*6 (c.8021dupA) (c.8021_8022insA); ClinVar=VCV000267050; Zygosity=Heterozygous (from parent cell line).
CC   Sequence variation: Mutation; HGNC; 1787; CDKN2A; Simple; p.Arg24Serfs*20 (c.68dupG) (c.68_69insG) (p.G23fs); Zygosity=Heterozygous (from parent cell line).
CC   Sequence variation: Mutation; HGNC; 2514; CTNNB1; Simple; p.Ser45del (c.133_135delTCT); ClinVar=VCV000017576; Zygosity=Heterozygous (from parent cell line).
CC   Sequence variation: Mutation; HGNC; 3373; EP300; Simple; p.Met1470Cysfs*22 (c.4408delA); Zygosity

In [23]:
# Print every (synonym, identifier) pair of the mapping in which either side
# contains a zygosity marker ('-/-', '+/-' or '+/+').
for cell_synonym, cell_id in cell2identifier.items():
    # Guard on this loop's own variables; the previous check read `cell_line`,
    # a leftover variable from an earlier cell, so it never looked at the pair
    # being processed.
    if pd.isna(cell_synonym) or pd.isna(cell_id):
        continue
    if any(marker in cell_synonym or marker in cell_id for marker in ('-/-', '+/-', '+/+')):
        print(cell_synonym)
        print(cell_id)
        print('-' * 80)

HCT116
HCT116-53BPI(+/-)
--------------------------------------------------------------------------------
Huh7
Huh7 IFITM2-/-
--------------------------------------------------------------------------------
