# Data Cleaning Parameters

# TCGA study abbreviations mapped to their full study names.
# From https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations
TCGA_CODES = {
    'LAML': 'Acute Myeloid Leukemia',
    'ACC': 'Adrenocortical carcinoma',
    'BLCA': 'Bladder Urothelial Carcinoma',
    'LGG': 'Brain Lower Grade Glioma',
    'BRCA': 'Breast invasive carcinoma',
    'CESC': 'Cervical squamous cell carcinoma and endocervical adenocarcinoma',
    'CHOL': 'Cholangiocarcinoma',
    'LCML': 'Chronic Myelogenous Leukemia',
    'COAD': 'Colon adenocarcinoma',
    'CNTL': 'Controls',
    'ESCA': 'Esophageal carcinoma',
    'FPPP': 'FFPE Pilot Phase II',
    'GBM': 'Glioblastoma multiforme',
    'HNSC': 'Head and Neck squamous cell carcinoma',
    'KICH': 'Kidney Chromophobe',
    'KIRC': 'Kidney renal clear cell carcinoma',
    'KIRP': 'Kidney renal papillary cell carcinoma',
    'LIHC': 'Liver hepatocellular carcinoma',
    'LUAD': 'Lung adenocarcinoma',
    'LUSC': 'Lung squamous cell carcinoma',
    'DLBC': 'Lymphoid Neoplasm Diffuse Large B-cell Lymphoma',
    'MESO': 'Mesothelioma',
    'MISC': 'Miscellaneous',
    'OV': 'Ovarian serous cystadenocarcinoma',
    'PAAD': 'Pancreatic adenocarcinoma',
    'PCPG': 'Pheochromocytoma and Paraganglioma',
    'PRAD': 'Prostate adenocarcinoma',
    'READ': 'Rectum adenocarcinoma',
    'SARC': 'Sarcoma',
    'SKCM': 'Skin Cutaneous Melanoma',
    'STAD': 'Stomach adenocarcinoma',
    'TGCT': 'Testicular Germ Cell Tumors',
    'THYM': 'Thymoma',
    'THCA': 'Thyroid carcinoma',
    'UCS': 'Uterine Carcinosarcoma',
    'UCEC': 'Uterine Corpus Endometrial Carcinoma',
    'UVM': 'Uveal Melanoma',
}

# Cancer-type abbreviations for the "FODB" data source, mapped to full names.
# NOTE(review): the origin of this table is not recorded here -- confirm and
# add a source link.
# A code shared with TCGA_CODES does not always carry the same meaning: 'ACC'
# is 'Adrenocortical carcinoma' there but 'Adenoid cystic carcinoma' here, and
# 'LGG' is worded differently. Do not merge the two tables blindly.
FODB_CODES = {
    'ACC': 'Adenoid cystic carcinoma',
    'ALL': 'Acute Lymphoid Leukemia',
    'AML': 'Acute Myeloid Leukemia',
    'BALL': 'B-cell acute lymphoblastic leukemia',
    'BLCA': 'Bladder Urothelial Carcinoma',
    'BRCA': 'Breast invasive carcinoma',
    'CESC': 'Cervical squamous cell carcinoma and endocervical adenocarcinoma',
    'CHOL': 'Cholangiocarcinoma',
    'EPD': 'Ependymoma',
    'HGG': 'High-grade glioma',
    'HNSC': 'Head and Neck squamous cell carcinoma',
    'KIRC': 'Kidney renal clear cell carcinoma',
    'LGG': 'Low-grade glioma',
    'LUAD': 'Lung adenocarcinoma',
    'LUSC': 'Lung squamous cell carcinoma',
    'MEL': 'Melanoma',
    'MESO': 'Mesothelioma',
    'NBL': 'Neuroblastoma',
    'OS': 'Osteosarcoma',
    'OV': 'Ovarian serous cystadenocarcinoma',
    'PCPG': 'Pheochromocytoma and Paraganglioma',
    'PRAD': 'Prostate adenocarcinoma',
    'READ': 'Rectum adenocarcinoma',
    'RHB': 'Rhabdomyosarcoma',
    'SARC': 'Sarcoma',
    'STAD': 'Stomach adenocarcinoma',
    'TALL': 'T-cell acute lymphoblastic leukemia',
    'THYM': 'Thymoma',
    'UCEC': 'Uterine Corpus Endometrial Carcinoma',
    'UCS': 'Uterine Carcinosarcoma',
    'UVM': 'Uveal Melanoma',
    'WLM': 'Wilms tumor',
}

# Upper-case one-letter codes accepted as amino acids: the 20 standard
# residues only. Ambiguity or non-standard codes such as 'B', 'Z', 'X', 'U'
# and 'O' are not members of this set.
VALID_AAS = {
    'A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I',
    'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V',
}

# Single characters treated as field separators, including whitespace (tab,
# space, newline) and a literal backslash.
# NOTE(review): presumably used to split multi-valued fields during
# cleaning -- verify against the callers.
DELIMITERS = {',', ';', '|', '\t', ' ', ':', '-', '/', '\\', '\n'}