File size: 3,453 Bytes
ffaff91 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
# Data Cleaning Parameters
# TCGA abbreviations for cancer. From https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations
# Maps each TCGA study abbreviation (the key) to its full study name (the value).
# Entries keep the order of the original listing, which is alphabetical by
# study name rather than by abbreviation.
TCGA_CODES = {
    'LAML': 'Acute Myeloid Leukemia',
    'ACC': 'Adrenocortical carcinoma',
    'BLCA': 'Bladder Urothelial Carcinoma',
    'LGG': 'Brain Lower Grade Glioma',
    'BRCA': 'Breast invasive carcinoma',
    'CESC': 'Cervical squamous cell carcinoma and endocervical adenocarcinoma',
    'CHOL': 'Cholangiocarcinoma',
    'LCML': 'Chronic Myelogenous Leukemia',
    'COAD': 'Colon adenocarcinoma',
    'CNTL': 'Controls',
    'ESCA': 'Esophageal carcinoma',
    'FPPP': 'FFPE Pilot Phase II',
    'GBM': 'Glioblastoma multiforme',
    'HNSC': 'Head and Neck squamous cell carcinoma',
    'KICH': 'Kidney Chromophobe',
    'KIRC': 'Kidney renal clear cell carcinoma',
    'KIRP': 'Kidney renal papillary cell carcinoma',
    'LIHC': 'Liver hepatocellular carcinoma',
    'LUAD': 'Lung adenocarcinoma',
    'LUSC': 'Lung squamous cell carcinoma',
    'DLBC': 'Lymphoid Neoplasm Diffuse Large B-cell Lymphoma',
    'MESO': 'Mesothelioma',
    'MISC': 'Miscellaneous',
    'OV': 'Ovarian serous cystadenocarcinoma',
    'PAAD': 'Pancreatic adenocarcinoma',
    'PCPG': 'Pheochromocytoma and Paraganglioma',
    'PRAD': 'Prostate adenocarcinoma',
    'READ': 'Rectum adenocarcinoma',
    'SARC': 'Sarcoma',
    'SKCM': 'Skin Cutaneous Melanoma',
    'STAD': 'Stomach adenocarcinoma',
    'TGCT': 'Testicular Germ Cell Tumors',
    'THYM': 'Thymoma',
    'THCA': 'Thyroid carcinoma',
    'UCS': 'Uterine Carcinosarcoma',
    'UCEC': 'Uterine Corpus Endometrial Carcinoma',
    'UVM': 'Uveal Melanoma',
}
# Maps a second set of cancer-type abbreviations (the key) to display names
# (the value), listed alphabetically by abbreviation.
# NOTE(review): the source that "FODB" refers to is not identified in this
# file - confirm and cite it here. Some keys also appear in TCGA_CODES but
# carry a different name in this table (e.g. 'ACC', 'LGG').
FODB_CODES = {
    'ACC': 'Adenoid cystic carcinoma',
    'ALL': 'Acute Lymphoid Leukemia',
    'AML': 'Acute Myeloid Leukemia',
    'BALL': 'B-cell acute lymphoblastic leukemia',
    'BLCA': 'Bladder Urothelial Carcinoma',
    'BRCA': 'Breast invasive carcinoma',
    'CESC': 'Cervical squamous cell carcinoma and endocervical adenocarcinoma',
    'CHOL': 'Cholangiocarcinoma',
    'EPD': 'Ependymoma',
    'HGG': 'High-grade glioma',
    'HNSC': 'Head and Neck squamous cell carcinoma',
    'KIRC': 'Kidney renal clear cell carcinoma',
    'LGG': 'Low-grade glioma',
    'LUAD': 'Lung adenocarcinoma',
    'LUSC': 'Lung squamous cell carcinoma',
    'MEL': 'Melanoma',
    'MESO': 'Mesothelioma',
    'NBL': 'Neuroblastoma',
    'OS': 'Osteosarcoma',
    'OV': 'Ovarian serous cystadenocarcinoma',
    'PCPG': 'Pheochromocytoma and Paraganglioma',
    'PRAD': 'Prostate adenocarcinoma',
    'READ': 'Rectum adenocarcinoma',
    'RHB': 'Rhabdomyosarcoma',
    'SARC': 'Sarcoma',
    'STAD': 'Stomach adenocarcinoma',
    'TALL': 'T-cell acute lymphoblastic leukemia',
    'THYM': 'Thymoma',
    'UCEC': 'Uterine Corpus Endometrial Carcinoma',
    'UCS': 'Uterine Carcinosarcoma',
    'UVM': 'Uveal Melanoma',
    'WLM': 'Wilms tumor',
}
# Set of 20 single uppercase letters; they coincide with the one-letter codes
# of the 20 standard amino acids (A R N D C E Q G H I L K M F P S T W Y V).
# Iterating the string yields one set member per character.
VALID_AAS = set('ARNDCEQGHILKMFPSTWYV')
# Set of single-character separators: punctuation plus whitespace.
# NOTE(review): how callers consume this set is not visible in this file.
# The stray trailing "|" that followed the closing brace was removed; it left
# a binary-or with no right operand, which is a syntax error.
DELIMITERS = {
    ',',
    ';',
    '|',
    '\t',  # tab
    ' ',   # space
    ':',
    '-',
    '/',
    '\\',  # one literal backslash
    '\n',  # newline
}