Fill-Mask
Transformers
PyTorch
esm
Inference Endpoints
FusOn-pLM / fuson_plm /utils /constants.py
svincoff's picture
adding utility files used throughout FusOn-pLM training and benchmarking
ffaff91
raw
history blame
3.45 kB
# Data Cleaning Parameters
# TCGA study abbreviations mapped to their full study names (mostly cancer types, plus the CNTL, FPPP and MISC study codes). From https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations
# Lookup table: key is the TCGA study abbreviation, value is the full study name.
# Entries are kept in their original listing order (dicts preserve insertion order).
TCGA_CODES = {
    "LAML": "Acute Myeloid Leukemia",
    "ACC": "Adrenocortical carcinoma",
    "BLCA": "Bladder Urothelial Carcinoma",
    "LGG": "Brain Lower Grade Glioma",
    "BRCA": "Breast invasive carcinoma",
    "CESC": "Cervical squamous cell carcinoma and endocervical adenocarcinoma",
    "CHOL": "Cholangiocarcinoma",
    "LCML": "Chronic Myelogenous Leukemia",
    "COAD": "Colon adenocarcinoma",
    "CNTL": "Controls",
    "ESCA": "Esophageal carcinoma",
    "FPPP": "FFPE Pilot Phase II",
    "GBM": "Glioblastoma multiforme",
    "HNSC": "Head and Neck squamous cell carcinoma",
    "KICH": "Kidney Chromophobe",
    "KIRC": "Kidney renal clear cell carcinoma",
    "KIRP": "Kidney renal papillary cell carcinoma",
    "LIHC": "Liver hepatocellular carcinoma",
    "LUAD": "Lung adenocarcinoma",
    "LUSC": "Lung squamous cell carcinoma",
    "DLBC": "Lymphoid Neoplasm Diffuse Large B-cell Lymphoma",
    "MESO": "Mesothelioma",
    "MISC": "Miscellaneous",
    "OV": "Ovarian serous cystadenocarcinoma",
    "PAAD": "Pancreatic adenocarcinoma",
    "PCPG": "Pheochromocytoma and Paraganglioma",
    "PRAD": "Prostate adenocarcinoma",
    "READ": "Rectum adenocarcinoma",
    "SARC": "Sarcoma",
    "SKCM": "Skin Cutaneous Melanoma",
    "STAD": "Stomach adenocarcinoma",
    "TGCT": "Testicular Germ Cell Tumors",
    "THYM": "Thymoma",
    "THCA": "Thyroid carcinoma",
    "UCS": "Uterine Carcinosarcoma",
    "UCEC": "Uterine Corpus Endometrial Carcinoma",
    "UVM": "Uveal Melanoma",
}
# Lookup table: key is a cancer-type abbreviation, value is its full name.
# NOTE(review): presumably the abbreviations used by the FOdb fusion database - confirm the source.
# Some keys also appear in TCGA_CODES with a different meaning (e.g. "ACC", "LGG").
FODB_CODES = {
    "ACC": "Adenoid cystic carcinoma",
    "ALL": "Acute Lymphoid Leukemia",
    "AML": "Acute Myeloid Leukemia",
    "BALL": "B-cell acute lymphoblastic leukemia",
    "BLCA": "Bladder Urothelial Carcinoma",
    "BRCA": "Breast invasive carcinoma",
    "CESC": "Cervical squamous cell carcinoma and endocervical adenocarcinoma",
    "CHOL": "Cholangiocarcinoma",
    "EPD": "Ependymoma",
    "HGG": "High-grade glioma",
    "HNSC": "Head and Neck squamous cell carcinoma",
    "KIRC": "Kidney renal clear cell carcinoma",
    "LGG": "Low-grade glioma",
    "LUAD": "Lung adenocarcinoma",
    "LUSC": "Lung squamous cell carcinoma",
    "MEL": "Melanoma",
    "MESO": "Mesothelioma",
    "NBL": "Neuroblastoma",
    "OS": "Osteosarcoma",
    "OV": "Ovarian serous cystadenocarcinoma",
    "PCPG": "Pheochromocytoma and Paraganglioma",
    "PRAD": "Prostate adenocarcinoma",
    "READ": "Rectum adenocarcinoma",
    "RHB": "Rhabdomyosarcoma",
    "SARC": "Sarcoma",
    "STAD": "Stomach adenocarcinoma",
    "TALL": "T-cell acute lymphoblastic leukemia",
    "THYM": "Thymoma",
    "UCEC": "Uterine Corpus Endometrial Carcinoma",
    "UCS": "Uterine Carcinosarcoma",
    "UVM": "Uveal Melanoma",
    "WLM": "Wilms tumor",
}
# One-letter codes of the 20 standard amino acids, as a set of single-character strings.
# Ambiguity/extended codes such as B, J, O, U, X and Z are not included.
VALID_AAS = set("ARNDCEQGHILKMFPSTWYV")
# Single-character separators, as a set of one-character strings.
# NOTE(review): presumably used to detect delimiters in raw text fields - confirm against callers.
DELIMITERS = {
    ",", ";", "|", ":", "-", "/", "\\",  # punctuation (the last one is a single backslash)
    " ", "\t", "\n",  # whitespace: space, tab, newline
}