"""
Synthetic clinical oncology data generator for OncoAgent.
Generates OncoCoT-format samples for pipeline validation.
All data is 100% synthetic — zero real patient information.
"""
|
|
| import json |
| import os |
| import random |
| from typing import List, Dict |
|
|
| |
# Seed the global RNG at import time so any randomised behaviour is reproducible.
# NOTE(review): nothing in the visible part of this module draws from `random`;
# the call is kept because it is a module-level side effect importers may rely on.
random.seed(42)


# Hand-written synthetic cases. Every entry is a dict with exactly three string
# fields: "history" (presentation), "reasoning" (numbered chain of thought) and
# "conclusion" (assessment and recommendation).
SYNTHETIC_ONCOCOT_SAMPLES: List[Dict[str, str]] = [
    # --- High-suspicion / staged malignancy scenarios ---
    {
        "history": (
            "62-year-old female presents with persistent dry cough for 3 months, "
            "unintentional weight loss of 8 kg, and hemoptysis. Chest CT reveals a "
            "2.5 cm spiculated mass in the left upper lobe with associated pleural "
            "thickening and enlarged mediastinal lymph nodes measuring 1.2 cm. "
            "Patient is a former smoker with 30 pack-year history."
        ),
        "reasoning": (
            "1. Identify lesion characteristics: 2.5 cm mass classifies as T1c/T2a. "
            "2. Morphology: 'Spiculated' margins are highly indicative of malignancy "
            "(positive predictive value >90%). "
            "3. Nodal involvement: Mediastinal lymph nodes at 1.2 cm suggest N2 status. "
            "4. Clinical correlation: Hemoptysis + weight loss + smoking history "
            "significantly increase pre-test probability. "
            "5. Staging synthesis: T2aN2M0 → Stage IIIA per AJCC 8th edition."
        ),
        "conclusion": (
            "High suspicion for non-small cell lung cancer (NSCLC), likely Stage IIIA. "
            "Recommend urgent tissue biopsy (CT-guided or bronchoscopy) and PET-CT "
            "for comprehensive staging. Multidisciplinary tumor board consultation required."
        ),
    },
    {
        "history": (
            "55-year-old male with a palpable 3.5 cm mass in the right breast, "
            "skin dimpling, and axillary lymphadenopathy on the ipsilateral side. "
            "Mammography shows an irregular dense mass with microcalcifications. "
            "Family history positive for BRCA2 mutation in first-degree relative."
        ),
        "reasoning": (
            "1. Mass characteristics: 3.5 cm irregular mass with microcalcifications "
            "is highly suspicious (BI-RADS 5). "
            "2. Clinical signs: Skin dimpling indicates possible Cooper ligament involvement. "
            "3. Nodal status: Ipsilateral axillary lymphadenopathy suggests N1 involvement. "
            "4. Risk factors: Male breast cancer accounts for <1% of cases, but BRCA2 "
            "significantly increases risk (6-8% lifetime). "
            "5. Staging estimate: T2N1M0 → Stage IIB."
        ),
        "conclusion": (
            "High suspicion for male breast carcinoma, likely Stage IIB. "
            "Recommend core needle biopsy with receptor testing (ER/PR/HER2), "
            "BRCA genetic testing, and staging workup including chest/abdominal CT."
        ),
    },
    {
        "history": (
            "70-year-old male presents with progressive difficulty swallowing solids "
            "over 4 months, weight loss of 12 kg, and retrosternal pain. Upper "
            "endoscopy reveals a 4 cm circumferential mass in the distal esophagus "
            "with mucosal ulceration. CT shows thickened esophageal wall and "
            "suspicious celiac lymph nodes."
        ),
        "reasoning": (
            "1. Lesion: 4 cm circumferential mass with ulceration is T3 (adventitial invasion likely). "
            "2. Location: Distal esophagus suggests adenocarcinoma (Barrett's association). "
            "3. Nodal disease: Celiac lymph nodes represent M1 lymph node disease per AJCC. "
            "4. Symptoms: Progressive dysphagia + significant weight loss indicate advanced disease. "
            "5. Staging: T3N1M1(LYM) → Stage IVA."
        ),
        "conclusion": (
            "High suspicion for esophageal adenocarcinoma, Stage IVA. "
            "Recommend endoscopic biopsy with HER2 testing, PET-CT for complete staging, "
            "and referral for palliative chemoradiation consideration."
        ),
    },
    {
        "history": (
            "48-year-old female with recently discovered hepatic masses on "
            "ultrasound performed for right upper quadrant pain. CT reveals "
            "multiple bilobar liver lesions (largest 6 cm) with arterial enhancement "
            "and washout. AFP level is 850 ng/mL. History of hepatitis C cirrhosis."
        ),
        "reasoning": (
            "1. Imaging: Arterial enhancement with washout is pathognomonic for HCC (LI-RADS 5). "
            "2. Biomarker: AFP >400 ng/mL is highly specific for hepatocellular carcinoma. "
            "3. Risk factor: HCV cirrhosis is the leading cause of HCC. "
            "4. Extent: Bilobar disease precludes surgical resection. "
            "5. Staging: Beyond Milan criteria (single ≤5cm or ≤3 lesions each ≤3cm) → BCLC Stage C."
        ),
        "conclusion": (
            "Hepatocellular carcinoma confirmed by imaging criteria (LI-RADS 5) and AFP elevation. "
            "BCLC Stage C. Recommend systemic therapy (atezolizumab + bevacizumab per NCCN) "
            "and liver transplant evaluation if disease responds."
        ),
    },
    {
        "history": (
            "58-year-old male with iron-deficiency anemia, change in bowel habits "
            "for 6 months, and a 2 cm mass found in the sigmoid colon on colonoscopy. "
            "Biopsy confirms moderately differentiated adenocarcinoma. CT abdomen shows "
            "3 suspicious pericolonic lymph nodes and 2 small liver lesions."
        ),
        "reasoning": (
            "1. Primary tumor: 2 cm sigmoid adenocarcinoma, moderately differentiated. "
            "2. Local spread: Pericolonic lymph nodes suggest N1 disease. "
            "3. Distant metastasis: Liver lesions are concerning for M1a hepatic metastases. "
            "4. Presentation: Iron-deficiency anemia is classic for right-sided colon cancer "
            "but can occur in sigmoid lesions with chronic occult bleeding. "
            "5. Staging: T3N1M1a → Stage IVA (AJCC 8th edition)."
        ),
        "conclusion": (
            "Sigmoid colon adenocarcinoma, Stage IVA with hepatic metastases. "
            "Recommend molecular profiling (MSI, KRAS/NRAS/BRAF), "
            "liver MRI for surgical resectability assessment, and FOLFOX/FOLFIRI-based "
            "systemic therapy per NCCN guidelines."
        ),
    },
    # --- Indeterminate / intermediate-risk scenarios ---
    {
        "history": (
            "45-year-old female with a 1.5 cm solid thyroid nodule found incidentally "
            "on carotid ultrasound. Fine needle aspiration shows Bethesda IV "
            "(follicular neoplasm). No cervical lymphadenopathy. TSH is normal."
        ),
        "reasoning": (
            "1. Nodule: 1.5 cm solid nodule with Bethesda IV cytology. "
            "2. Risk of malignancy: Bethesda IV carries 15-30% cancer risk. "
            "3. Favorable factors: No lymphadenopathy, normal TSH. "
            "4. Cannot distinguish follicular adenoma from carcinoma on cytology alone. "
            "5. Assessment: Intermediate risk requiring diagnostic surgery."
        ),
        "conclusion": (
            "Indeterminate thyroid nodule (Bethesda IV) with moderate malignancy risk. "
            "Recommend molecular testing (Afirma or ThyroSeq) if available. "
            "If molecular testing is inconclusive, diagnostic lobectomy is indicated."
        ),
    },
    {
        "history": (
            "60-year-old male with a PSA level of 7.2 ng/mL on routine screening. "
            "Digital rectal exam reveals a firm nodule on the right lobe. "
            "MRI prostate shows a PI-RADS 4 lesion in the peripheral zone, "
            "15 mm in greatest dimension. No extraprostatic extension."
        ),
        "reasoning": (
            "1. PSA: 7.2 ng/mL is elevated (normal <4.0), PSA density should be calculated. "
            "2. DRE: Palpable nodule correlates with imaging finding. "
            "3. MRI: PI-RADS 4 has ~60-70% probability of clinically significant cancer. "
            "4. Confined disease: No extraprostatic extension is favorable. "
            "5. Assessment: High probability of Gleason 3+4 or higher prostate cancer."
        ),
        "conclusion": (
            "Probable clinically significant prostate cancer. "
            "Recommend MRI-targeted fusion biopsy (minimum 12 systematic + 2-3 targeted cores). "
            "If positive, staging with PSMA PET-CT per NCCN guidelines."
        ),
    },
    {
        "history": (
            "52-year-old female with a 2 cm pancreatic cystic lesion found on CT "
            "performed for back pain. MRI with MRCP shows a branch-duct IPMN in the "
            "pancreatic body with a mural nodule measuring 5 mm. CA 19-9 is 45 U/mL. "
            "No main duct dilation."
        ),
        "reasoning": (
            "1. Cyst type: Branch-duct IPMN is the most common pancreatic cystic neoplasm. "
            "2. Worrisome feature: Mural nodule (5 mm) is a 'worrisome feature' per Fukuoka criteria. "
            "3. Size: 2 cm is below the high-risk threshold of 3 cm. "
            "4. Biomarker: CA 19-9 of 45 is borderline (normal <37). "
            "5. Assessment: Moderate risk — warrants EUS for further characterization."
        ),
        "conclusion": (
            "Branch-duct IPMN with worrisome features (mural nodule). "
            "Recommend endoscopic ultrasound (EUS) with FNA for cytology and cyst fluid analysis. "
            "If high-grade dysplasia found, surgical resection is indicated."
        ),
    },
    # --- Low-risk / probably benign scenarios ---
    {
        "history": (
            "35-year-old female with a 1 cm well-circumscribed, oval, hypoechoic "
            "breast mass found on screening ultrasound. BI-RADS 3. No family history "
            "of breast cancer. No skin changes or axillary lymphadenopathy."
        ),
        "reasoning": (
            "1. Mass morphology: Well-circumscribed, oval shape is characteristic of fibroadenoma. "
            "2. BI-RADS 3: Probably benign (<2% malignancy risk). "
            "3. Age: 35 years old — breast cancer is rare at this age without risk factors. "
            "4. No concerning features: No skin changes, no lymphadenopathy. "
            "5. Assessment: Low risk, likely fibroadenoma."
        ),
        "conclusion": (
            "Probably benign breast mass (BI-RADS 3), most likely fibroadenoma. "
            "Recommend short-interval follow-up ultrasound at 6 months. "
            "If stable at 2 years, reclassify as BI-RADS 2 (benign)."
        ),
    },
    {
        "history": (
            "28-year-old male with a small, well-circumscribed 8 mm pulmonary nodule "
            "found incidentally on chest X-ray performed for pre-employment screening. "
            "Non-smoker, no respiratory symptoms, no weight loss. CT confirms a smooth, "
            "round, calcified nodule in the right middle lobe."
        ),
        "reasoning": (
            "1. Nodule: 8 mm, smooth margins, calcified → benign morphology. "
            "2. Calcification pattern: Diffuse calcification is highly associated with granuloma. "
            "3. Risk factors: Non-smoker, young age, asymptomatic. "
            "4. Fleischner criteria: Calcified nodules are generally benign and do not "
            "require follow-up imaging. "
            "5. Assessment: Very low risk, most likely granuloma (infectious etiology)."
        ),
        "conclusion": (
            "Benign calcified pulmonary granuloma. No malignancy concern. "
            "No further imaging or follow-up required per Fleischner Society guidelines. "
            "Reassure patient."
        ),
    },
]
|
|
|
|
def generate_oncocot_samples(output_path: str = "data/samples/oncocot_synthetic.json") -> str:
    """
    Writes the synthetic OncoCoT samples to a JSON file.

    The parent directory of ``output_path`` is created when it does not exist.
    A bare file name (no directory component) is written to the current
    working directory.

    Args:
        output_path: Path to the output JSON file.

    Returns:
        The absolute path to the generated file.
    """
    # os.path.dirname() yields "" for a bare file name and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when one is named.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
        json.dump(SYNTHETIC_ONCOCOT_SAMPLES, f, ensure_ascii=False, indent=2)

    print(f"✅ Generated {len(SYNTHETIC_ONCOCOT_SAMPLES)} synthetic OncoCoT samples → {output_path}")
    return os.path.abspath(output_path)
|
|
|
|
def generate_pmc_patients_format(
    output_path: str = "data/samples/pmc_patients_synthetic.json",
) -> str:
    """
    Converts the OncoCoT samples into a PMC-Patients-compatible format.

    Each output record has the keys "patient", "medical_history", "reasoning"
    and "conclusion". Both "patient" and "medical_history" carry the sample's
    "history" text. The parent directory of ``output_path`` is created when it
    does not exist; a bare file name is written to the current working
    directory.

    Args:
        output_path: Path to the output JSON file.

    Returns:
        The absolute path to the generated file.
    """
    pmc_samples: List[Dict[str, str]] = [
        {
            "patient": sample["history"],
            "medical_history": sample["history"],
            "reasoning": sample["reasoning"],
            "conclusion": sample["conclusion"],
        }
        for sample in SYNTHETIC_ONCOCOT_SAMPLES
    ]

    # os.path.dirname() yields "" for a bare file name and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when one is named.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
        json.dump(pmc_samples, f, ensure_ascii=False, indent=2)

    print(f"✅ Generated {len(pmc_samples)} PMC-Patients format samples → {output_path}")
    return os.path.abspath(output_path)
|
|
|
|
# Script entry point: write both synthetic datasets to their default paths,
# then report completion.
if __name__ == "__main__":
    generate_oncocot_samples()
    generate_pmc_patients_format()
    print("🎉 All synthetic data generated successfully.")
|
|