OncoAgent / data_prep /sample_data.py
MaximoLopezChenlo's picture
Upload folder using huggingface_hub
e1624f5 verified
"""
Synthetic clinical oncology data generator for OncoAgent.
Generates OncoCoT-format samples for pipeline validation.
All data is 100% synthetic — zero real patient information.
"""
import json
import os
import random
from typing import Dict, List, Optional
# Reproducibility seed (Rule #22): fixes the state of the module-global RNG
# as soon as this module is imported or run.
# NOTE(review): nothing in the visible part of this file draws from `random`,
# so the seed currently has no effect on the generated samples.
random.seed(42)
# Hand-written synthetic cases, grouped by risk tier. Every record has the same
# three string fields: "history" (clinical vignette), "reasoning" (numbered
# chain-of-thought steps) and "conclusion" (assessment and recommendation).
# Adjacent string literals are concatenated, so each fragment keeps its own
# trailing space.
SYNTHETIC_ONCOCOT_SAMPLES: List[Dict[str, str]] = [
    # === HIGH RISK (5 cases) ===
    {
        "history": (
            "62-year-old female presents with persistent dry cough for 3 months, "
            "unintentional weight loss of 8 kg, and hemoptysis. Chest CT reveals a "
            "2.5 cm spiculated mass in the left upper lobe with associated pleural "
            "thickening and enlarged mediastinal lymph nodes measuring 1.2 cm. "
            "Patient is a former smoker with 30 pack-year history."
        ),
        "reasoning": (
            "1. Identify lesion characteristics: 2.5 cm mass classifies as T1c/T2a. "
            "2. Morphology: 'Spiculated' margins are highly indicative of malignancy "
            "(positive predictive value >90%). "
            "3. Nodal involvement: Mediastinal lymph nodes at 1.2 cm suggest N2 status. "
            "4. Clinical correlation: Hemoptysis + weight loss + smoking history "
            "significantly increase pre-test probability. "
            "5. Staging synthesis: T2aN2M0 → Stage IIIA per AJCC 8th edition."
        ),
        "conclusion": (
            "High suspicion for non-small cell lung cancer (NSCLC), likely Stage IIIA. "
            "Recommend urgent tissue biopsy (CT-guided or bronchoscopy) and PET-CT "
            "for comprehensive staging. Multidisciplinary tumor board consultation required."
        ),
    },
    {
        "history": (
            "55-year-old male with a palpable 3.5 cm mass in the right breast, "
            "skin dimpling, and axillary lymphadenopathy on the ipsilateral side. "
            "Mammography shows an irregular dense mass with microcalcifications. "
            "Family history positive for BRCA2 mutation in first-degree relative."
        ),
        "reasoning": (
            "1. Mass characteristics: 3.5 cm irregular mass with microcalcifications "
            "is highly suspicious (BI-RADS 5). "
            "2. Clinical signs: Skin dimpling indicates possible Cooper ligament involvement. "
            "3. Nodal status: Ipsilateral axillary lymphadenopathy suggests N1 involvement. "
            "4. Risk factors: Male breast cancer accounts for <1% of cases, but BRCA2 "
            "significantly increases risk (6-8% lifetime). "
            "5. Staging estimate: T2N1M0 → Stage IIB."
        ),
        "conclusion": (
            "High suspicion for male breast carcinoma, likely Stage IIB. "
            "Recommend core needle biopsy with receptor testing (ER/PR/HER2), "
            "BRCA genetic testing, and staging workup including chest/abdominal CT."
        ),
    },
    {
        "history": (
            "70-year-old male presents with progressive difficulty swallowing solids "
            "over 4 months, weight loss of 12 kg, and retrosternal pain. Upper "
            "endoscopy reveals a 4 cm circumferential mass in the distal esophagus "
            "with mucosal ulceration. CT shows thickened esophageal wall and "
            "suspicious celiac lymph nodes."
        ),
        "reasoning": (
            "1. Lesion: 4 cm circumferential mass with ulceration is T3 (adventitial invasion likely). "
            "2. Location: Distal esophagus suggests adenocarcinoma (Barrett's association). "
            "3. Nodal disease: Celiac lymph nodes represent M1 lymph node disease per AJCC. "
            "4. Symptoms: Progressive dysphagia + significant weight loss indicate advanced disease. "
            "5. Staging: T3N1M1(LYM) → Stage IVA."
        ),
        "conclusion": (
            "High suspicion for esophageal adenocarcinoma, Stage IVA. "
            "Recommend endoscopic biopsy with HER2 testing, PET-CT for complete staging, "
            "and referral for palliative chemoradiation consideration."
        ),
    },
    {
        "history": (
            "48-year-old female with recently discovered hepatic masses on "
            "ultrasound performed for right upper quadrant pain. CT reveals "
            "multiple bilobar liver lesions (largest 6 cm) with arterial enhancement "
            "and washout. AFP level is 850 ng/mL. History of hepatitis C cirrhosis."
        ),
        "reasoning": (
            "1. Imaging: Arterial enhancement with washout is pathognomonic for HCC (LI-RADS 5). "
            "2. Biomarker: AFP >400 ng/mL is highly specific for hepatocellular carcinoma. "
            "3. Risk factor: HCV cirrhosis is the leading cause of HCC. "
            "4. Extent: Bilobar disease precludes surgical resection. "
            "5. Staging: Beyond Milan criteria (single ≤5cm or ≤3 lesions each ≤3cm) → BCLC Stage C."
        ),
        "conclusion": (
            "Hepatocellular carcinoma confirmed by imaging criteria (LI-RADS 5) and AFP elevation. "
            "BCLC Stage C. Recommend systemic therapy (atezolizumab + bevacizumab per NCCN) "
            "and liver transplant evaluation if disease responds."
        ),
    },
    {
        "history": (
            "58-year-old male with iron-deficiency anemia, change in bowel habits "
            "for 6 months, and a 2 cm mass found in the sigmoid colon on colonoscopy. "
            "Biopsy confirms moderately differentiated adenocarcinoma. CT abdomen shows "
            "3 suspicious pericolonic lymph nodes and 2 small liver lesions."
        ),
        "reasoning": (
            "1. Primary tumor: 2 cm sigmoid adenocarcinoma, moderately differentiated. "
            "2. Local spread: Pericolonic lymph nodes suggest N1 disease. "
            "3. Distant metastasis: Liver lesions are concerning for M1a hepatic metastases. "
            "4. Presentation: Iron-deficiency anemia is classic for right-sided colon cancer "
            "but can occur in sigmoid lesions with chronic occult bleeding. "
            "5. Staging: T3N1M1a → Stage IVA (AJCC 8th edition)."
        ),
        "conclusion": (
            "Sigmoid colon adenocarcinoma, Stage IVA with hepatic metastases. "
            "Recommend molecular profiling (MSI, KRAS/NRAS/BRAF), "
            "liver MRI for surgical resectability assessment, and FOLFOX/FOLFIRI-based "
            "systemic therapy per NCCN guidelines."
        ),
    },
    # === MEDIUM RISK (3 cases) ===
    {
        "history": (
            "45-year-old female with a 1.5 cm solid thyroid nodule found incidentally "
            "on carotid ultrasound. Fine needle aspiration shows Bethesda IV "
            "(follicular neoplasm). No cervical lymphadenopathy. TSH is normal."
        ),
        "reasoning": (
            "1. Nodule: 1.5 cm solid nodule with Bethesda IV cytology. "
            "2. Risk of malignancy: Bethesda IV carries 15-30% cancer risk. "
            "3. Favorable factors: No lymphadenopathy, normal TSH. "
            "4. Cannot distinguish follicular adenoma from carcinoma on cytology alone. "
            "5. Assessment: Intermediate risk requiring diagnostic surgery."
        ),
        "conclusion": (
            "Indeterminate thyroid nodule (Bethesda IV) with moderate malignancy risk. "
            "Recommend molecular testing (Afirma or ThyroSeq) if available. "
            "If molecular testing is inconclusive, diagnostic lobectomy is indicated."
        ),
    },
    {
        "history": (
            "60-year-old male with a PSA level of 7.2 ng/mL on routine screening. "
            "Digital rectal exam reveals a firm nodule on the right lobe. "
            "MRI prostate shows a PI-RADS 4 lesion in the peripheral zone, "
            "15 mm in greatest dimension. No extraprostatic extension."
        ),
        "reasoning": (
            "1. PSA: 7.2 ng/mL is elevated (normal <4.0), PSA density should be calculated. "
            "2. DRE: Palpable nodule correlates with imaging finding. "
            "3. MRI: PI-RADS 4 has ~60-70% probability of clinically significant cancer. "
            "4. Confined disease: No extraprostatic extension is favorable. "
            "5. Assessment: High probability of Gleason 3+4 or higher prostate cancer."
        ),
        "conclusion": (
            "Probable clinically significant prostate cancer. "
            "Recommend MRI-targeted fusion biopsy (minimum 12 systematic + 2-3 targeted cores). "
            "If positive, staging with PSMA PET-CT per NCCN guidelines."
        ),
    },
    {
        "history": (
            "52-year-old female with a 2 cm pancreatic cystic lesion found on CT "
            "performed for back pain. MRI with MRCP shows a branch-duct IPMN in the "
            "pancreatic body with a mural nodule measuring 5 mm. CA 19-9 is 45 U/mL. "
            "No main duct dilation."
        ),
        "reasoning": (
            "1. Cyst type: Branch-duct IPMN is the most common pancreatic cystic neoplasm. "
            "2. Worrisome feature: Mural nodule (5 mm) is a 'worrisome feature' per Fukuoka criteria. "
            "3. Size: 2 cm is below the high-risk threshold of 3 cm. "
            "4. Biomarker: CA 19-9 of 45 is borderline (normal <37). "
            "5. Assessment: Moderate risk — warrants EUS for further characterization."
        ),
        "conclusion": (
            "Branch-duct IPMN with worrisome features (mural nodule). "
            "Recommend endoscopic ultrasound (EUS) with FNA for cytology and cyst fluid analysis. "
            "If high-grade dysplasia found, surgical resection is indicated."
        ),
    },
    # === LOW RISK (2 cases) ===
    {
        "history": (
            "35-year-old female with a 1 cm well-circumscribed, oval, hypoechoic "
            "breast mass found on screening ultrasound. BI-RADS 3. No family history "
            "of breast cancer. No skin changes or axillary lymphadenopathy."
        ),
        "reasoning": (
            "1. Mass morphology: Well-circumscribed, oval shape is characteristic of fibroadenoma. "
            "2. BI-RADS 3: Probably benign (<2% malignancy risk). "
            "3. Age: 35 years old — breast cancer is rare at this age without risk factors. "
            "4. No concerning features: No skin changes, no lymphadenopathy. "
            "5. Assessment: Low risk, likely fibroadenoma."
        ),
        "conclusion": (
            "Probably benign breast mass (BI-RADS 3), most likely fibroadenoma. "
            "Recommend short-interval follow-up ultrasound at 6 months. "
            "If stable at 2 years, reclassify as BI-RADS 2 (benign)."
        ),
    },
    {
        "history": (
            "28-year-old male with a small, well-circumscribed 8 mm pulmonary nodule "
            "found incidentally on chest X-ray performed for pre-employment screening. "
            "Non-smoker, no respiratory symptoms, no weight loss. CT confirms a smooth, "
            "round, calcified nodule in the right middle lobe."
        ),
        "reasoning": (
            "1. Nodule: 8 mm, smooth margins, calcified — benign morphology. "
            "2. Calcification pattern: Diffuse calcification is highly associated with granuloma. "
            "3. Risk factors: Non-smoker, young age, asymptomatic. "
            "4. Fleischner criteria: Calcified nodules are generally benign and do not "
            "require follow-up imaging. "
            "5. Assessment: Very low risk, most likely granuloma (infectious etiology)."
        ),
        "conclusion": (
            "Benign calcified pulmonary granuloma. No malignancy concern. "
            "No further imaging or follow-up required per Fleischner Society guidelines. "
            "Reassure patient."
        ),
    },
]
def generate_oncocot_samples(
    output_path: str = "data/samples/oncocot_synthetic.json",
    samples: Optional[List[Dict[str, str]]] = None,
) -> str:
    """
    Writes synthetic OncoCoT samples to a JSON file.

    Args:
        output_path: Path to the output JSON file. Missing parent
            directories are created; a bare filename is written to the
            current working directory.
        samples: Records to serialize. When omitted, the module-level
            ``SYNTHETIC_ONCOCOT_SAMPLES`` list is written.

    Returns:
        The absolute path to the generated file.
    """
    if samples is None:
        samples = SYNTHETIC_ONCOCOT_SAMPLES

    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when one is named.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # ensure_ascii=False keeps symbols such as "→" and "≤" readable in the file.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(samples, f, ensure_ascii=False, indent=2)

    print(f"✅ Generated {len(samples)} synthetic OncoCoT samples → {output_path}")
    return os.path.abspath(output_path)
def generate_pmc_patients_format(
    output_path: str = "data/samples/pmc_patients_synthetic.json",
    samples: Optional[List[Dict[str, str]]] = None,
) -> str:
    """
    Converts OncoCoT samples into a PMC-Patients-compatible format.

    Each input record must provide the "history", "reasoning" and
    "conclusion" keys; a missing key raises ``KeyError``.

    Args:
        output_path: Path to the output JSON file. Missing parent
            directories are created; a bare filename is written to the
            current working directory.
        samples: OncoCoT records to convert. When omitted, the module-level
            ``SYNTHETIC_ONCOCOT_SAMPLES`` list is converted.

    Returns:
        The absolute path to the generated file.
    """
    if samples is None:
        samples = SYNTHETIC_ONCOCOT_SAMPLES

    # NOTE(review): "patient" and "medical_history" intentionally receive the
    # same history text here — presumably so consumers of either key work;
    # confirm against the downstream PMC-Patients loader.
    pmc_samples: List[Dict[str, str]] = [
        {
            "patient": sample["history"],
            "medical_history": sample["history"],
            "reasoning": sample["reasoning"],
            "conclusion": sample["conclusion"],
        }
        for sample in samples
    ]

    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when one is named.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(pmc_samples, f, ensure_ascii=False, indent=2)

    print(f"✅ Generated {len(pmc_samples)} PMC-Patients format samples → {output_path}")
    return os.path.abspath(output_path)
if __name__ == "__main__":
    # Script entry point: write both sample files to their default locations
    # (relative to the current working directory), then report completion.
    generate_oncocot_samples()
    generate_pmc_patients_format()
    print("🚀 All synthetic data generated successfully.")