Spaces:
Sleeping
Sleeping
| """ | |
| Synthetic Medical Test Data Generator | |
| Creates realistic medical test cases for validation without real PHI | |
| """ | |
| import json | |
| import random | |
| from datetime import datetime, timedelta | |
| from typing import Dict, List, Any | |
class MedicalTestDataGenerator:
    """Generate synthetic medical test data for validation.

    Every random draw goes through a private ``random.Random`` instance, so
    constructing or using a generator never disturbs the module-level
    ``random`` state, and two generators built with the same seed produce
    the same cases when driven with the same sequence of calls.
    """

    # Inclusive (low, high) heart-rate bounds per ECG pathology.
    _HEART_RATE_RANGES = {
        "normal": (60, 100),
        "atrial_fibrillation": (80, 150),
        "ventricular_tachycardia": (150, 250),
        "heart_block": (30, 60),
        "st_elevation": (60, 100),
        "st_depression": (60, 100),
        "qt_prolongation": (60, 90),
        "bundle_branch_block": (60, 100),
    }
    _DEFAULT_HEART_RATE_RANGE = (60, 100)

    # Canned finding text per radiology pathology.
    _RADIOLOGY_FINDINGS = {
        "normal": "No acute findings",
        "pneumonia": "Focal consolidation in right lower lobe",
        "fracture": "Transverse fracture of distal radius",
        "tumor": "3.2 cm mass in left upper lobe",
        "organomegaly": "Hepatomegaly with liver span 18 cm",
    }

    _CLINICAL_SIGNIFICANCE = {
        "normal": "None",
        "atrial_fibrillation": "High - stroke risk",
        "ventricular_tachycardia": "Critical - life-threatening",
        "heart_block": "High - may require pacemaker",
        "st_elevation": "Critical - acute MI",
        "st_depression": "High - ischemia",
        "qt_prolongation": "Moderate - arrhythmia risk",
        "bundle_branch_block": "Moderate - conduction disorder",
        "pneumonia": "High - infectious process",
        "fracture": "Moderate - structural injury",
        "tumor": "High - potential malignancy",
        "organomegaly": "Moderate - systemic disease",
    }

    _ANATOMICAL_LOCATIONS = {
        "pneumonia": "Right lower lobe",
        "fracture": "Distal radius",
        "tumor": "Left upper lobe",
        "organomegaly": "Liver",
    }

    _HIGH_CONFIDENCE = ("normal", "st_elevation", "ventricular_tachycardia", "fracture")
    _MEDIUM_CONFIDENCE = ("qt_prolongation", "heart_block", "pneumonia", "tumor")

    # Pathology mix as integer percentages (each tuple sums to 100).
    _ECG_MIX = (
        ("normal", 20),
        ("atrial_fibrillation", 16),
        ("ventricular_tachycardia", 12),
        ("heart_block", 10),
        ("st_elevation", 14),
        ("st_depression", 12),
        ("qt_prolongation", 8),
        ("bundle_branch_block", 8),
    )
    _RADIOLOGY_MIX = (
        ("normal", 25),
        ("pneumonia", 30),
        ("fracture", 20),
        ("tumor", 15),
        ("organomegaly", 10),
    )

    def __init__(self, seed=42):
        """Create a generator whose output is determined by ``seed``."""
        # A dedicated Random instance yields the same sequence that
        # random.seed(seed) followed by module-level calls would, without
        # reseeding the RNG shared by the rest of the process.
        self._rng = random.Random(seed)

    def generate_ecg_test_case(self, case_id: int, pathology: str) -> Dict[str, Any]:
        """Generate a synthetic ECG test case.

        Args:
            case_id: Number embedded in the case identifier (``ECG_0001``).
            pathology: Pathology label; unknown labels fall back to the
                default heart-rate range and the non-pathological interval
                ranges.

        Returns:
            A dict with patient demographics, measurements, ground truth,
            an expected confidence score and a review flag.
        """
        rng = self._rng

        # The draw order below is deliberate: keeping it fixed means a given
        # seed keeps producing the same cases.
        low, high = self._HEART_RATE_RANGES.get(pathology, self._DEFAULT_HEART_RATE_RANGE)
        heart_rate = rng.randint(low, high)

        if pathology == "heart_block":
            pr_interval = rng.randint(200, 350)
        else:
            pr_interval = rng.randint(120, 200)

        if pathology == "bundle_branch_block":
            qrs_duration = rng.randint(120, 160)
        else:
            qrs_duration = rng.randint(80, 100)

        if pathology == "qt_prolongation":
            qt_interval = rng.randint(450, 550)
        else:
            qt_interval = rng.randint(350, 450)

        # Bazett correction: QT divided by the square root of the RR interval
        # in seconds (RR = 60 / heart rate).
        # NOTE(review): QT and heart rate are drawn independently, so qtc_ms
        # can come out well above 450 for pathologies other than
        # "qt_prolongation" (e.g. QT 450 ms at 100 bpm).
        qtc = qt_interval / (60 / heart_rate) ** 0.5

        patient_age = rng.randint(30, 80)
        patient_sex = rng.choice(["M", "F"])
        axis = rng.choice(["normal", "left", "right"])
        severity = rng.choice(["mild", "moderate", "severe"])
        confidence = self._get_expected_confidence(pathology)

        return {
            "case_id": f"ECG_{case_id:04d}",
            "modality": "ECG",
            "patient_age": patient_age,
            "patient_sex": patient_sex,
            "pathology": pathology,
            "measurements": {
                "heart_rate": heart_rate,
                "pr_interval_ms": pr_interval,
                "qrs_duration_ms": qrs_duration,
                "qt_interval_ms": qt_interval,
                "qtc_ms": round(qtc, 1),
                "axis": axis,
            },
            "ground_truth": {
                "diagnosis": pathology,
                "severity": severity,
                "clinical_significance": self._get_clinical_significance(pathology),
                "requires_immediate_action": pathology in ["ventricular_tachycardia", "st_elevation"],
            },
            "confidence_expected": confidence,
            "review_required": pathology in ["heart_block", "qt_prolongation"],
        }

    def generate_radiology_test_case(self, case_id: int, pathology: str, modality: str) -> Dict[str, Any]:
        """Generate a synthetic radiology test case.

        Args:
            case_id: Number embedded in the case identifier (``RAD_0001``).
            pathology: Pathology label; unknown labels get
                ``"Unknown findings"`` and location ``"N/A"``.
            modality: Stored verbatim in the ``modality`` field.

        Returns:
            A dict with patient demographics, findings text, ground truth,
            an expected confidence score and a review flag.
        """
        rng = self._rng

        # NOTE(review): imaging_type is drawn independently of both
        # ``modality`` and ``pathology``, so the three fields can be mutually
        # inconsistent (e.g. a distal radius fracture on "MRI Brain").
        imaging_type = rng.choice(["Chest X-ray", "CT Chest", "MRI Brain", "Ultrasound Abdomen"])
        patient_age = rng.randint(20, 85)
        patient_sex = rng.choice(["M", "F"])
        severity = rng.choice(["mild", "moderate", "severe"])
        confidence = self._get_expected_confidence(pathology)

        return {
            "case_id": f"RAD_{case_id:04d}",
            "modality": modality,
            "imaging_type": imaging_type,
            "patient_age": patient_age,
            "patient_sex": patient_sex,
            "pathology": pathology,
            "findings": self._RADIOLOGY_FINDINGS.get(pathology, "Unknown findings"),
            "ground_truth": {
                "primary_diagnosis": pathology,
                "anatomical_location": self._get_anatomical_location(pathology),
                "severity": severity,
                "clinical_significance": self._get_clinical_significance(pathology),
                "requires_follow_up": pathology != "normal",
            },
            "confidence_expected": confidence,
            "review_required": pathology in ["tumor", "fracture"],
        }

    def _get_clinical_significance(self, pathology: str) -> str:
        """Return the significance label for ``pathology`` or ``"Unknown"``."""
        return self._CLINICAL_SIGNIFICANCE.get(pathology, "Unknown")

    def _get_anatomical_location(self, pathology: str) -> str:
        """Return the anatomical location for ``pathology`` or ``"N/A"``."""
        return self._ANATOMICAL_LOCATIONS.get(pathology, "N/A")

    def _get_expected_confidence(self, pathology: str) -> float:
        """Expected confidence score for validation.

        Draws uniformly from a band chosen by pathology: 0.85-0.95 for the
        high-confidence group, 0.65-0.85 for the medium group and 0.50-0.70
        for everything else.
        """
        if pathology in self._HIGH_CONFIDENCE:
            return self._rng.uniform(0.85, 0.95)
        if pathology in self._MEDIUM_CONFIDENCE:
            return self._rng.uniform(0.65, 0.85)
        return self._rng.uniform(0.50, 0.70)

    @staticmethod
    def _allocate_counts(total: int, mix) -> List[Any]:
        """Split ``total`` cases across ``mix`` so the counts sum to ``total``.

        ``mix`` is a sequence of ``(pathology, integer_weight)`` pairs. Each
        pathology first receives the floor of its exact share; the cases
        still unassigned are then handed out one each to the pathologies
        with the largest remainders (ties go to the one listed first).
        Integer arithmetic is used throughout, so there is no
        floating-point truncation.

        Raises:
            ValueError: If ``total`` is negative.
        """
        if total < 0:
            raise ValueError(f"case count must be non-negative, got {total}")

        weight_sum = sum(weight for _, weight in mix)
        counts = {}
        remainders = []
        for name, weight in mix:
            counts[name], remainder = divmod(total * weight, weight_sum)
            remainders.append((remainder, name))

        leftover = total - sum(counts.values())
        # sorted() is stable even with reverse=True, so equal remainders keep
        # their listing order.
        ranked = sorted(remainders, key=lambda item: item[0], reverse=True)
        for _, name in ranked[:leftover]:
            counts[name] += 1

        return [(name, counts[name]) for name, _ in mix]

    def generate_test_dataset(self, num_ecg=500, num_radiology=200) -> Dict[str, List[Dict]]:
        """Generate complete test dataset.

        Args:
            num_ecg: Exact number of ECG cases to generate.
            num_radiology: Exact number of radiology cases to generate.

        Returns:
            A dict with ``ecg_cases``, ``radiology_cases`` and ``metadata``
            (generation timestamp, total count and per-pathology counts).

        Raises:
            ValueError: If either count is negative.
        """
        print("Generating synthetic medical test dataset...")
        print(f"ECG cases: {num_ecg}")
        print(f"Radiology cases: {num_radiology}")

        # ECG pathology distribution (20% normal)
        ecg_pathologies = self._allocate_counts(num_ecg, self._ECG_MIX)
        ecg_cases = []
        case_id = 1
        for pathology, count in ecg_pathologies:
            for _ in range(count):
                ecg_cases.append(self.generate_ecg_test_case(case_id, pathology))
                case_id += 1

        # Radiology pathology distribution (25% normal)
        rad_pathologies = self._allocate_counts(num_radiology, self._RADIOLOGY_MIX)
        rad_cases = []
        case_id = 1
        for pathology, count in rad_pathologies:
            for _ in range(count):
                modality = self._rng.choice(["Chest X-ray", "CT", "MRI", "Ultrasound"])
                rad_cases.append(self.generate_radiology_test_case(case_id, pathology, modality))
                case_id += 1

        total_cases = len(ecg_cases) + len(rad_cases)
        print("\nGenerated:")
        print(f" ECG cases: {len(ecg_cases)}")
        print(f" Radiology cases: {len(rad_cases)}")
        print(f" Total: {total_cases}")

        return {
            "ecg_cases": ecg_cases,
            "radiology_cases": rad_cases,
            "metadata": {
                "generated_date": datetime.now().isoformat(),
                "total_cases": total_cases,
                "ecg_distribution": dict(ecg_pathologies),
                "radiology_distribution": dict(rad_pathologies),
            },
        }
class ValidationMetricsCalculator:
    """Calculate clinical validation metrics"""

    def calculate_metrics(self, predictions: List[Dict], ground_truth: List[Dict]) -> Dict[str, Any]:
        """Calculate sensitivity, specificity, precision, recall and F1.

        Predictions and ground truth are paired by position. Each pair is
        classified from the prediction's ``"diagnosis"`` and the truth's
        ``"pathology"``:

        * truth is not ``"normal"`` and the diagnosis matches it -> true positive
        * truth is not ``"normal"`` and the diagnosis differs -> false negative
        * truth is ``"normal"`` and the diagnosis is ``"normal"`` -> true negative
        * truth is ``"normal"`` and the diagnosis is anything else -> false positive

        Args:
            predictions: Dicts read through their ``"diagnosis"`` key.
            ground_truth: Dicts read through their ``"pathology"`` key.

        Returns:
            A dict with ``confusion_matrix`` (raw counts), ``metrics``
            (each rounded to 4 decimals; 0.0 when its denominator is zero)
            and ``total_cases``.

        Raises:
            ValueError: If the two lists differ in length, since pairing by
                position would otherwise silently drop the unmatched tail.
        """
        if len(predictions) != len(ground_truth):
            raise ValueError(
                f"predictions ({len(predictions)}) and ground_truth "
                f"({len(ground_truth)}) must have the same length"
            )

        tp = fp = tn = fn = 0
        for pred, truth in zip(predictions, ground_truth):
            predicted = pred.get("diagnosis")
            expected = truth.get("pathology")
            if expected != "normal":
                # Pathological case: only an exact diagnosis counts as a hit.
                if predicted == expected:
                    tp += 1
                else:
                    fn += 1
            elif predicted == "normal":
                # A correctly recognised normal case is a true negative.
                tn += 1
            else:
                fp += 1

        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = sensitivity  # same quantity, reported under both names
        if (precision + recall) > 0:
            f1_score = 2 * (precision * recall) / (precision + recall)
        else:
            f1_score = 0.0

        return {
            "confusion_matrix": {
                "true_positives": tp,
                "false_positives": fp,
                "true_negatives": tn,
                "false_negatives": fn,
            },
            "metrics": {
                "sensitivity": round(sensitivity, 4),
                "specificity": round(specificity, 4),
                "precision": round(precision, 4),
                "recall": round(recall, 4),
                "f1_score": round(f1_score, 4),
            },
            "total_cases": len(predictions),
        }
def main(output_dir: str = "/workspace/medical-ai-platform/test_data"):
    """Generate test dataset and save to files.

    Builds a 500-ECG / 200-radiology dataset with seed 42 and writes four
    JSON files into ``output_dir`` (created if missing): the complete
    dataset, the ECG cases, the radiology cases and a summary. Progress and
    the final statistics are printed to stdout.

    Args:
        output_dir: Directory that receives the JSON files. Defaults to the
            location the script has always written to.
    """
    import os  # function-local, as in the original script

    def _write_json(payload, filename: str) -> str:
        """Dump ``payload`` to ``output_dir/filename`` and return that path."""
        target = f"{output_dir}/{filename}"
        with open(target, "w", encoding="utf-8") as handle:
            json.dump(payload, handle, indent=2)
        return target

    banner = "=" * 60
    print(banner)
    print("SYNTHETIC MEDICAL TEST DATA GENERATION")
    print(banner)
    print(f"Started: {datetime.now().isoformat()}\n")

    generator = MedicalTestDataGenerator(seed=42)

    # Generate full dataset
    dataset = generator.generate_test_dataset(num_ecg=500, num_radiology=200)

    # Save to files
    os.makedirs(output_dir, exist_ok=True)

    saved = _write_json(dataset, "complete_test_dataset.json")
    print(f"\nSaved complete dataset to: {saved}")

    saved = _write_json(dataset["ecg_cases"], "ecg_test_cases.json")
    print(f"Saved ECG cases to: {saved}")

    saved = _write_json(dataset["radiology_cases"], "radiology_test_cases.json")
    print(f"Saved radiology cases to: {saved}")

    # Generate summary statistics
    metadata = dataset["metadata"]
    summary = {
        "total_cases": metadata["total_cases"],
        "ecg_cases": len(dataset["ecg_cases"]),
        "radiology_cases": len(dataset["radiology_cases"]),
        "ecg_distribution": metadata["ecg_distribution"],
        "radiology_distribution": metadata["radiology_distribution"],
        "generated_date": metadata["generated_date"],
    }
    saved = _write_json(summary, "dataset_summary.json")
    print(f"Saved summary to: {saved}")

    print("\n" + banner)
    print("DATA GENERATION COMPLETE")
    print(banner)

    print("\nDataset Statistics:")
    print(f" Total Cases: {summary['total_cases']}")
    print(f" ECG Cases: {summary['ecg_cases']}")
    print(f" Radiology Cases: {summary['radiology_cases']}")

    print("\nECG Pathology Distribution:")
    for pathology, count in summary["ecg_distribution"].items():
        print(f" {pathology}: {count} cases")

    print("\nRadiology Pathology Distribution:")
    for pathology, count in summary["radiology_distribution"].items():
        print(f" {pathology}: {count} cases")


# Run the generator only when executed as a script, not on import.
if __name__ == "__main__":
    main()