Spaces:

RocketFarmStudios
/

cps-api-tx

Sleeping

App Files Files Community

Ali2206 committed on Aug 31

Commit

f0d524c

1 Parent(s): 25dd79e

device token

Browse files

Files changed (4) hide show

api/routes/patients.py +35 -97
api/services/fhir_integration.py +148 -6
api/services/synthea_integration.py +168 -222
data/new_tool.json +0 -1

api/routes/patients.py CHANGED Viewed

@@ -726,6 +726,18 @@ async def get_patients(
             if patient.get("date_of_birth") == "":
                 patient["date_of_birth"] = None
             processed_patients.append(patient)
         logger.info(f"✅ Returning {len(processed_patients)} processed patients")
@@ -1170,14 +1182,20 @@ async def update_patient(
 @router.post("/patients/import-hapi-fhir", status_code=status.HTTP_201_CREATED)
 async def import_hapi_patients(
     limit: int = Query(20, ge=1, le=100, description="Number of patients to import"),
     current_user: dict = Depends(get_current_user)
 ):
     """
-    Import patients from HAPI FHIR Test Server
     """
     try:
         service = HAPIFHIRIntegrationService()
-        result = await service.import_patients_from_hapi(limit=limit)
         # Create detailed message
         message_parts = []
@@ -1185,6 +1203,8 @@ async def import_hapi_patients(
             message_parts.append(f"Successfully imported {result['imported_count']} patients")
         if result["skipped_count"] > 0:
             message_parts.append(f"Skipped {result['skipped_count']} duplicate patients")
         if result["errors"]:
             message_parts.append(f"Encountered {len(result['errors'])} errors")
@@ -1194,9 +1214,12 @@ async def import_hapi_patients(
             "message": message,
             "imported_count": result["imported_count"],
             "skipped_count": result["skipped_count"],
             "total_found": result["total_found"],
             "imported_patients": result["imported_patients"],
             "skipped_patients": result["skipped_patients"],
             "errors": result["errors"],
             "source": "hapi_fhir"
         }
@@ -1386,17 +1409,12 @@ async def fetch_ehr_data(
                 "field_mapping": ehr_system_config["field_mapping"]
             }
         else:
-            # For other EHR systems, we would implement actual FHIR client calls
-            # For now, return a sample structure
-            sample_data = generate_sample_ehr_data(ehr_system, limit)
-            return {
-                "ehr_system": ehr_system,
-                "data": sample_data,
-                "total_count": len(sample_data),
-                "field_mapping": ehr_system_config["field_mapping"],
-                "note": "This is sample data. Implement actual FHIR client integration for production use."
-            }
     except Exception as e:
         logger.error(f"Error fetching EHR data: {str(e)}")
@@ -1405,89 +1423,7 @@ async def fetch_ehr_data(
             detail=f"Failed to fetch EHR data: {str(e)}"
         )
-def generate_sample_ehr_data(ehr_system: str, limit: int) -> List[dict]:
-    """
-    Generate sample EHR data for testing purposes
-    """
-    import random
-    from datetime import date, timedelta
-    sample_names = [
-        "John Smith", "Jane Doe", "Michael Johnson", "Sarah Wilson", "David Brown",
-        "Emily Davis", "Robert Miller", "Lisa Garcia", "James Rodriguez", "Maria Martinez",
-        "Christopher Anderson", "Jennifer Taylor", "Daniel Thomas", "Amanda Jackson",
-        "Matthew White", "Nicole Harris", "Joshua Martin", "Stephanie Thompson"
-    ]
-    sample_addresses = [
-        "123 Main St, New York, NY 10001",
-        "456 Oak Ave, Los Angeles, CA 90210",
-        "789 Pine Rd, Chicago, IL 60601",
-        "321 Elm St, Houston, TX 77001",
-        "654 Maple Dr, Phoenix, AZ 85001"
-    ]
-    sample_allergies = [
-        "Penicillin", "Peanuts", "Latex", "Shellfish", "Dairy", "Eggs", "Soy", "Wheat"
-    ]
-    sample_conditions = [
-        "Hypertension", "Diabetes Type 2", "Asthma", "Depression", "Anxiety",
-        "Obesity", "Arthritis", "Heart Disease", "Chronic Kidney Disease", "COPD"
-    ]
-    sample_medications = [
-        "Lisinopril", "Metformin", "Albuterol", "Sertraline", "Atorvastatin",
-        "Omeprazole", "Amlodipine", "Losartan", "Simvastatin", "Hydrochlorothiazide"
-    ]
-    sample_insurance = [
-        ("Blue Cross Blue Shield", "BCBS123456"),
-        ("Aetna", "AET789012"),
-        ("Cigna", "CIG345678"),
-        ("UnitedHealth", "UHC901234"),
-        ("Humana", "HUM567890")
-    ]
-    data = []
-    for i in range(min(limit, 18)):
-        # Generate random date of birth (18-80 years old)
-        years_old = random.randint(18, 80)
-        birth_date = date.today() - timedelta(days=years_old * 365 + random.randint(0, 365))
-        # Generate random allergies and conditions
-        patient_allergies = random.sample(sample_allergies, random.randint(0, 3))
-        patient_conditions = random.sample(sample_conditions, random.randint(0, 2))
-        patient_medications = random.sample(sample_medications, random.randint(0, 3))
-        # Generate emergency contact
-        emergency_contact = random.choice(sample_names)
-        while emergency_contact == sample_names[i % len(sample_names)]:
-            emergency_contact = random.choice(sample_names)
-        # Generate insurance info
-        insurance_provider, insurance_policy = random.choice(sample_insurance)
-        patient_data = {
-            "ehr_id": f"{ehr_system.upper()}{str(i+1).zfill(3)}",
-            "full_name": sample_names[i % len(sample_names)],
-            "date_of_birth": birth_date.strftime("%Y-%m-%d"),
-            "gender": random.choice(["male", "female"]),
-            "address": random.choice(sample_addresses),
-            "national_id": f"{random.randint(100000000, 999999999)}",
-            "blood_type": random.choice(["A+", "A-", "B+", "B-", "AB+", "AB-", "O+", "O-"]),
-            "allergies": patient_allergies,
-            "chronic_conditions": patient_conditions,
-            "medications": patient_medications,
-            "emergency_contact_name": emergency_contact,
-            "emergency_contact_phone": f"555-{random.randint(100, 999)}-{random.randint(1000, 9999)}",
-            "insurance_provider": insurance_provider,
-            "insurance_policy_number": insurance_policy
-        }
-        data.append(patient_data)
-    return data
 @router.post("/patients/generate-synthea", status_code=status.HTTP_201_CREATED)
 async def generate_synthea_patients(
@@ -1644,6 +1580,7 @@ async def generate_and_import_synthea_patients(
     age_max: int = Query(80, ge=0, le=120, description="Maximum age for generated patients"),
     gender: str = Query("both", description="Gender distribution: male, female, or both"),
     location: str = Query("Massachusetts", description="Location for generated patients"),
     current_user: dict = Depends(get_current_user)
 ):
     """
@@ -1666,7 +1603,8 @@ async def generate_and_import_synthea_patients(
             age_min=age_min,
             age_max=age_max,
             gender=gender,
-            location=location
         )
         if not generation_result['patients']:

             if patient.get("date_of_birth") == "":
                 patient["date_of_birth"] = None
+            # Add missing required fields for Synthea patients
+            if "status" not in patient:
+                patient["status"] = "active"
+            if "created_at" not in patient:
+                patient["created_at"] = patient.get("import_date", datetime.utcnow())
+            if "updated_at" not in patient:
+                patient["updated_at"] = patient.get("last_updated", datetime.utcnow())
+            # Ensure source field is present
+            if "source" not in patient:
+                patient["source"] = "synthea"
             processed_patients.append(patient)
         logger.info(f"✅ Returning {len(processed_patients)} processed patients")
 @router.post("/patients/import-hapi-fhir", status_code=status.HTTP_201_CREATED)
 async def import_hapi_patients(
     limit: int = Query(20, ge=1, le=100, description="Number of patients to import"),
+    require_medical_data: bool = Query(False, description="Require patients to have medical data (conditions, medications, encounters, or observations)"),
+    min_completeness_score: float = Query(0.7, ge=0.0, le=1.0, description="Minimum validation score (0-1) for a patient to be considered complete"),
     current_user: dict = Depends(get_current_user)
 ):
     """
+    Import patients from HAPI FHIR Test Server with data completeness validation
     """
     try:
         service = HAPIFHIRIntegrationService()
+        result = await service.import_patients_from_hapi(
+            limit=limit,
+            require_medical_data=require_medical_data,
+            min_completeness_score=min_completeness_score
+        )
         # Create detailed message
         message_parts = []
             message_parts.append(f"Successfully imported {result['imported_count']} patients")
         if result["skipped_count"] > 0:
             message_parts.append(f"Skipped {result['skipped_count']} duplicate patients")
+        if result["filtered_count"] > 0:
+            message_parts.append(f"Filtered out {result['filtered_count']} incomplete patients")
         if result["errors"]:
             message_parts.append(f"Encountered {len(result['errors'])} errors")
             "message": message,
             "imported_count": result["imported_count"],
             "skipped_count": result["skipped_count"],
+            "filtered_count": result["filtered_count"],
             "total_found": result["total_found"],
             "imported_patients": result["imported_patients"],
             "skipped_patients": result["skipped_patients"],
+            "filtered_patients": result["filtered_patients"],
+            "validation_summary": result["validation_summary"],
             "errors": result["errors"],
             "source": "hapi_fhir"
         }
                 "field_mapping": ehr_system_config["field_mapping"]
             }
         else:
+            # For other EHR systems, we don't support hardcoded data
+            # Only real Synthea data is supported
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail=f"EHR system '{ehr_system}' is not supported. Only HAPI FHIR Test Server and Synthea are supported for real data generation."
+            )
     except Exception as e:
         logger.error(f"Error fetching EHR data: {str(e)}")
             detail=f"Failed to fetch EHR data: {str(e)}"
         )
 @router.post("/patients/generate-synthea", status_code=status.HTTP_201_CREATED)
 async def generate_synthea_patients(
     age_max: int = Query(80, ge=0, le=120, description="Maximum age for generated patients"),
     gender: str = Query("both", description="Gender distribution: male, female, or both"),
     location: str = Query("Massachusetts", description="Location for generated patients"),
+    require_medical_data: bool = Query(True, description="Require patients to have medical data (conditions, medications, encounters, or observations)"),
     current_user: dict = Depends(get_current_user)
 ):
     """
             age_min=age_min,
             age_max=age_max,
             gender=gender,
+            location=location,
+            require_medical_data=require_medical_data
         )
         if not generation_result['patients']:

api/services/fhir_integration.py CHANGED Viewed

@@ -11,9 +11,90 @@ class HAPIFHIRIntegrationService:
     def __init__(self):
         self.fhir_client = HAPIFHIRClient()
-    async def import_patients_from_hapi(self, limit: int = 20) -> dict:
         """
-        Import patients from HAPI FHIR Test Server with detailed feedback
         """
         try:
             print(f"Fetching {limit} patients from HAPI FHIR...")
@@ -24,19 +105,34 @@ class HAPIFHIRIntegrationService:
                 return {
                     "imported_count": 0,
                     "skipped_count": 0,
                     "total_found": 0,
                     "imported_patients": [],
                     "skipped_patients": [],
                     "errors": []
                 }
-            print(f"Found {len(patients)} patients, checking for duplicates...")
             imported_count = 0
             skipped_count = 0
             imported_patients = []
             skipped_patients = []
             errors = []
             for patient in patients:
                 try:
@@ -58,13 +154,48 @@ class HAPIFHIRIntegrationService:
                     # Enhance patient data with additional FHIR data
                     enhanced_patient = await self._enhance_patient_data(patient)
                     # Insert into database
                     result = await db.patients.insert_one(enhanced_patient)
                     if result.inserted_id:
                         imported_count += 1
-                        imported_patients.append(patient['full_name'])
-                        print(f"Imported patient: {patient['full_name']} (ID: {result.inserted_id})")
                 except Exception as e:
                     error_msg = f"Error importing patient {patient.get('full_name', 'Unknown')}: {e}"
@@ -72,14 +203,22 @@ class HAPIFHIRIntegrationService:
                     print(error_msg)
                     continue
-            print(f"Import completed: {imported_count} imported, {skipped_count} skipped")
             return {
                 "imported_count": imported_count,
                 "skipped_count": skipped_count,
                 "total_found": len(patients),
                 "imported_patients": imported_patients,
                 "skipped_patients": skipped_patients,
                 "errors": errors
             }
@@ -88,9 +227,12 @@ class HAPIFHIRIntegrationService:
             return {
                 "imported_count": 0,
                 "skipped_count": 0,
                 "total_found": 0,
                 "imported_patients": [],
                 "skipped_patients": [],
                 "errors": [str(e)]
             }

     def __init__(self):
         self.fhir_client = HAPIFHIRClient()
+    def _validate_patient_data_completeness(self, patient: Dict, require_medical_data: bool = False, min_completeness_score: float = 0.7) -> Dict[str, object]:
         """
+        Validate if a patient has complete data
+        Args:
+            patient: Patient data dictionary
+            require_medical_data: Whether to require medical data (observations, medications, conditions)
+            min_completeness_score: Minimum validation score (0-1) required for
+                "is_complete"; defaults to 0.7, the value that used to be hard-coded here.
+                Callers that accept their own threshold must pass it in, otherwise
+                a threshold below 0.7 has no effect on "is_complete".
+        Returns:
+            Dict with validation results:
+            {
+                "is_complete": bool,
+                "missing_fields": List[str],
+                "has_medical_data": bool,
+                "validation_score": float (0-1),
+                "demographic_completeness": float (0-1), same value as "validation_score"
+            }
+        """
+        required_demographic_fields = [
+            'full_name', 'gender', 'date_of_birth', 'address'
+        ]
+        optional_demographic_fields = [
+            'phone', 'email', 'marital_status', 'language'
+        ]
+        # NOTE(review): the route descriptions also mention "encounters" as medical
+        # data, but only these three keys are checked here - confirm which is intended.
+        medical_data_fields = [
+            'observations', 'medications', 'conditions'
+        ]
+        def _is_present(value) -> bool:
+            # Whitespace-only strings count as missing; any other value counts by truthiness
+            if isinstance(value, str):
+                return value.strip() != ''
+            return bool(value)
+        # Required fields that are absent or blank are reported back to the caller
+        missing_fields = [
+            field for field in required_demographic_fields
+            if not _is_present(patient.get(field, ''))
+        ]
+        # Optional fields only contribute to the score, never to missing_fields
+        present_optional = sum(
+            1 for field in optional_demographic_fields
+            if _is_present(patient.get(field, ''))
+        )
+        present_fields = len(required_demographic_fields) - len(missing_fields) + present_optional
+        total_fields = len(required_demographic_fields) + len(optional_demographic_fields)
+        # Check medical data; a missing, None or non-dict "clinical_data" means no medical data
+        clinical_data = patient.get('clinical_data')
+        has_medical_data = isinstance(clinical_data, dict) and any(
+            clinical_data.get(field) for field in medical_data_fields
+        )
+        # Score is the fraction of demographic fields (required + optional) that are present
+        validation_score = present_fields / total_fields
+        # Complete means: no required field missing and the score reaches the threshold
+        is_complete = not missing_fields and validation_score >= min_completeness_score
+        # If medical data is required, check if patient has it
+        if require_medical_data and not has_medical_data:
+            is_complete = False
+            missing_fields.append('medical_data')
+        return {
+            "is_complete": is_complete,
+            "missing_fields": missing_fields,
+            "has_medical_data": has_medical_data,
+            "validation_score": validation_score,
+            # Kept for backward compatibility; computed from the same fields as validation_score
+            "demographic_completeness": validation_score
+        }
+    async def import_patients_from_hapi(self, limit: int = 20, require_medical_data: bool = False, min_completeness_score: float = 0.7) -> dict:
+        """
+        Import patients from HAPI FHIR Test Server with data completeness validation
+        Args:
+            limit: Number of patients to fetch from HAPI FHIR
+            require_medical_data: Whether to require patients to have medical data
+            min_completeness_score: Minimum validation score (0-1) for a patient to be considered complete
         """
         try:
             print(f"Fetching {limit} patients from HAPI FHIR...")
                 return {
                     "imported_count": 0,
                     "skipped_count": 0,
+                    "filtered_count": 0,
                     "total_found": 0,
                     "imported_patients": [],
                     "skipped_patients": [],
+                    "filtered_patients": [],
+                    "validation_summary": {},
                     "errors": []
                 }
+            print(f"Found {len(patients)} patients, checking for duplicates and data completeness...")
             imported_count = 0
             skipped_count = 0
+            filtered_count = 0
             imported_patients = []
             skipped_patients = []
+            filtered_patients = []
             errors = []
+            validation_summary = {
+                "total_processed": len(patients),
+                "complete_patients": 0,
+                "incomplete_patients": 0,
+                "with_medical_data": 0,
+                "without_medical_data": 0,
+                "average_completeness_score": 0.0
+            }
+            total_completeness_score = 0.0
             for patient in patients:
                 try:
                     # Enhance patient data with additional FHIR data
                     enhanced_patient = await self._enhance_patient_data(patient)
+                    # Validate data completeness
+                    validation_result = self._validate_patient_data_completeness(
+                        enhanced_patient,
+                        require_medical_data=require_medical_data
+                    )
+                    # Update validation summary
+                    total_completeness_score += validation_result["validation_score"]
+                    if validation_result["has_medical_data"]:
+                        validation_summary["with_medical_data"] += 1
+                    else:
+                        validation_summary["without_medical_data"] += 1
+                    # Check if patient meets completeness criteria
+                    if not validation_result["is_complete"] or validation_result["validation_score"] < min_completeness_score:
+                        filtered_count += 1
+                        filtered_patients.append({
+                            "name": patient['full_name'],
+                            "fhir_id": patient['fhir_id'],
+                            "missing_fields": validation_result["missing_fields"],
+                            "completeness_score": validation_result["validation_score"],
+                            "has_medical_data": validation_result["has_medical_data"]
+                        })
+                        print(f"Patient {patient['full_name']} filtered out - missing: {validation_result['missing_fields']}, score: {validation_result['validation_score']:.2f}")
+                        validation_summary["incomplete_patients"] += 1
+                        continue
+                    validation_summary["complete_patients"] += 1
                     # Insert into database
                     result = await db.patients.insert_one(enhanced_patient)
                     if result.inserted_id:
                         imported_count += 1
+                        imported_patients.append({
+                            "name": patient['full_name'],
+                            "fhir_id": patient['fhir_id'],
+                            "completeness_score": validation_result["validation_score"],
+                            "has_medical_data": validation_result["has_medical_data"]
+                        })
+                        print(f"Imported patient: {patient['full_name']} (ID: {result.inserted_id}, Score: {validation_result['validation_score']:.2f})")
                 except Exception as e:
                     error_msg = f"Error importing patient {patient.get('full_name', 'Unknown')}: {e}"
                     print(error_msg)
                     continue
+            # Calculate average completeness score
+            if validation_summary["total_processed"] > 0:
+                validation_summary["average_completeness_score"] = total_completeness_score / validation_summary["total_processed"]
+            print(f"Import completed: {imported_count} imported, {skipped_count} skipped, {filtered_count} filtered out")
+            print(f"Validation summary: {validation_summary}")
             return {
                 "imported_count": imported_count,
                 "skipped_count": skipped_count,
+                "filtered_count": filtered_count,
                 "total_found": len(patients),
                 "imported_patients": imported_patients,
                 "skipped_patients": skipped_patients,
+                "filtered_patients": filtered_patients,
+                "validation_summary": validation_summary,
                 "errors": errors
             }
             return {
                 "imported_count": 0,
                 "skipped_count": 0,
+                "filtered_count": 0,
                 "total_found": 0,
                 "imported_patients": [],
                 "skipped_patients": [],
+                "filtered_patients": [],
+                "validation_summary": {},
                 "errors": [str(e)]
             }

api/services/synthea_integration.py CHANGED Viewed

@@ -28,7 +28,10 @@ class SyntheaIntegrationService:
         # Check if we're in a containerized environment (like Hugging Face Spaces)
         self.is_containerized = os.path.exists('/.dockerenv') or os.environ.get('HF_SPACE_ID') is not None
-        # Always try to use real Synthea first, regardless of environment
         self.use_mock_data = False
         # Try multiple directory locations for better compatibility
@@ -55,7 +58,8 @@ class SyntheaIntegrationService:
             logger.warning("⚠️ No writable temp directory found, using current directory")
         self.synthea_dir = base_temp_dir / "cps_synthea"
-        self.output_dir = base_temp_dir / "cps_fhir_output"
         self.synthea_jar_path = self.synthea_dir / "synthea-with-dependencies.jar"
         # Try to create directories
@@ -65,7 +69,9 @@ class SyntheaIntegrationService:
             logger.info(f"✅ Using directories: synthea={self.synthea_dir}, output={self.output_dir}")
         except Exception as e:
             logger.warning(f"⚠️ Could not create directories: {e}, will try to use existing paths")
-            # Don't set use_mock_data to True yet - let's try to work with what we have
         # Synthea configuration
         self.default_config = {
@@ -90,6 +96,20 @@ class SyntheaIntegrationService:
             "exporter.fhir.include_practitioners": "false"
         }
     async def download_synthea(self) -> bool:
         """
         Download Synthea JAR file if not present
@@ -160,10 +180,12 @@ class SyntheaIntegrationService:
                 file.unlink()
             # Run Synthea with command line arguments (more reliable)
             cmd = [
                 "java", "-jar", str(self.synthea_jar_path),
                 "-p", str(population),
-                "-o", str(self.output_dir.absolute()),
                 "--seed", str(int(datetime.now().timestamp())),
                 "--exporter.fhir.transaction_bundle=true",
                 "--exporter.fhir.include_patient_summary=true",
@@ -181,7 +203,7 @@ class SyntheaIntegrationService:
             logger.info(f"Output directory exists before generation: {self.output_dir.exists()}")
             # Try multiple working directories for better compatibility
-            working_dirs = [str(self.synthea_dir), str(self.output_dir), str(Path.cwd())]
             process = None
             for working_dir in working_dirs:
@@ -206,7 +228,13 @@ class SyntheaIntegrationService:
             if process.returncode == 0:
                 logger.info("✅ Synthea generation completed successfully")
-                logger.info(f"Output: {stdout.decode()}")
                 # Debug: Check what files were actually created
                 logger.info(f"🔍 Checking output directory immediately after generation: {self.output_dir}")
@@ -222,6 +250,14 @@ class SyntheaIntegrationService:
                     for subdir in subdirs:
                         json_files = list(subdir.glob("*.json"))
                         logger.info(f"📁 JSON files in {subdir.name}: {[f.name for f in json_files]}")
                     # Also check if files were created in the working directory
                     working_dir_files = list(Path.cwd().glob("*.json"))
@@ -231,12 +267,29 @@ class SyntheaIntegrationService:
                     synthea_dir_files = list(self.synthea_dir.glob("*.json"))
                     logger.info(f"📁 JSON files in synthea directory: {[f.name for f in synthea_dir_files]}")
                 else:
                     logger.warning(f"⚠️ Output directory does not exist: {self.output_dir}")
                 return True
             else:
-                error_output = stderr.decode()
                 logger.error(f"❌ Synthea generation failed with return code {process.returncode}")
                 logger.error(f"Error output: {error_output}")
                 return False
@@ -245,7 +298,7 @@ class SyntheaIntegrationService:
             logger.error(f"❌ Error running Synthea: {str(e)}")
             return False
-    async def process_synthea_output(self) -> List[Dict[str, Any]]:
         """
         Process Synthea output files and convert to application format
         """
@@ -257,9 +310,10 @@ class SyntheaIntegrationService:
             # List of directories to search for Synthea output
             search_dirs = [
-                self.output_dir,
-                self.output_dir.parent,
-                Path.cwd(),
                 Path('/tmp'),
                 Path('/app'),
                 Path('/app/tmp')
@@ -315,6 +369,9 @@ class SyntheaIntegrationService:
                 return []
             # Process each patient file
             for file_path in patient_files:
                 try:
                     logger.info(f"📄 Processing file: {file_path}")
@@ -324,8 +381,14 @@ class SyntheaIntegrationService:
                     patient_data = await self._extract_patient_data(bundle, file_path.name)
                     if patient_data:
-                        patients.append(patient_data)
-                        logger.info(f"✅ Extracted patient: {patient_data.get('full_name', 'Unknown')}")
                     else:
                         logger.warning(f"⚠️ No patient data extracted from {file_path}")
@@ -333,6 +396,8 @@ class SyntheaIntegrationService:
                     logger.error(f"❌ Error processing {file_path}: {str(e)}")
                     continue
             logger.info(f"✅ Successfully processed {len(patients)} patients from Synthea output")
             return patients
@@ -527,6 +592,72 @@ class SyntheaIntegrationService:
         except:
             return []
     async def save_patients_to_database(self, patients: List[Dict[str, Any]]) -> Dict[str, Any]:
         """
         Save generated patients directly to the database
@@ -553,6 +684,9 @@ class SyntheaIntegrationService:
                         'marital_status': patient.get('marital_status', ''),
                         'language': patient.get('language', 'English'),
                         'source': patient.get('source', 'synthea'),
                         'import_date': datetime.utcnow(),
                         'last_updated': datetime.utcnow(),
                         'conditions': patient.get('conditions', []),
@@ -599,7 +733,8 @@ class SyntheaIntegrationService:
         age_min: int = 18,
         age_max: int = 80,
         gender: str = "both",
-        location: str = "Massachusetts"
     ) -> Dict[str, Any]:
         """
         Complete workflow: generate Synthea data and prepare for import
@@ -607,7 +742,7 @@ class SyntheaIntegrationService:
         try:
             logger.info(f"🎯 Starting Synthea generation for {population} patients")
-            # Always try real Synthea first, regardless of environment
             try:
                 # Download Synthea if needed
                 if not await self.download_synthea():
@@ -630,12 +765,19 @@ class SyntheaIntegrationService:
                     logger.error("❌ Synthea generation failed")
                     raise Exception("Synthea generation failed")
-                # Process output
-                patients = await self.process_synthea_output()
                 if not patients:
-                    logger.error("❌ No patients generated from Synthea")
-                    raise Exception("No patients generated from Synthea")
                 # Save patients to database
                 db_result = await self.save_patients_to_database(patients)
@@ -650,15 +792,14 @@ class SyntheaIntegrationService:
                     "patients": patients,
                     "config": config_overrides,
                     "output_directory": str(self.output_dir),
-                    "source": "synthea_real"
                 }
             except Exception as e:
                 logger.error(f"❌ Synthea integration failed: {str(e)}")
-                raise HTTPException(
-                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-                    detail=f"Synthea generation failed: {str(e)}"
-                )
         except Exception as e:
             logger.error(f"❌ Error in generate_and_import_patients: {str(e)}")
@@ -667,198 +808,6 @@ class SyntheaIntegrationService:
                 detail=f"Patient generation failed: {str(e)}"
             )
-    async def _generate_mock_patients(
-        self,
-        population: int = 10,
-        age_min: int = 18,
-        age_max: int = 80,
-        gender: str = "both",
-        location: str = "Massachusetts"
-    ) -> Dict[str, Any]:
-        """
-        Generate realistic mock patient data when Synthea is not available
-        """
-        import random
-        from datetime import datetime, timedelta
-        logger.info(f"🎭 Generating {population} realistic mock patients")
-        # More comprehensive name lists
-        first_names = [
-            "John", "Jane", "Michael", "Sarah", "David", "Emily", "Robert", "Lisa", "James", "Maria",
-            "William", "Jennifer", "Christopher", "Jessica", "Daniel", "Amanda", "Matthew", "Nicole", "Anthony", "Stephanie",
-            "Mark", "Melissa", "Donald", "Michelle", "Steven", "Laura", "Paul", "Kimberly", "Andrew", "Deborah",
-            "Joshua", "Dorothy", "Kenneth", "Helen", "Kevin", "Sharon", "Brian", "Carol", "George", "Ruth",
-            "Edward", "Julie", "Ronald", "Joyce", "Timothy", "Virginia", "Jason", "Victoria", "Jeffrey", "Kelly"
-        ]
-        last_names = [
-            "Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez",
-            "Hernandez", "Lopez", "Gonzalez", "Wilson", "Anderson", "Thomas", "Taylor", "Moore", "Jackson", "Martin",
-            "Lee", "Perez", "Thompson", "White", "Harris", "Sanchez", "Clark", "Ramirez", "Lewis", "Robinson",
-            "Walker", "Young", "Allen", "King", "Wright", "Scott", "Torres", "Nguyen", "Hill", "Flores",
-            "Green", "Adams", "Nelson", "Baker", "Hall", "Rivera", "Campbell", "Mitchell", "Carter", "Roberts"
-        ]
-        cities = ["Boston", "Cambridge", "Worcester", "Springfield", "Lowell", "New Bedford", "Brockton", "Quincy", "Lynn", "Fall River"]
-        # Medical conditions for more realistic data
-        conditions = [
-            "Hypertension", "Diabetes Type 2", "Asthma", "Depression", "Anxiety", "Obesity", "Arthritis", "Heart Disease",
-            "High Cholesterol", "Migraine", "Insomnia", "GERD", "Allergies", "Back Pain", "Carpal Tunnel Syndrome"
-        ]
-        # Medications for more realistic data
-        medications = [
-            "Lisinopril", "Metformin", "Albuterol", "Sertraline", "Atorvastatin", "Omeprazole", "Ibuprofen", "Acetaminophen",
-            "Loratadine", "Melatonin", "Vitamin D", "Fish Oil", "Calcium", "Iron", "Folic Acid"
-        ]
-        patients = []
-        for i in range(population):
-            # Generate random name
-            first_name = random.choice(first_names)
-            last_name = random.choice(last_names)
-            full_name = f"{first_name} {last_name}"
-            # Generate random age
-            age = random.randint(age_min, age_max)
-            birth_date = datetime.now() - timedelta(days=age*365 + random.randint(0, 365))
-            # Generate random gender
-            if gender == "both":
-                patient_gender = random.choice(["male", "female"])
-            else:
-                patient_gender = gender
-            # Generate random address
-            street_number = random.randint(100, 9999)
-            street_name = random.choice(["Main St", "Oak Ave", "Elm St", "Maple Dr", "Cedar Ln", "Pine Rd", "Birch Way", "Willow Ct"])
-            city = random.choice(cities)
-            state = "MA"
-            postal_code = f"{random.randint(10000, 99999)}"
-            # Generate realistic medical data
-            patient_conditions = []
-            patient_medications = []
-            patient_encounters = []
-            patient_observations = []
-            patient_procedures = []
-            patient_immunizations = []
-            patient_allergies = []
-            # Add 0-3 random conditions
-            num_conditions = random.randint(0, 3)
-            for _ in range(num_conditions):
-                condition = random.choice(conditions)
-                if condition not in [c['code'] for c in patient_conditions]:
-                    patient_conditions.append({
-                        'id': f"condition-{random.randint(1000, 9999)}",
-                        'code': condition,
-                        'status': random.choice(['active', 'inactive', 'resolved']),
-                        'onset_date': (datetime.now() - timedelta(days=random.randint(30, 3650))).strftime('%Y-%m-%d'),
-                        'recorded_date': datetime.now().strftime('%Y-%m-%d'),
-                        'verification_status': 'confirmed',
-                        'category': 'diagnosis'
-                    })
-            # Add 0-4 random medications
-            num_medications = random.randint(0, 4)
-            for _ in range(num_medications):
-                medication = random.choice(medications)
-                if medication not in [m['name'] for m in patient_medications]:
-                    patient_medications.append({
-                        'id': f"med-{random.randint(1000, 9999)}",
-                        'name': medication,
-                        'status': random.choice(['active', 'discontinued', 'completed']),
-                        'prescribed_date': (datetime.now() - timedelta(days=random.randint(7, 365))).strftime('%Y-%m-%d'),
-                        'requester': f"Dr. {random.choice(['Smith', 'Johnson', 'Williams', 'Brown', 'Davis'])}",
-                        'dosage': f"{random.randint(1, 3)} tablet(s) daily",
-                        'intent': 'order',
-                        'priority': 'routine'
-                    })
-            # Add 1-5 encounters
-            num_encounters = random.randint(1, 5)
-            for j in range(num_encounters):
-                encounter_date = datetime.now() - timedelta(days=random.randint(1, 365))
-                patient_encounters.append({
-                    'id': f"encounter-{random.randint(1000, 9999)}",
-                    'type': random.choice(['Office Visit', 'Emergency Room', 'Hospital Admission', 'Telemedicine', 'Lab Visit']),
-                    'status': 'finished',
-                    'period': {
-                        'start': encounter_date.strftime('%Y-%m-%dT%H:%M:%S'),
-                        'end': (encounter_date + timedelta(hours=random.randint(1, 4))).strftime('%Y-%m-%dT%H:%M:%S')
-                    },
-                    'service_provider': f"{random.choice(['General Hospital', 'Medical Center', 'Clinic'])}",
-                    'class': 'ambulatory',
-                    'reason': random.choice(['Routine Checkup', 'Follow-up', 'Emergency', 'Lab Work', 'Consultation'])
-                })
-            # Add 2-8 observations
-            num_observations = random.randint(2, 8)
-            for _ in range(num_observations):
-                observation_date = datetime.now() - timedelta(days=random.randint(1, 365))
-                patient_observations.append({
-                    'id': f"obs-{random.randint(1000, 9999)}",
-                    'code': random.choice(['Blood Pressure', 'Heart Rate', 'Temperature', 'Weight', 'Height', 'Blood Glucose', 'Cholesterol']),
-                    'value': f"{random.randint(70, 200)}",
-                    'unit': random.choice(['mmHg', 'bpm', '°F', 'lbs', 'inches', 'mg/dL']),
-                    'status': 'final',
-                    'effective_date': observation_date.strftime('%Y-%m-%dT%H:%M:%S'),
-                    'category': 'vital-signs'
-                })
-            patient_data = {
-                'fhir_id': f"mock-patient-{i+1}",
-                'full_name': full_name,
-                'gender': patient_gender,
-                'date_of_birth': birth_date.strftime('%Y-%m-%d'),
-                'address': f"{street_number} {street_name}",
-                'city': city,
-                'state': state,
-                'postal_code': postal_code,
-                'country': 'US',
-                'marital_status': random.choice(['single', 'married', 'divorced', 'widowed']),
-                'language': 'English',
-                'source': 'synthea_mock',
-                'import_date': datetime.utcnow().isoformat(),
-                'last_updated': datetime.utcnow().isoformat(),
-                'conditions': patient_conditions,
-                'medications': patient_medications,
-                'encounters': patient_encounters,
-                'observations': patient_observations,
-                'procedures': patient_procedures,
-                'immunizations': patient_immunizations,
-                'allergies': patient_allergies
-            }
-            patients.append(patient_data)
-        # Save mock patients to database
-        if patients:
-            db_result = await self.save_patients_to_database(patients)
-            logger.info(f"💾 Mock patients database save result: {db_result}")
-        else:
-            db_result = {"saved_count": 0, "failed_count": 0, "errors": ["No mock patients to save"], "success": False}
-        return {
-            "status": "success",
-            "generated_patients": len(patients),
-            "saved_to_database": db_result["saved_count"],
-            "failed_to_save": db_result["failed_count"],
-            "database_errors": db_result["errors"],
-            "patients": patients,
-            "config": {
-                "population": population,
-                "age_min": age_min,
-                "age_max": age_max,
-                "gender": gender,
-                "location": location
-            },
-            "output_directory": "mock_data",
-            "source": "synthea_mock"
-        }
     async def get_synthea_statistics(self) -> Dict[str, Any]:
         """
         Get statistics about Synthea capabilities and generated data
@@ -873,8 +822,7 @@ class SyntheaIntegrationService:
                 "synthea_available": False,
                 "java_available": False,
                 "directories_accessible": False,
-                "environment": "local",
-                "using_mock_data": False
             }
             # Set environment info
@@ -883,8 +831,7 @@ class SyntheaIntegrationService:
             else:
                 stats["environment"] = "local"
-            # Always try to use real Synthea
-            stats["using_mock_data"] = False
             # Check if directories are accessible
             try:
@@ -957,6 +904,5 @@ class SyntheaIntegrationService:
                 "synthea_available": False,
                 "java_available": False,
                 "directories_accessible": False,
-                "environment": "unknown",
-                "using_mock_data": True
             }

         # Check if we're in a containerized environment (like Hugging Face Spaces)
         self.is_containerized = os.path.exists('/.dockerenv') or os.environ.get('HF_SPACE_ID') is not None
+        # Check if Java is available locally
+        self.java_available = self._check_java_availability()
+        # Always use real Synthea data - no fallback to mock data
         self.use_mock_data = False
         # Try multiple directory locations for better compatibility
             logger.warning("⚠️ No writable temp directory found, using current directory")
         self.synthea_dir = base_temp_dir / "cps_synthea"
+        # Use the actual output directory where Synthea creates files
+        self.output_dir = Path.cwd() / "output" / "fhir"
         self.synthea_jar_path = self.synthea_dir / "synthea-with-dependencies.jar"
         # Try to create directories
             logger.info(f"✅ Using directories: synthea={self.synthea_dir}, output={self.output_dir}")
         except Exception as e:
             logger.warning(f"⚠️ Could not create directories: {e}, will try to use existing paths")
+        # Log the configuration
+        logger.info("🚀 Using real Synthea generation (no fallback to mock data)")
         # Synthea configuration
         self.default_config = {
             "exporter.fhir.include_practitioners": "false"
         }
+    def _check_java_availability(self) -> bool:
+        """
+        Check if Java is available in the system
+
+        Runs ``java -version`` and reports whether it exited with status 0.
+
+        Returns:
+            True when the command ran and returned 0; False when the
+            executable could not be launched, did not finish within
+            10 seconds, or exited with a non-zero status.
+        """
+        # Imported ahead of the try block so the except clause below can
+        # always resolve the subprocess exception classes.
+        import subprocess
+        try:
+            # Only the exit status is used, so the version banner is
+            # discarded instead of being captured and decoded.
+            result = subprocess.run(['java', '-version'],
+                                    stdout=subprocess.DEVNULL,
+                                    stderr=subprocess.DEVNULL,
+                                    timeout=10)
+        except (subprocess.SubprocessError, OSError):
+            # SubprocessError covers TimeoutExpired; OSError covers
+            # FileNotFoundError (java not on PATH) and other launch failures.
+            return False
+        return result.returncode == 0
     async def download_synthea(self) -> bool:
         """
         Download Synthea JAR file if not present
                 file.unlink()
             # Run Synthea with command line arguments (more reliable)
+            # Use the parent directory of output/fhir as the output directory
+            output_parent = Path.cwd() / "output"
             cmd = [
                 "java", "-jar", str(self.synthea_jar_path),
                 "-p", str(population),
+                "-o", str(output_parent.absolute()),
                 "--seed", str(int(datetime.now().timestamp())),
                 "--exporter.fhir.transaction_bundle=true",
                 "--exporter.fhir.include_patient_summary=true",
             logger.info(f"Output directory exists before generation: {self.output_dir.exists()}")
             # Try multiple working directories for better compatibility
+            working_dirs = [str(Path.cwd()), str(self.synthea_dir), str(self.output_dir)]
             process = None
             for working_dir in working_dirs:
             if process.returncode == 0:
                 logger.info("✅ Synthea generation completed successfully")
+                # Handle potential encoding issues with subprocess output
+                try:
+                    output_text = stdout.decode('utf-8', errors='ignore')
+                    logger.info(f"Output: {output_text}")
+                except Exception as decode_error:
+                    logger.warning(f"⚠️ Could not decode stdout: {decode_error}")
+                    logger.info("✅ Synthea generation completed successfully (output not displayed due to encoding)")
                 # Debug: Check what files were actually created
                 logger.info(f"🔍 Checking output directory immediately after generation: {self.output_dir}")
                     for subdir in subdirs:
                         json_files = list(subdir.glob("*.json"))
                         logger.info(f"📁 JSON files in {subdir.name}: {[f.name for f in json_files]}")
+                    # Specifically check for fhir subdirectory
+                    fhir_dir = self.output_dir / "fhir"
+                    if fhir_dir.exists():
+                        fhir_files = list(fhir_dir.glob("*.json"))
+                        logger.info(f"📁 JSON files in fhir subdirectory: {[f.name for f in fhir_files]}")
+                    else:
+                        logger.warning(f"⚠️ FHIR subdirectory does not exist: {fhir_dir}")
                     # Also check if files were created in the working directory
                     working_dir_files = list(Path.cwd().glob("*.json"))
                     synthea_dir_files = list(self.synthea_dir.glob("*.json"))
                     logger.info(f"📁 JSON files in synthea directory: {[f.name for f in synthea_dir_files]}")
+                    # Check what files were created in the working directory where Synthea actually ran
+                    for working_dir in working_dirs:
+                        if Path(working_dir).exists():
+                            working_dir_files = list(Path(working_dir).glob("*.json"))
+                            logger.info(f"📁 JSON files in working directory {working_dir}: {[f.name for f in working_dir_files]}")
+                            # Also check subdirectories in the working directory
+                            for subdir in Path(working_dir).iterdir():
+                                if subdir.is_dir():
+                                    subdir_files = list(subdir.glob("*.json"))
+                                    if subdir_files:
+                                        logger.info(f"📁 JSON files in subdirectory {subdir}: {[f.name for f in subdir_files]}")
                 else:
                     logger.warning(f"⚠️ Output directory does not exist: {self.output_dir}")
                 return True
             else:
+                # Handle potential encoding issues with stderr
+                try:
+                    error_output = stderr.decode('utf-8', errors='ignore')
+                except Exception as decode_error:
+                    error_output = f"Could not decode error output: {decode_error}"
                 logger.error(f"❌ Synthea generation failed with return code {process.returncode}")
                 logger.error(f"Error output: {error_output}")
                 return False
             logger.error(f"❌ Error running Synthea: {str(e)}")
             return False
+    async def process_synthea_output(self, require_medical_data: bool = False) -> List[Dict[str, Any]]:
         """
         Process Synthea output files and convert to application format
         """
             # List of directories to search for Synthea output
             search_dirs = [
+                self.output_dir,  # The actual fhir directory where files are created
+                Path.cwd() / "output" / "fhir",  # Explicit path to fhir directory
+                Path.cwd() / "output",  # Parent output directory
+                Path.cwd(),  # Current working directory
                 Path('/tmp'),
                 Path('/app'),
                 Path('/app/tmp')
                 return []
             # Process each patient file
+            valid_patients = 0
+            invalid_patients = 0
             for file_path in patient_files:
                 try:
                     logger.info(f"📄 Processing file: {file_path}")
                     patient_data = await self._extract_patient_data(bundle, file_path.name)
                     if patient_data:
+                        # Validate patient data completeness
+                        if self._validate_patient_data_completeness(patient_data, require_medical_data):
+                            patients.append(patient_data)
+                            valid_patients += 1
+                            logger.info(f"✅ Validated and extracted patient: {patient_data.get('full_name', 'Unknown')}")
+                        else:
+                            invalid_patients += 1
+                            logger.warning(f"❌ Patient validation failed: {patient_data.get('full_name', 'Unknown')}")
                     else:
                         logger.warning(f"⚠️ No patient data extracted from {file_path}")
                     logger.error(f"❌ Error processing {file_path}: {str(e)}")
                     continue
+            logger.info(f"📊 Patient validation summary: {valid_patients} valid, {invalid_patients} invalid")
             logger.info(f"✅ Successfully processed {len(patients)} patients from Synthea output")
             return patients
         except:
             return []
+    def _validate_patient_data_completeness(self, patient_data: Dict[str, Any], require_medical_data: bool = True) -> bool:
+        """
+        Validate that a patient has all required data fields
+
+        Args:
+            patient_data: The patient data to validate
+            require_medical_data: If True, patient must have at least one
+                condition, medication, encounter, or observation (default: True)
+
+        Returns:
+            True if every check passes, False otherwise. Each rejection is
+            logged and the method does not raise.
+        """
+        try:
+            # Looked up once; used only to label log messages
+            patient_name = patient_data.get('full_name', 'Unknown')
+            # Required basic fields - all must be present and not empty
+            required_fields = (
+                'full_name', 'gender', 'date_of_birth', 'address',
+                'city', 'state', 'postal_code', 'country'
+            )
+            for field in required_fields:
+                value = patient_data.get(field)
+                if not value or (isinstance(value, str) and not value.strip()):
+                    logger.warning(f"⚠️ Missing or empty required field '{field}' for patient {patient_name}")
+                    return False
+            # The loop above already rejects blank names; a non-text name is
+            # the only remaining invalid case
+            if not isinstance(patient_data['full_name'], str):
+                logger.warning("⚠️ Empty or invalid name for patient")
+                return False
+            # Validate gender is valid
+            gender = patient_data['gender']
+            if gender not in ('male', 'female', 'other', 'unknown'):
+                logger.warning(f"⚠️ Invalid gender '{gender}' for patient {patient_name}")
+                return False
+            # Validate date of birth format; TypeError covers values that are
+            # not strings at all
+            date_of_birth = patient_data['date_of_birth']
+            try:
+                datetime.strptime(date_of_birth, '%Y-%m-%d')
+            except (TypeError, ValueError):
+                logger.warning(f"⚠️ Invalid date of birth format '{date_of_birth}' for patient {patient_name}")
+                return False
+            # For complete data, we require medical data
+            if require_medical_data:
+                medical_keys = ('conditions', 'medications', 'encounters', 'observations')
+                # Truthiness treats a missing key, None and an empty list alike,
+                # so one null category no longer hides data in another
+                if not any(patient_data.get(key) for key in medical_keys):
+                    logger.warning(f"❌ Patient {patient_name} rejected: no medical data (conditions, medications, encounters, or observations)")
+                    return False
+                logger.info(f"✅ Patient {patient_name} has medical data")
+            logger.info(f"✅ Patient {patient_name} passed complete validation")
+            return True
+        except Exception as e:
+            # Boundary guard: validation must answer True/False, never raise
+            logger.error(f"❌ Error validating patient data: {str(e)}")
+            return False
     async def save_patients_to_database(self, patients: List[Dict[str, Any]]) -> Dict[str, Any]:
         """
         Save generated patients directly to the database
                         'marital_status': patient.get('marital_status', ''),
                         'language': patient.get('language', 'English'),
                         'source': patient.get('source', 'synthea'),
+                        'status': 'active',
+                        'created_at': datetime.utcnow(),
+                        'updated_at': datetime.utcnow(),
                         'import_date': datetime.utcnow(),
                         'last_updated': datetime.utcnow(),
                         'conditions': patient.get('conditions', []),
         age_min: int = 18,
         age_max: int = 80,
         gender: str = "both",
+        location: str = "Massachusetts",
+        require_medical_data: bool = True
     ) -> Dict[str, Any]:
         """
         Complete workflow: generate Synthea data and prepare for import
         try:
             logger.info(f"🎯 Starting Synthea generation for {population} patients")
+            # Always use real Synthea - no fallback to mock data
             try:
                 # Download Synthea if needed
                 if not await self.download_synthea():
                     logger.error("❌ Synthea generation failed")
                     raise Exception("Synthea generation failed")
+                # Process output - only get patients with complete data
+                patients = await self.process_synthea_output(require_medical_data=True)
                 if not patients:
+                    logger.error("❌ No patients with complete data generated from Synthea")
+                    raise Exception("No patients with complete data generated from Synthea")
+                # Limit to exactly 10 patients with complete data
+                if len(patients) > 10:
+                    logger.info(f"📊 Limiting from {len(patients)} to 10 patients with complete data")
+                    patients = patients[:10]
+                logger.info(f"📊 Final patient count for database storage: {len(patients)}")
                 # Save patients to database
                 db_result = await self.save_patients_to_database(patients)
                     "patients": patients,
                     "config": config_overrides,
                     "output_directory": str(self.output_dir),
+                    "source": "synthea_real",
+                    "message": f"Successfully stored {db_result['saved_count']} patients with complete data to database"
                 }
             except Exception as e:
                 logger.error(f"❌ Synthea integration failed: {str(e)}")
+                # No fallback to mock data - raise the error
+                raise Exception(f"Synthea generation failed: {str(e)}")
         except Exception as e:
             logger.error(f"❌ Error in generate_and_import_patients: {str(e)}")
                 detail=f"Patient generation failed: {str(e)}"
             )
     async def get_synthea_statistics(self) -> Dict[str, Any]:
         """
         Get statistics about Synthea capabilities and generated data
                 "synthea_available": False,
                 "java_available": False,
                 "directories_accessible": False,
+                "environment": "local"
             }
             # Set environment info
             else:
                 stats["environment"] = "local"
+            # Always use real Synthea - no mock data fallback
             # Check if directories are accessible
             try:
                 "synthea_available": False,
                 "java_available": False,
                 "directories_accessible": False,
+                "environment": "unknown"
             }

data/new_tool.json DELETED Viewed

	@@ -1 +0,0 @@
1	- []