Ali2206 committed on
Commit
f0d524c
·
1 Parent(s): 25dd79e

device token

Browse files
api/routes/patients.py CHANGED
@@ -726,6 +726,18 @@ async def get_patients(
726
  if patient.get("date_of_birth") == "":
727
  patient["date_of_birth"] = None
728
 
 
 
 
 
 
 
 
 
 
 
 
 
729
  processed_patients.append(patient)
730
 
731
  logger.info(f"✅ Returning {len(processed_patients)} processed patients")
@@ -1170,14 +1182,20 @@ async def update_patient(
1170
  @router.post("/patients/import-hapi-fhir", status_code=status.HTTP_201_CREATED)
1171
  async def import_hapi_patients(
1172
  limit: int = Query(20, ge=1, le=100, description="Number of patients to import"),
 
 
1173
  current_user: dict = Depends(get_current_user)
1174
  ):
1175
  """
1176
- Import patients from HAPI FHIR Test Server
1177
  """
1178
  try:
1179
  service = HAPIFHIRIntegrationService()
1180
- result = await service.import_patients_from_hapi(limit=limit)
 
 
 
 
1181
 
1182
  # Create detailed message
1183
  message_parts = []
@@ -1185,6 +1203,8 @@ async def import_hapi_patients(
1185
  message_parts.append(f"Successfully imported {result['imported_count']} patients")
1186
  if result["skipped_count"] > 0:
1187
  message_parts.append(f"Skipped {result['skipped_count']} duplicate patients")
 
 
1188
  if result["errors"]:
1189
  message_parts.append(f"Encountered {len(result['errors'])} errors")
1190
 
@@ -1194,9 +1214,12 @@ async def import_hapi_patients(
1194
  "message": message,
1195
  "imported_count": result["imported_count"],
1196
  "skipped_count": result["skipped_count"],
 
1197
  "total_found": result["total_found"],
1198
  "imported_patients": result["imported_patients"],
1199
  "skipped_patients": result["skipped_patients"],
 
 
1200
  "errors": result["errors"],
1201
  "source": "hapi_fhir"
1202
  }
@@ -1386,17 +1409,12 @@ async def fetch_ehr_data(
1386
  "field_mapping": ehr_system_config["field_mapping"]
1387
  }
1388
  else:
1389
- # For other EHR systems, we would implement actual FHIR client calls
1390
- # For now, return a sample structure
1391
- sample_data = generate_sample_ehr_data(ehr_system, limit)
1392
-
1393
- return {
1394
- "ehr_system": ehr_system,
1395
- "data": sample_data,
1396
- "total_count": len(sample_data),
1397
- "field_mapping": ehr_system_config["field_mapping"],
1398
- "note": "This is sample data. Implement actual FHIR client integration for production use."
1399
- }
1400
 
1401
  except Exception as e:
1402
  logger.error(f"Error fetching EHR data: {str(e)}")
@@ -1405,89 +1423,7 @@ async def fetch_ehr_data(
1405
  detail=f"Failed to fetch EHR data: {str(e)}"
1406
  )
1407
 
1408
- def generate_sample_ehr_data(ehr_system: str, limit: int) -> List[dict]:
1409
- """
1410
- Generate sample EHR data for testing purposes
1411
- """
1412
- import random
1413
- from datetime import date, timedelta
1414
-
1415
- sample_names = [
1416
- "John Smith", "Jane Doe", "Michael Johnson", "Sarah Wilson", "David Brown",
1417
- "Emily Davis", "Robert Miller", "Lisa Garcia", "James Rodriguez", "Maria Martinez",
1418
- "Christopher Anderson", "Jennifer Taylor", "Daniel Thomas", "Amanda Jackson",
1419
- "Matthew White", "Nicole Harris", "Joshua Martin", "Stephanie Thompson"
1420
- ]
1421
-
1422
- sample_addresses = [
1423
- "123 Main St, New York, NY 10001",
1424
- "456 Oak Ave, Los Angeles, CA 90210",
1425
- "789 Pine Rd, Chicago, IL 60601",
1426
- "321 Elm St, Houston, TX 77001",
1427
- "654 Maple Dr, Phoenix, AZ 85001"
1428
- ]
1429
-
1430
- sample_allergies = [
1431
- "Penicillin", "Peanuts", "Latex", "Shellfish", "Dairy", "Eggs", "Soy", "Wheat"
1432
- ]
1433
-
1434
- sample_conditions = [
1435
- "Hypertension", "Diabetes Type 2", "Asthma", "Depression", "Anxiety",
1436
- "Obesity", "Arthritis", "Heart Disease", "Chronic Kidney Disease", "COPD"
1437
- ]
1438
-
1439
- sample_medications = [
1440
- "Lisinopril", "Metformin", "Albuterol", "Sertraline", "Atorvastatin",
1441
- "Omeprazole", "Amlodipine", "Losartan", "Simvastatin", "Hydrochlorothiazide"
1442
- ]
1443
-
1444
- sample_insurance = [
1445
- ("Blue Cross Blue Shield", "BCBS123456"),
1446
- ("Aetna", "AET789012"),
1447
- ("Cigna", "CIG345678"),
1448
- ("UnitedHealth", "UHC901234"),
1449
- ("Humana", "HUM567890")
1450
- ]
1451
-
1452
- data = []
1453
- for i in range(min(limit, 18)):
1454
- # Generate random date of birth (18-80 years old)
1455
- years_old = random.randint(18, 80)
1456
- birth_date = date.today() - timedelta(days=years_old * 365 + random.randint(0, 365))
1457
-
1458
- # Generate random allergies and conditions
1459
- patient_allergies = random.sample(sample_allergies, random.randint(0, 3))
1460
- patient_conditions = random.sample(sample_conditions, random.randint(0, 2))
1461
- patient_medications = random.sample(sample_medications, random.randint(0, 3))
1462
-
1463
- # Generate emergency contact
1464
- emergency_contact = random.choice(sample_names)
1465
- while emergency_contact == sample_names[i % len(sample_names)]:
1466
- emergency_contact = random.choice(sample_names)
1467
-
1468
- # Generate insurance info
1469
- insurance_provider, insurance_policy = random.choice(sample_insurance)
1470
-
1471
- patient_data = {
1472
- "ehr_id": f"{ehr_system.upper()}{str(i+1).zfill(3)}",
1473
- "full_name": sample_names[i % len(sample_names)],
1474
- "date_of_birth": birth_date.strftime("%Y-%m-%d"),
1475
- "gender": random.choice(["male", "female"]),
1476
- "address": random.choice(sample_addresses),
1477
- "national_id": f"{random.randint(100000000, 999999999)}",
1478
- "blood_type": random.choice(["A+", "A-", "B+", "B-", "AB+", "AB-", "O+", "O-"]),
1479
- "allergies": patient_allergies,
1480
- "chronic_conditions": patient_conditions,
1481
- "medications": patient_medications,
1482
- "emergency_contact_name": emergency_contact,
1483
- "emergency_contact_phone": f"555-{random.randint(100, 999)}-{random.randint(1000, 9999)}",
1484
- "insurance_provider": insurance_provider,
1485
- "insurance_policy_number": insurance_policy
1486
- }
1487
-
1488
- data.append(patient_data)
1489
-
1490
- return data
1491
 
1492
  @router.post("/patients/generate-synthea", status_code=status.HTTP_201_CREATED)
1493
  async def generate_synthea_patients(
@@ -1644,6 +1580,7 @@ async def generate_and_import_synthea_patients(
1644
  age_max: int = Query(80, ge=0, le=120, description="Maximum age for generated patients"),
1645
  gender: str = Query("both", description="Gender distribution: male, female, or both"),
1646
  location: str = Query("Massachusetts", description="Location for generated patients"),
 
1647
  current_user: dict = Depends(get_current_user)
1648
  ):
1649
  """
@@ -1666,7 +1603,8 @@ async def generate_and_import_synthea_patients(
1666
  age_min=age_min,
1667
  age_max=age_max,
1668
  gender=gender,
1669
- location=location
 
1670
  )
1671
 
1672
  if not generation_result['patients']:
 
726
  if patient.get("date_of_birth") == "":
727
  patient["date_of_birth"] = None
728
 
729
+ # Add missing required fields for Synthea patients
730
+ if "status" not in patient:
731
+ patient["status"] = "active"
732
+ if "created_at" not in patient:
733
+ patient["created_at"] = patient.get("import_date", datetime.utcnow())
734
+ if "updated_at" not in patient:
735
+ patient["updated_at"] = patient.get("last_updated", datetime.utcnow())
736
+
737
+ # Ensure source field is present
738
+ if "source" not in patient:
739
+ patient["source"] = "synthea"
740
+
741
  processed_patients.append(patient)
742
 
743
  logger.info(f"✅ Returning {len(processed_patients)} processed patients")
 
1182
  @router.post("/patients/import-hapi-fhir", status_code=status.HTTP_201_CREATED)
1183
  async def import_hapi_patients(
1184
  limit: int = Query(20, ge=1, le=100, description="Number of patients to import"),
1185
+ require_medical_data: bool = Query(False, description="Require patients to have medical data (conditions, medications, encounters, or observations)"),
1186
+ min_completeness_score: float = Query(0.7, ge=0.0, le=1.0, description="Minimum validation score (0-1) for a patient to be considered complete"),
1187
  current_user: dict = Depends(get_current_user)
1188
  ):
1189
  """
1190
+ Import patients from HAPI FHIR Test Server with data completeness validation
1191
  """
1192
  try:
1193
  service = HAPIFHIRIntegrationService()
1194
+ result = await service.import_patients_from_hapi(
1195
+ limit=limit,
1196
+ require_medical_data=require_medical_data,
1197
+ min_completeness_score=min_completeness_score
1198
+ )
1199
 
1200
  # Create detailed message
1201
  message_parts = []
 
1203
  message_parts.append(f"Successfully imported {result['imported_count']} patients")
1204
  if result["skipped_count"] > 0:
1205
  message_parts.append(f"Skipped {result['skipped_count']} duplicate patients")
1206
+ if result["filtered_count"] > 0:
1207
+ message_parts.append(f"Filtered out {result['filtered_count']} incomplete patients")
1208
  if result["errors"]:
1209
  message_parts.append(f"Encountered {len(result['errors'])} errors")
1210
 
 
1214
  "message": message,
1215
  "imported_count": result["imported_count"],
1216
  "skipped_count": result["skipped_count"],
1217
+ "filtered_count": result["filtered_count"],
1218
  "total_found": result["total_found"],
1219
  "imported_patients": result["imported_patients"],
1220
  "skipped_patients": result["skipped_patients"],
1221
+ "filtered_patients": result["filtered_patients"],
1222
+ "validation_summary": result["validation_summary"],
1223
  "errors": result["errors"],
1224
  "source": "hapi_fhir"
1225
  }
 
1409
  "field_mapping": ehr_system_config["field_mapping"]
1410
  }
1411
  else:
1412
+ # For other EHR systems, we don't support hardcoded data
1413
+ # Only real Synthea data is supported
1414
+ raise HTTPException(
1415
+ status_code=status.HTTP_400_BAD_REQUEST,
1416
+ detail=f"EHR system '{ehr_system}' is not supported. Only HAPI FHIR Test Server and Synthea are supported for real data generation."
1417
+ )
 
 
 
 
 
1418
 
1419
  except Exception as e:
1420
  logger.error(f"Error fetching EHR data: {str(e)}")
 
1423
  detail=f"Failed to fetch EHR data: {str(e)}"
1424
  )
1425
 
1426
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1427
 
1428
  @router.post("/patients/generate-synthea", status_code=status.HTTP_201_CREATED)
1429
  async def generate_synthea_patients(
 
1580
  age_max: int = Query(80, ge=0, le=120, description="Maximum age for generated patients"),
1581
  gender: str = Query("both", description="Gender distribution: male, female, or both"),
1582
  location: str = Query("Massachusetts", description="Location for generated patients"),
1583
+ require_medical_data: bool = Query(True, description="Require patients to have medical data (conditions, medications, encounters, or observations)"),
1584
  current_user: dict = Depends(get_current_user)
1585
  ):
1586
  """
 
1603
  age_min=age_min,
1604
  age_max=age_max,
1605
  gender=gender,
1606
+ location=location,
1607
+ require_medical_data=require_medical_data
1608
  )
1609
 
1610
  if not generation_result['patients']:
api/services/fhir_integration.py CHANGED
@@ -11,9 +11,90 @@ class HAPIFHIRIntegrationService:
11
  def __init__(self):
12
  self.fhir_client = HAPIFHIRClient()
13
 
14
- async def import_patients_from_hapi(self, limit: int = 20) -> dict:
15
  """
16
- Import patients from HAPI FHIR Test Server with detailed feedback
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  """
18
  try:
19
  print(f"Fetching {limit} patients from HAPI FHIR...")
@@ -24,19 +105,34 @@ class HAPIFHIRIntegrationService:
24
  return {
25
  "imported_count": 0,
26
  "skipped_count": 0,
 
27
  "total_found": 0,
28
  "imported_patients": [],
29
  "skipped_patients": [],
 
 
30
  "errors": []
31
  }
32
 
33
- print(f"Found {len(patients)} patients, checking for duplicates...")
34
 
35
  imported_count = 0
36
  skipped_count = 0
 
37
  imported_patients = []
38
  skipped_patients = []
 
39
  errors = []
 
 
 
 
 
 
 
 
 
 
40
 
41
  for patient in patients:
42
  try:
@@ -58,13 +154,48 @@ class HAPIFHIRIntegrationService:
58
  # Enhance patient data with additional FHIR data
59
  enhanced_patient = await self._enhance_patient_data(patient)
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  # Insert into database
62
  result = await db.patients.insert_one(enhanced_patient)
63
 
64
  if result.inserted_id:
65
  imported_count += 1
66
- imported_patients.append(patient['full_name'])
67
- print(f"Imported patient: {patient['full_name']} (ID: {result.inserted_id})")
 
 
 
 
 
68
 
69
  except Exception as e:
70
  error_msg = f"Error importing patient {patient.get('full_name', 'Unknown')}: {e}"
@@ -72,14 +203,22 @@ class HAPIFHIRIntegrationService:
72
  print(error_msg)
73
  continue
74
 
75
- print(f"Import completed: {imported_count} imported, {skipped_count} skipped")
 
 
 
 
 
76
 
77
  return {
78
  "imported_count": imported_count,
79
  "skipped_count": skipped_count,
 
80
  "total_found": len(patients),
81
  "imported_patients": imported_patients,
82
  "skipped_patients": skipped_patients,
 
 
83
  "errors": errors
84
  }
85
 
@@ -88,9 +227,12 @@ class HAPIFHIRIntegrationService:
88
  return {
89
  "imported_count": 0,
90
  "skipped_count": 0,
 
91
  "total_found": 0,
92
  "imported_patients": [],
93
  "skipped_patients": [],
 
 
94
  "errors": [str(e)]
95
  }
96
 
 
11
  def __init__(self):
12
  self.fhir_client = HAPIFHIRClient()
13
 
14
+ def _validate_patient_data_completeness(self, patient: Dict, require_medical_data: bool = False) -> Dict[str, any]:
15
  """
16
+ Validate if a patient has complete data
17
+
18
+ Args:
19
+ patient: Patient data dictionary
20
+ require_medical_data: Whether to require medical data (observations, medications, conditions)
21
+
22
+ Returns:
23
+ Dict with validation results:
24
+ {
25
+ "is_complete": bool,
26
+ "missing_fields": List[str],
27
+ "has_medical_data": bool,
28
+ "validation_score": float (0-1)
29
+ }
30
+ """
31
+ required_demographic_fields = [
32
+ 'full_name', 'gender', 'date_of_birth', 'address'
33
+ ]
34
+
35
+ optional_demographic_fields = [
36
+ 'phone', 'email', 'marital_status', 'language'
37
+ ]
38
+
39
+ medical_data_fields = [
40
+ 'observations', 'medications', 'conditions'
41
+ ]
42
+
43
+ missing_fields = []
44
+ validation_score = 0.0
45
+ total_fields = len(required_demographic_fields) + len(optional_demographic_fields)
46
+ present_fields = 0
47
+
48
+ # Check required demographic fields
49
+ for field in required_demographic_fields:
50
+ value = patient.get(field, '')
51
+ if not value or (isinstance(value, str) and value.strip() == ''):
52
+ missing_fields.append(field)
53
+ else:
54
+ present_fields += 1
55
+
56
+ # Check optional demographic fields
57
+ for field in optional_demographic_fields:
58
+ value = patient.get(field, '')
59
+ if value and (not isinstance(value, str) or value.strip() != ''):
60
+ present_fields += 1
61
+
62
+ # Check medical data
63
+ has_medical_data = False
64
+ if 'clinical_data' in patient:
65
+ clinical_data = patient['clinical_data']
66
+ for field in medical_data_fields:
67
+ if field in clinical_data and clinical_data[field]:
68
+ has_medical_data = True
69
+ break
70
+
71
+ # Calculate validation score
72
+ validation_score = present_fields / total_fields if total_fields > 0 else 0.0
73
+
74
+ # Determine if patient is complete
75
+ is_complete = len(missing_fields) == 0 and validation_score >= 0.7
76
+
77
+ # If medical data is required, check if patient has it
78
+ if require_medical_data and not has_medical_data:
79
+ is_complete = False
80
+ missing_fields.append('medical_data')
81
+
82
+ return {
83
+ "is_complete": is_complete,
84
+ "missing_fields": missing_fields,
85
+ "has_medical_data": has_medical_data,
86
+ "validation_score": validation_score,
87
+ "demographic_completeness": present_fields / len(required_demographic_fields + optional_demographic_fields) if (len(required_demographic_fields) + len(optional_demographic_fields)) > 0 else 0.0
88
+ }
89
+
90
+ async def import_patients_from_hapi(self, limit: int = 20, require_medical_data: bool = False, min_completeness_score: float = 0.7) -> dict:
91
+ """
92
+ Import patients from HAPI FHIR Test Server with data completeness validation
93
+
94
+ Args:
95
+ limit: Number of patients to fetch from HAPI FHIR
96
+ require_medical_data: Whether to require patients to have medical data
97
+ min_completeness_score: Minimum validation score (0-1) for a patient to be considered complete
98
  """
99
  try:
100
  print(f"Fetching {limit} patients from HAPI FHIR...")
 
105
  return {
106
  "imported_count": 0,
107
  "skipped_count": 0,
108
+ "filtered_count": 0,
109
  "total_found": 0,
110
  "imported_patients": [],
111
  "skipped_patients": [],
112
+ "filtered_patients": [],
113
+ "validation_summary": {},
114
  "errors": []
115
  }
116
 
117
+ print(f"Found {len(patients)} patients, checking for duplicates and data completeness...")
118
 
119
  imported_count = 0
120
  skipped_count = 0
121
+ filtered_count = 0
122
  imported_patients = []
123
  skipped_patients = []
124
+ filtered_patients = []
125
  errors = []
126
+ validation_summary = {
127
+ "total_processed": len(patients),
128
+ "complete_patients": 0,
129
+ "incomplete_patients": 0,
130
+ "with_medical_data": 0,
131
+ "without_medical_data": 0,
132
+ "average_completeness_score": 0.0
133
+ }
134
+
135
+ total_completeness_score = 0.0
136
 
137
  for patient in patients:
138
  try:
 
154
  # Enhance patient data with additional FHIR data
155
  enhanced_patient = await self._enhance_patient_data(patient)
156
 
157
+ # Validate data completeness
158
+ validation_result = self._validate_patient_data_completeness(
159
+ enhanced_patient,
160
+ require_medical_data=require_medical_data
161
+ )
162
+
163
+ # Update validation summary
164
+ total_completeness_score += validation_result["validation_score"]
165
+
166
+ if validation_result["has_medical_data"]:
167
+ validation_summary["with_medical_data"] += 1
168
+ else:
169
+ validation_summary["without_medical_data"] += 1
170
+
171
+ # Check if patient meets completeness criteria
172
+ if not validation_result["is_complete"] or validation_result["validation_score"] < min_completeness_score:
173
+ filtered_count += 1
174
+ filtered_patients.append({
175
+ "name": patient['full_name'],
176
+ "fhir_id": patient['fhir_id'],
177
+ "missing_fields": validation_result["missing_fields"],
178
+ "completeness_score": validation_result["validation_score"],
179
+ "has_medical_data": validation_result["has_medical_data"]
180
+ })
181
+ print(f"Patient {patient['full_name']} filtered out - missing: {validation_result['missing_fields']}, score: {validation_result['validation_score']:.2f}")
182
+ validation_summary["incomplete_patients"] += 1
183
+ continue
184
+
185
+ validation_summary["complete_patients"] += 1
186
+
187
  # Insert into database
188
  result = await db.patients.insert_one(enhanced_patient)
189
 
190
  if result.inserted_id:
191
  imported_count += 1
192
+ imported_patients.append({
193
+ "name": patient['full_name'],
194
+ "fhir_id": patient['fhir_id'],
195
+ "completeness_score": validation_result["validation_score"],
196
+ "has_medical_data": validation_result["has_medical_data"]
197
+ })
198
+ print(f"Imported patient: {patient['full_name']} (ID: {result.inserted_id}, Score: {validation_result['validation_score']:.2f})")
199
 
200
  except Exception as e:
201
  error_msg = f"Error importing patient {patient.get('full_name', 'Unknown')}: {e}"
 
203
  print(error_msg)
204
  continue
205
 
206
+ # Calculate average completeness score
207
+ if validation_summary["total_processed"] > 0:
208
+ validation_summary["average_completeness_score"] = total_completeness_score / validation_summary["total_processed"]
209
+
210
+ print(f"Import completed: {imported_count} imported, {skipped_count} skipped, {filtered_count} filtered out")
211
+ print(f"Validation summary: {validation_summary}")
212
 
213
  return {
214
  "imported_count": imported_count,
215
  "skipped_count": skipped_count,
216
+ "filtered_count": filtered_count,
217
  "total_found": len(patients),
218
  "imported_patients": imported_patients,
219
  "skipped_patients": skipped_patients,
220
+ "filtered_patients": filtered_patients,
221
+ "validation_summary": validation_summary,
222
  "errors": errors
223
  }
224
 
 
227
  return {
228
  "imported_count": 0,
229
  "skipped_count": 0,
230
+ "filtered_count": 0,
231
  "total_found": 0,
232
  "imported_patients": [],
233
  "skipped_patients": [],
234
+ "filtered_patients": [],
235
+ "validation_summary": {},
236
  "errors": [str(e)]
237
  }
238
 
api/services/synthea_integration.py CHANGED
@@ -28,7 +28,10 @@ class SyntheaIntegrationService:
28
  # Check if we're in a containerized environment (like Hugging Face Spaces)
29
  self.is_containerized = os.path.exists('/.dockerenv') or os.environ.get('HF_SPACE_ID') is not None
30
 
31
- # Always try to use real Synthea first, regardless of environment
 
 
 
32
  self.use_mock_data = False
33
 
34
  # Try multiple directory locations for better compatibility
@@ -55,7 +58,8 @@ class SyntheaIntegrationService:
55
  logger.warning("⚠️ No writable temp directory found, using current directory")
56
 
57
  self.synthea_dir = base_temp_dir / "cps_synthea"
58
- self.output_dir = base_temp_dir / "cps_fhir_output"
 
59
  self.synthea_jar_path = self.synthea_dir / "synthea-with-dependencies.jar"
60
 
61
  # Try to create directories
@@ -65,7 +69,9 @@ class SyntheaIntegrationService:
65
  logger.info(f"✅ Using directories: synthea={self.synthea_dir}, output={self.output_dir}")
66
  except Exception as e:
67
  logger.warning(f"⚠️ Could not create directories: {e}, will try to use existing paths")
68
- # Don't set use_mock_data to True yet - let's try to work with what we have
 
 
69
 
70
  # Synthea configuration
71
  self.default_config = {
@@ -90,6 +96,20 @@ class SyntheaIntegrationService:
90
  "exporter.fhir.include_practitioners": "false"
91
  }
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  async def download_synthea(self) -> bool:
94
  """
95
  Download Synthea JAR file if not present
@@ -160,10 +180,12 @@ class SyntheaIntegrationService:
160
  file.unlink()
161
 
162
  # Run Synthea with command line arguments (more reliable)
 
 
163
  cmd = [
164
  "java", "-jar", str(self.synthea_jar_path),
165
  "-p", str(population),
166
- "-o", str(self.output_dir.absolute()),
167
  "--seed", str(int(datetime.now().timestamp())),
168
  "--exporter.fhir.transaction_bundle=true",
169
  "--exporter.fhir.include_patient_summary=true",
@@ -181,7 +203,7 @@ class SyntheaIntegrationService:
181
  logger.info(f"Output directory exists before generation: {self.output_dir.exists()}")
182
 
183
  # Try multiple working directories for better compatibility
184
- working_dirs = [str(self.synthea_dir), str(self.output_dir), str(Path.cwd())]
185
 
186
  process = None
187
  for working_dir in working_dirs:
@@ -206,7 +228,13 @@ class SyntheaIntegrationService:
206
 
207
  if process.returncode == 0:
208
  logger.info("✅ Synthea generation completed successfully")
209
- logger.info(f"Output: {stdout.decode()}")
 
 
 
 
 
 
210
 
211
  # Debug: Check what files were actually created
212
  logger.info(f"🔍 Checking output directory immediately after generation: {self.output_dir}")
@@ -222,6 +250,14 @@ class SyntheaIntegrationService:
222
  for subdir in subdirs:
223
  json_files = list(subdir.glob("*.json"))
224
  logger.info(f"📁 JSON files in {subdir.name}: {[f.name for f in json_files]}")
 
 
 
 
 
 
 
 
225
 
226
  # Also check if files were created in the working directory
227
  working_dir_files = list(Path.cwd().glob("*.json"))
@@ -231,12 +267,29 @@ class SyntheaIntegrationService:
231
  synthea_dir_files = list(self.synthea_dir.glob("*.json"))
232
  logger.info(f"📁 JSON files in synthea directory: {[f.name for f in synthea_dir_files]}")
233
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  else:
235
  logger.warning(f"⚠️ Output directory does not exist: {self.output_dir}")
236
 
237
  return True
238
  else:
239
- error_output = stderr.decode()
 
 
 
 
240
  logger.error(f"❌ Synthea generation failed with return code {process.returncode}")
241
  logger.error(f"Error output: {error_output}")
242
  return False
@@ -245,7 +298,7 @@ class SyntheaIntegrationService:
245
  logger.error(f"❌ Error running Synthea: {str(e)}")
246
  return False
247
 
248
- async def process_synthea_output(self) -> List[Dict[str, Any]]:
249
  """
250
  Process Synthea output files and convert to application format
251
  """
@@ -257,9 +310,10 @@ class SyntheaIntegrationService:
257
 
258
  # List of directories to search for Synthea output
259
  search_dirs = [
260
- self.output_dir,
261
- self.output_dir.parent,
262
- Path.cwd(),
 
263
  Path('/tmp'),
264
  Path('/app'),
265
  Path('/app/tmp')
@@ -315,6 +369,9 @@ class SyntheaIntegrationService:
315
  return []
316
 
317
  # Process each patient file
 
 
 
318
  for file_path in patient_files:
319
  try:
320
  logger.info(f"📄 Processing file: {file_path}")
@@ -324,8 +381,14 @@ class SyntheaIntegrationService:
324
 
325
  patient_data = await self._extract_patient_data(bundle, file_path.name)
326
  if patient_data:
327
- patients.append(patient_data)
328
- logger.info(f"✅ Extracted patient: {patient_data.get('full_name', 'Unknown')}")
 
 
 
 
 
 
329
  else:
330
  logger.warning(f"⚠️ No patient data extracted from {file_path}")
331
 
@@ -333,6 +396,8 @@ class SyntheaIntegrationService:
333
  logger.error(f"❌ Error processing {file_path}: {str(e)}")
334
  continue
335
 
 
 
336
  logger.info(f"✅ Successfully processed {len(patients)} patients from Synthea output")
337
  return patients
338
 
@@ -527,6 +592,72 @@ class SyntheaIntegrationService:
527
  except:
528
  return []
529
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
530
  async def save_patients_to_database(self, patients: List[Dict[str, Any]]) -> Dict[str, Any]:
531
  """
532
  Save generated patients directly to the database
@@ -553,6 +684,9 @@ class SyntheaIntegrationService:
553
  'marital_status': patient.get('marital_status', ''),
554
  'language': patient.get('language', 'English'),
555
  'source': patient.get('source', 'synthea'),
 
 
 
556
  'import_date': datetime.utcnow(),
557
  'last_updated': datetime.utcnow(),
558
  'conditions': patient.get('conditions', []),
@@ -599,7 +733,8 @@ class SyntheaIntegrationService:
599
  age_min: int = 18,
600
  age_max: int = 80,
601
  gender: str = "both",
602
- location: str = "Massachusetts"
 
603
  ) -> Dict[str, Any]:
604
  """
605
  Complete workflow: generate Synthea data and prepare for import
@@ -607,7 +742,7 @@ class SyntheaIntegrationService:
607
  try:
608
  logger.info(f"🎯 Starting Synthea generation for {population} patients")
609
 
610
- # Always try real Synthea first, regardless of environment
611
  try:
612
  # Download Synthea if needed
613
  if not await self.download_synthea():
@@ -630,12 +765,19 @@ class SyntheaIntegrationService:
630
  logger.error("❌ Synthea generation failed")
631
  raise Exception("Synthea generation failed")
632
 
633
- # Process output
634
- patients = await self.process_synthea_output()
635
 
636
  if not patients:
637
- logger.error("❌ No patients generated from Synthea")
638
- raise Exception("No patients generated from Synthea")
 
 
 
 
 
 
 
639
 
640
  # Save patients to database
641
  db_result = await self.save_patients_to_database(patients)
@@ -650,15 +792,14 @@ class SyntheaIntegrationService:
650
  "patients": patients,
651
  "config": config_overrides,
652
  "output_directory": str(self.output_dir),
653
- "source": "synthea_real"
 
654
  }
655
 
656
  except Exception as e:
657
  logger.error(f"❌ Synthea integration failed: {str(e)}")
658
- raise HTTPException(
659
- status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
660
- detail=f"Synthea generation failed: {str(e)}"
661
- )
662
 
663
  except Exception as e:
664
  logger.error(f"❌ Error in generate_and_import_patients: {str(e)}")
@@ -667,198 +808,6 @@ class SyntheaIntegrationService:
667
  detail=f"Patient generation failed: {str(e)}"
668
  )
669
 
670
- async def _generate_mock_patients(
671
- self,
672
- population: int = 10,
673
- age_min: int = 18,
674
- age_max: int = 80,
675
- gender: str = "both",
676
- location: str = "Massachusetts"
677
- ) -> Dict[str, Any]:
678
- """
679
- Generate realistic mock patient data when Synthea is not available
680
- """
681
- import random
682
- from datetime import datetime, timedelta
683
-
684
- logger.info(f"🎭 Generating {population} realistic mock patients")
685
-
686
- # More comprehensive name lists
687
- first_names = [
688
- "John", "Jane", "Michael", "Sarah", "David", "Emily", "Robert", "Lisa", "James", "Maria",
689
- "William", "Jennifer", "Christopher", "Jessica", "Daniel", "Amanda", "Matthew", "Nicole", "Anthony", "Stephanie",
690
- "Mark", "Melissa", "Donald", "Michelle", "Steven", "Laura", "Paul", "Kimberly", "Andrew", "Deborah",
691
- "Joshua", "Dorothy", "Kenneth", "Helen", "Kevin", "Sharon", "Brian", "Carol", "George", "Ruth",
692
- "Edward", "Julie", "Ronald", "Joyce", "Timothy", "Virginia", "Jason", "Victoria", "Jeffrey", "Kelly"
693
- ]
694
- last_names = [
695
- "Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez",
696
- "Hernandez", "Lopez", "Gonzalez", "Wilson", "Anderson", "Thomas", "Taylor", "Moore", "Jackson", "Martin",
697
- "Lee", "Perez", "Thompson", "White", "Harris", "Sanchez", "Clark", "Ramirez", "Lewis", "Robinson",
698
- "Walker", "Young", "Allen", "King", "Wright", "Scott", "Torres", "Nguyen", "Hill", "Flores",
699
- "Green", "Adams", "Nelson", "Baker", "Hall", "Rivera", "Campbell", "Mitchell", "Carter", "Roberts"
700
- ]
701
- cities = ["Boston", "Cambridge", "Worcester", "Springfield", "Lowell", "New Bedford", "Brockton", "Quincy", "Lynn", "Fall River"]
702
-
703
- # Medical conditions for more realistic data
704
- conditions = [
705
- "Hypertension", "Diabetes Type 2", "Asthma", "Depression", "Anxiety", "Obesity", "Arthritis", "Heart Disease",
706
- "High Cholesterol", "Migraine", "Insomnia", "GERD", "Allergies", "Back Pain", "Carpal Tunnel Syndrome"
707
- ]
708
-
709
- # Medications for more realistic data
710
- medications = [
711
- "Lisinopril", "Metformin", "Albuterol", "Sertraline", "Atorvastatin", "Omeprazole", "Ibuprofen", "Acetaminophen",
712
- "Loratadine", "Melatonin", "Vitamin D", "Fish Oil", "Calcium", "Iron", "Folic Acid"
713
- ]
714
-
715
- patients = []
716
-
717
- for i in range(population):
718
- # Generate random name
719
- first_name = random.choice(first_names)
720
- last_name = random.choice(last_names)
721
- full_name = f"{first_name} {last_name}"
722
-
723
- # Generate random age
724
- age = random.randint(age_min, age_max)
725
- birth_date = datetime.now() - timedelta(days=age*365 + random.randint(0, 365))
726
-
727
- # Generate random gender
728
- if gender == "both":
729
- patient_gender = random.choice(["male", "female"])
730
- else:
731
- patient_gender = gender
732
-
733
- # Generate random address
734
- street_number = random.randint(100, 9999)
735
- street_name = random.choice(["Main St", "Oak Ave", "Elm St", "Maple Dr", "Cedar Ln", "Pine Rd", "Birch Way", "Willow Ct"])
736
- city = random.choice(cities)
737
- state = "MA"
738
- postal_code = f"{random.randint(10000, 99999)}"
739
-
740
- # Generate realistic medical data
741
- patient_conditions = []
742
- patient_medications = []
743
- patient_encounters = []
744
- patient_observations = []
745
- patient_procedures = []
746
- patient_immunizations = []
747
- patient_allergies = []
748
-
749
- # Add 0-3 random conditions
750
- num_conditions = random.randint(0, 3)
751
- for _ in range(num_conditions):
752
- condition = random.choice(conditions)
753
- if condition not in [c['code'] for c in patient_conditions]:
754
- patient_conditions.append({
755
- 'id': f"condition-{random.randint(1000, 9999)}",
756
- 'code': condition,
757
- 'status': random.choice(['active', 'inactive', 'resolved']),
758
- 'onset_date': (datetime.now() - timedelta(days=random.randint(30, 3650))).strftime('%Y-%m-%d'),
759
- 'recorded_date': datetime.now().strftime('%Y-%m-%d'),
760
- 'verification_status': 'confirmed',
761
- 'category': 'diagnosis'
762
- })
763
-
764
- # Add 0-4 random medications
765
- num_medications = random.randint(0, 4)
766
- for _ in range(num_medications):
767
- medication = random.choice(medications)
768
- if medication not in [m['name'] for m in patient_medications]:
769
- patient_medications.append({
770
- 'id': f"med-{random.randint(1000, 9999)}",
771
- 'name': medication,
772
- 'status': random.choice(['active', 'discontinued', 'completed']),
773
- 'prescribed_date': (datetime.now() - timedelta(days=random.randint(7, 365))).strftime('%Y-%m-%d'),
774
- 'requester': f"Dr. {random.choice(['Smith', 'Johnson', 'Williams', 'Brown', 'Davis'])}",
775
- 'dosage': f"{random.randint(1, 3)} tablet(s) daily",
776
- 'intent': 'order',
777
- 'priority': 'routine'
778
- })
779
-
780
- # Add 1-5 encounters
781
- num_encounters = random.randint(1, 5)
782
- for j in range(num_encounters):
783
- encounter_date = datetime.now() - timedelta(days=random.randint(1, 365))
784
- patient_encounters.append({
785
- 'id': f"encounter-{random.randint(1000, 9999)}",
786
- 'type': random.choice(['Office Visit', 'Emergency Room', 'Hospital Admission', 'Telemedicine', 'Lab Visit']),
787
- 'status': 'finished',
788
- 'period': {
789
- 'start': encounter_date.strftime('%Y-%m-%dT%H:%M:%S'),
790
- 'end': (encounter_date + timedelta(hours=random.randint(1, 4))).strftime('%Y-%m-%dT%H:%M:%S')
791
- },
792
- 'service_provider': f"{random.choice(['General Hospital', 'Medical Center', 'Clinic'])}",
793
- 'class': 'ambulatory',
794
- 'reason': random.choice(['Routine Checkup', 'Follow-up', 'Emergency', 'Lab Work', 'Consultation'])
795
- })
796
-
797
- # Add 2-8 observations
798
- num_observations = random.randint(2, 8)
799
- for _ in range(num_observations):
800
- observation_date = datetime.now() - timedelta(days=random.randint(1, 365))
801
- patient_observations.append({
802
- 'id': f"obs-{random.randint(1000, 9999)}",
803
- 'code': random.choice(['Blood Pressure', 'Heart Rate', 'Temperature', 'Weight', 'Height', 'Blood Glucose', 'Cholesterol']),
804
- 'value': f"{random.randint(70, 200)}",
805
- 'unit': random.choice(['mmHg', 'bpm', '°F', 'lbs', 'inches', 'mg/dL']),
806
- 'status': 'final',
807
- 'effective_date': observation_date.strftime('%Y-%m-%dT%H:%M:%S'),
808
- 'category': 'vital-signs'
809
- })
810
-
811
- patient_data = {
812
- 'fhir_id': f"mock-patient-{i+1}",
813
- 'full_name': full_name,
814
- 'gender': patient_gender,
815
- 'date_of_birth': birth_date.strftime('%Y-%m-%d'),
816
- 'address': f"{street_number} {street_name}",
817
- 'city': city,
818
- 'state': state,
819
- 'postal_code': postal_code,
820
- 'country': 'US',
821
- 'marital_status': random.choice(['single', 'married', 'divorced', 'widowed']),
822
- 'language': 'English',
823
- 'source': 'synthea_mock',
824
- 'import_date': datetime.utcnow().isoformat(),
825
- 'last_updated': datetime.utcnow().isoformat(),
826
- 'conditions': patient_conditions,
827
- 'medications': patient_medications,
828
- 'encounters': patient_encounters,
829
- 'observations': patient_observations,
830
- 'procedures': patient_procedures,
831
- 'immunizations': patient_immunizations,
832
- 'allergies': patient_allergies
833
- }
834
-
835
- patients.append(patient_data)
836
-
837
- # Save mock patients to database
838
- if patients:
839
- db_result = await self.save_patients_to_database(patients)
840
- logger.info(f"💾 Mock patients database save result: {db_result}")
841
- else:
842
- db_result = {"saved_count": 0, "failed_count": 0, "errors": ["No mock patients to save"], "success": False}
843
-
844
- return {
845
- "status": "success",
846
- "generated_patients": len(patients),
847
- "saved_to_database": db_result["saved_count"],
848
- "failed_to_save": db_result["failed_count"],
849
- "database_errors": db_result["errors"],
850
- "patients": patients,
851
- "config": {
852
- "population": population,
853
- "age_min": age_min,
854
- "age_max": age_max,
855
- "gender": gender,
856
- "location": location
857
- },
858
- "output_directory": "mock_data",
859
- "source": "synthea_mock"
860
- }
861
-
862
  async def get_synthea_statistics(self) -> Dict[str, Any]:
863
  """
864
  Get statistics about Synthea capabilities and generated data
@@ -873,8 +822,7 @@ class SyntheaIntegrationService:
873
  "synthea_available": False,
874
  "java_available": False,
875
  "directories_accessible": False,
876
- "environment": "local",
877
- "using_mock_data": False
878
  }
879
 
880
  # Set environment info
@@ -883,8 +831,7 @@ class SyntheaIntegrationService:
883
  else:
884
  stats["environment"] = "local"
885
 
886
- # Always try to use real Synthea
887
- stats["using_mock_data"] = False
888
 
889
  # Check if directories are accessible
890
  try:
@@ -957,6 +904,5 @@ class SyntheaIntegrationService:
957
  "synthea_available": False,
958
  "java_available": False,
959
  "directories_accessible": False,
960
- "environment": "unknown",
961
- "using_mock_data": True
962
  }
 
28
  # Check if we're in a containerized environment (like Hugging Face Spaces)
29
  self.is_containerized = os.path.exists('/.dockerenv') or os.environ.get('HF_SPACE_ID') is not None
30
 
31
+ # Check if Java is available locally
32
+ self.java_available = self._check_java_availability()
33
+
34
+ # Always use real Synthea data - no fallback to mock data
35
  self.use_mock_data = False
36
 
37
  # Try multiple directory locations for better compatibility
 
58
  logger.warning("⚠️ No writable temp directory found, using current directory")
59
 
60
  self.synthea_dir = base_temp_dir / "cps_synthea"
61
+ # Use the actual output directory where Synthea creates files
62
+ self.output_dir = Path.cwd() / "output" / "fhir"
63
  self.synthea_jar_path = self.synthea_dir / "synthea-with-dependencies.jar"
64
 
65
  # Try to create directories
 
69
  logger.info(f"✅ Using directories: synthea={self.synthea_dir}, output={self.output_dir}")
70
  except Exception as e:
71
  logger.warning(f"⚠️ Could not create directories: {e}, will try to use existing paths")
72
+
73
+ # Log the configuration
74
+ logger.info("🚀 Using real Synthea generation (no fallback to mock data)")
75
 
76
  # Synthea configuration
77
  self.default_config = {
 
96
  "exporter.fhir.include_practitioners": "false"
97
  }
98
 
99
def _check_java_availability(self) -> bool:
    """
    Check whether a working ``java`` executable can be launched.

    Runs ``java -version`` and judges success purely by its exit status.
    The command's output is discarded rather than captured, because it is
    never inspected and decoding it could fail on unusual locales.

    Returns:
        True if ``java -version`` ran and exited with status 0.
        False if the executable is missing or cannot be started, if it
        exits with a non-zero status, or if it does not finish within
        10 seconds.
    """
    # Imported outside the ``try`` so the exception names used in the
    # ``except`` clause below are always resolvable.
    import subprocess

    try:
        result = subprocess.run(
            ['java', '-version'],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=10,
        )
    except (subprocess.SubprocessError, OSError):
        # SubprocessError covers TimeoutExpired; OSError covers
        # FileNotFoundError / PermissionError when java cannot be started.
        return False
    return result.returncode == 0
112
+
113
  async def download_synthea(self) -> bool:
114
  """
115
  Download Synthea JAR file if not present
 
180
  file.unlink()
181
 
182
  # Run Synthea with command line arguments (more reliable)
183
+ # Use the parent directory of output/fhir as the output directory
184
+ output_parent = Path.cwd() / "output"
185
  cmd = [
186
  "java", "-jar", str(self.synthea_jar_path),
187
  "-p", str(population),
188
+ "-o", str(output_parent.absolute()),
189
  "--seed", str(int(datetime.now().timestamp())),
190
  "--exporter.fhir.transaction_bundle=true",
191
  "--exporter.fhir.include_patient_summary=true",
 
203
  logger.info(f"Output directory exists before generation: {self.output_dir.exists()}")
204
 
205
  # Try multiple working directories for better compatibility
206
+ working_dirs = [str(Path.cwd()), str(self.synthea_dir), str(self.output_dir)]
207
 
208
  process = None
209
  for working_dir in working_dirs:
 
228
 
229
  if process.returncode == 0:
230
  logger.info("✅ Synthea generation completed successfully")
231
+ # Handle potential encoding issues with subprocess output
232
+ try:
233
+ output_text = stdout.decode('utf-8', errors='ignore')
234
+ logger.info(f"Output: {output_text}")
235
+ except Exception as decode_error:
236
+ logger.warning(f"⚠️ Could not decode stdout: {decode_error}")
237
+ logger.info("✅ Synthea generation completed successfully (output not displayed due to encoding)")
238
 
239
  # Debug: Check what files were actually created
240
  logger.info(f"🔍 Checking output directory immediately after generation: {self.output_dir}")
 
250
  for subdir in subdirs:
251
  json_files = list(subdir.glob("*.json"))
252
  logger.info(f"📁 JSON files in {subdir.name}: {[f.name for f in json_files]}")
253
+
254
+ # Specifically check for fhir subdirectory
255
+ fhir_dir = self.output_dir / "fhir"
256
+ if fhir_dir.exists():
257
+ fhir_files = list(fhir_dir.glob("*.json"))
258
+ logger.info(f"📁 JSON files in fhir subdirectory: {[f.name for f in fhir_files]}")
259
+ else:
260
+ logger.warning(f"⚠️ FHIR subdirectory does not exist: {fhir_dir}")
261
 
262
  # Also check if files were created in the working directory
263
  working_dir_files = list(Path.cwd().glob("*.json"))
 
267
  synthea_dir_files = list(self.synthea_dir.glob("*.json"))
268
  logger.info(f"📁 JSON files in synthea directory: {[f.name for f in synthea_dir_files]}")
269
 
270
+ # Check what files were created in the working directory where Synthea actually ran
271
+ for working_dir in working_dirs:
272
+ if Path(working_dir).exists():
273
+ working_dir_files = list(Path(working_dir).glob("*.json"))
274
+ logger.info(f"📁 JSON files in working directory {working_dir}: {[f.name for f in working_dir_files]}")
275
+
276
+ # Also check subdirectories in the working directory
277
+ for subdir in Path(working_dir).iterdir():
278
+ if subdir.is_dir():
279
+ subdir_files = list(subdir.glob("*.json"))
280
+ if subdir_files:
281
+ logger.info(f"📁 JSON files in subdirectory {subdir}: {[f.name for f in subdir_files]}")
282
+
283
  else:
284
  logger.warning(f"⚠️ Output directory does not exist: {self.output_dir}")
285
 
286
  return True
287
  else:
288
+ # Handle potential encoding issues with stderr
289
+ try:
290
+ error_output = stderr.decode('utf-8', errors='ignore')
291
+ except Exception as decode_error:
292
+ error_output = f"Could not decode error output: {decode_error}"
293
  logger.error(f"❌ Synthea generation failed with return code {process.returncode}")
294
  logger.error(f"Error output: {error_output}")
295
  return False
 
298
  logger.error(f"❌ Error running Synthea: {str(e)}")
299
  return False
300
 
301
+ async def process_synthea_output(self, require_medical_data: bool = False) -> List[Dict[str, Any]]:
302
  """
303
  Process Synthea output files and convert to application format
304
  """
 
310
 
311
  # List of directories to search for Synthea output
312
  search_dirs = [
313
+ self.output_dir, # The actual fhir directory where files are created
314
+ Path.cwd() / "output" / "fhir", # Explicit path to fhir directory
315
+ Path.cwd() / "output", # Parent output directory
316
+ Path.cwd(), # Current working directory
317
  Path('/tmp'),
318
  Path('/app'),
319
  Path('/app/tmp')
 
369
  return []
370
 
371
  # Process each patient file
372
+ valid_patients = 0
373
+ invalid_patients = 0
374
+
375
  for file_path in patient_files:
376
  try:
377
  logger.info(f"📄 Processing file: {file_path}")
 
381
 
382
  patient_data = await self._extract_patient_data(bundle, file_path.name)
383
  if patient_data:
384
+ # Validate patient data completeness
385
+ if self._validate_patient_data_completeness(patient_data, require_medical_data):
386
+ patients.append(patient_data)
387
+ valid_patients += 1
388
+ logger.info(f"✅ Validated and extracted patient: {patient_data.get('full_name', 'Unknown')}")
389
+ else:
390
+ invalid_patients += 1
391
+ logger.warning(f"❌ Patient validation failed: {patient_data.get('full_name', 'Unknown')}")
392
  else:
393
  logger.warning(f"⚠️ No patient data extracted from {file_path}")
394
 
 
396
  logger.error(f"❌ Error processing {file_path}: {str(e)}")
397
  continue
398
 
399
+ logger.info(f"📊 Patient validation summary: {valid_patients} valid, {invalid_patients} invalid")
400
+
401
  logger.info(f"✅ Successfully processed {len(patients)} patients from Synthea output")
402
  return patients
403
 
 
592
  except:
593
  return []
594
 
595
def _validate_patient_data_completeness(self, patient_data: Dict[str, Any], require_medical_data: bool = True) -> bool:
    """
    Validate that a patient record is complete enough to be kept.

    Checks are applied in order and the first failure rejects the record:

    1. Every required demographic field is present and non-empty
       (strings must contain more than whitespace).
    2. ``full_name`` is a string.
    3. ``gender`` is one of ``male``, ``female``, ``other``, ``unknown``.
    4. ``date_of_birth`` is a string in ``YYYY-MM-DD`` format.
    5. When ``require_medical_data`` is true, at least one of
       ``conditions``, ``medications``, ``encounters`` or ``observations``
       is non-empty.

    Args:
        patient_data: The patient record to validate.
        require_medical_data: If True (the default), the patient must have
            at least some medical data to pass.

    Returns:
        True if the record passes every check, False otherwise. This
        method does not raise; unexpected errors are logged and reported
        as a failed validation.
    """
    try:
        display_name = patient_data.get('full_name', 'Unknown')

        # Required basic fields - all must be present and not empty
        required_fields = (
            'full_name', 'gender', 'date_of_birth', 'address',
            'city', 'state', 'postal_code', 'country',
        )
        for field in required_fields:
            value = patient_data.get(field)
            if not value or (isinstance(value, str) and not value.strip()):
                logger.warning(f"⚠️ Missing or empty required field '{field}' for patient {display_name}")
                return False

        # The loop above guarantees a non-empty value; a name that is not
        # text at all is still invalid and is rejected explicitly here.
        if not isinstance(patient_data['full_name'], str):
            logger.warning("⚠️ Empty or invalid name for patient")
            return False

        gender = patient_data.get('gender')
        if gender not in ('male', 'female', 'other', 'unknown'):
            logger.warning(f"⚠️ Invalid gender '{gender}' for patient {display_name}")
            return False

        # TypeError covers a non-string value (e.g. a date object), which
        # strptime cannot parse; it is treated the same as a bad format.
        date_of_birth = patient_data['date_of_birth']
        try:
            datetime.strptime(date_of_birth, '%Y-%m-%d')
        except (ValueError, TypeError):
            logger.warning(f"⚠️ Invalid date of birth format '{date_of_birth}' for patient {display_name}")
            return False

        if require_medical_data:
            # Truthiness instead of len(): a field that is present but None
            # counts as "no data" rather than aborting the whole check.
            medical_fields = ('conditions', 'medications', 'encounters', 'observations')
            if not any(patient_data.get(key) for key in medical_fields):
                logger.warning(f"❌ Patient {display_name} rejected: no medical data (conditions, medications, encounters, or observations)")
                return False
            logger.info(f"✅ Patient {display_name} has medical data")

        logger.info(f"✅ Patient {display_name} passed complete validation")
        return True

    except Exception as e:
        # Boundary handler: validation must never raise into the import
        # loop, so anything unexpected is logged and treated as invalid.
        logger.error(f"❌ Error validating patient data: {str(e)}")
        return False
660
+
661
  async def save_patients_to_database(self, patients: List[Dict[str, Any]]) -> Dict[str, Any]:
662
  """
663
  Save generated patients directly to the database
 
684
  'marital_status': patient.get('marital_status', ''),
685
  'language': patient.get('language', 'English'),
686
  'source': patient.get('source', 'synthea'),
687
+ 'status': 'active',
688
+ 'created_at': datetime.utcnow(),
689
+ 'updated_at': datetime.utcnow(),
690
  'import_date': datetime.utcnow(),
691
  'last_updated': datetime.utcnow(),
692
  'conditions': patient.get('conditions', []),
 
733
  age_min: int = 18,
734
  age_max: int = 80,
735
  gender: str = "both",
736
+ location: str = "Massachusetts",
737
+ require_medical_data: bool = True
738
  ) -> Dict[str, Any]:
739
  """
740
  Complete workflow: generate Synthea data and prepare for import
 
742
  try:
743
  logger.info(f"🎯 Starting Synthea generation for {population} patients")
744
 
745
+ # Always use real Synthea - no fallback to mock data
746
  try:
747
  # Download Synthea if needed
748
  if not await self.download_synthea():
 
765
  logger.error("❌ Synthea generation failed")
766
  raise Exception("Synthea generation failed")
767
 
768
+ # Process output - only get patients with complete data
769
+ patients = await self.process_synthea_output(require_medical_data=True)
770
 
771
  if not patients:
772
+ logger.error("❌ No patients with complete data generated from Synthea")
773
+ raise Exception("No patients with complete data generated from Synthea")
774
+
775
+ # Limit to exactly 10 patients with complete data
776
+ if len(patients) > 10:
777
+ logger.info(f"📊 Limiting from {len(patients)} to 10 patients with complete data")
778
+ patients = patients[:10]
779
+
780
+ logger.info(f"📊 Final patient count for database storage: {len(patients)}")
781
 
782
  # Save patients to database
783
  db_result = await self.save_patients_to_database(patients)
 
792
  "patients": patients,
793
  "config": config_overrides,
794
  "output_directory": str(self.output_dir),
795
+ "source": "synthea_real",
796
+ "message": f"Successfully stored {db_result['saved_count']} patients with complete data to database"
797
  }
798
 
799
  except Exception as e:
800
  logger.error(f"❌ Synthea integration failed: {str(e)}")
801
+ # No fallback to mock data - raise the error
802
+ raise Exception(f"Synthea generation failed: {str(e)}")
 
 
803
 
804
  except Exception as e:
805
  logger.error(f"❌ Error in generate_and_import_patients: {str(e)}")
 
808
  detail=f"Patient generation failed: {str(e)}"
809
  )
810
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
811
  async def get_synthea_statistics(self) -> Dict[str, Any]:
812
  """
813
  Get statistics about Synthea capabilities and generated data
 
822
  "synthea_available": False,
823
  "java_available": False,
824
  "directories_accessible": False,
825
+ "environment": "local"
 
826
  }
827
 
828
  # Set environment info
 
831
  else:
832
  stats["environment"] = "local"
833
 
834
+ # Always use real Synthea - no mock data fallback
 
835
 
836
  # Check if directories are accessible
837
  try:
 
904
  "synthea_available": False,
905
  "java_available": False,
906
  "directories_accessible": False,
907
+ "environment": "unknown"
 
908
  }
data/new_tool.json DELETED
@@ -1 +0,0 @@
1
- []