VED-AGI-1 committed on
Commit
1b29d16
·
verified ·
1 Parent(s): 5651d3e

Update narrative_safetynet.py

Browse files
Files changed (1) hide show
  1. narrative_safetynet.py +278 -1
narrative_safetynet.py CHANGED
@@ -128,6 +128,283 @@ def _pluralize(word: str, n: int) -> str:
128
 
129
  # -------------------- geo join (Top-5 only) --------------------
130
 
131
- def _canon(s: str) -> s_
 
132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
 
 
128
 
129
  # -------------------- geo join (Top-5 only) --------------------
130
 
131
def _canon(s: str) -> str:
    """Return a canonical matching key for a name.

    The key is lower-case and contains only ``a-z`` and ``0-9``; whitespace,
    punctuation and every other character are removed.  Diacritics are folded
    to their base letter first, so an accented spelling and its unaccented
    spelling (for example "Hôpital" and "Hopital") yield the same key instead
    of the accented letter simply being deleted.

    Args:
        s: The name to canonicalise.  ``None``, NaN and other falsy values
            are treated as an empty name; non-string values are stringified.

    Returns:
        The canonical key, or ``""`` when nothing alphanumeric remains.
    """
    # Local import: this module's top-level import block is not visible here.
    import unicodedata

    # NaN is truthy, so it needs an explicit check (NaN != NaN).
    if not s or (isinstance(s, float) and s != s):
        return ""
    decomposed = unicodedata.normalize("NFKD", str(s))
    # Drop the combining marks produced by NFKD so "ô" becomes a plain "o".
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    return re.sub(r"[^a-z0-9]+", "", folded.lower())
133
 
134
def _map_top_facilities_to_odhf(
    top_facilities: pd.DataFrame,
    odhf: pd.DataFrame,
    fac_col: str = "Facility",
    odhf_name_col: str = "facility_name",
) -> pd.DataFrame:
    """Attach city/coordinates from an ODHF-like table to top facilities.

    Each distinct, non-null facility name in ``top_facilities[fac_col]`` is
    matched against ``odhf[odhf_name_col]``:

    1. by exact canonical key (``_canon``); when several ODHF rows share a
       key, the last one is used;
    2. otherwise by a case-insensitive *literal* substring search, taking the
       first ODHF row whose name contains the facility name.

    Facilities with no match are omitted from the result.

    Args:
        top_facilities: Table holding the facility names to look up.
        odhf: Reference table; ``city``, ``latitude`` and ``longitude`` are
            read from the matched row when those columns exist.
        fac_col: Name of the facility column in ``top_facilities``.
        odhf_name_col: Name of the facility-name column in ``odhf``.

    Returns:
        A DataFrame with columns ``Facility``, ``city``, ``latitude`` and
        ``longitude`` (one row per matched facility), or an empty DataFrame
        when either input is missing/empty, a required column is absent, or
        nothing matched.
    """
    if odhf is None or odhf.empty or top_facilities is None or top_facilities.empty:
        return pd.DataFrame()
    if fac_col not in top_facilities.columns or odhf_name_col not in odhf.columns:
        return pd.DataFrame()

    names = odhf[odhf_name_col]
    has_name = names.notna()
    names_as_text = names.astype(str)

    # Index by row *position* rather than index label: labels may be
    # duplicated, in which case ``.loc`` would return a frame, not a row.
    position_by_key: Dict[str, int] = {}
    for pos, name in enumerate(names):
        if pd.isna(name):
            continue
        key = _canon(str(name))
        if key:  # names with no alphanumeric content cannot be matched by key
            position_by_key[key] = pos

    out_rows: List[Dict[str, Any]] = []
    for fac in top_facilities[fac_col].dropna().astype(str).unique():
        key = _canon(fac)
        row = None
        if key and key in position_by_key:
            row = odhf.iloc[position_by_key[key]]
        elif fac.strip():
            # Literal (non-regex) containment: facility names routinely hold
            # regex metacharacters such as "(", ")" and ".".
            contains = names_as_text.str.contains(fac, case=False, na=False, regex=False)
            cand = odhf[has_name & contains]
            if not cand.empty:
                row = cand.iloc[0]
        if row is not None:
            out_rows.append({
                "Facility": fac,
                "city": row.get("city"),
                "latitude": row.get("latitude"),
                "longitude": row.get("longitude"),
            })
    return pd.DataFrame(out_rows)
165
+
166
+ # -------------------- main: narrative builder --------------------
167
+
168
def _coerce_count(value: Any) -> Optional[int]:
    """Return *value* as an int record count, or None if missing/non-numeric."""
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        # None / pd.NA -> TypeError, NaN / text -> ValueError, inf -> OverflowError
        return None


def build_narrative(
    scenario_text: str,
    datasets: Dict[str, Any],
    structured_tables: Optional[Dict[str, pd.DataFrame]] = None,
    metric_hints: Optional[List[str]] = None,
    group_hints: Optional[List[str]] = None,
    min_sample: int = _DEF_MIN_SAMPLE,
    baseline_band: float = 0.05,  # ±5% "about average"
) -> str:
    """
    Scenario-agnostic narrative fallback:
      1) Choose best (df, metric) dynamically using name hints + numeric sanity
      2) Prefer structured tables (top facilities/zones) if provided
      3) Compute overall baseline + label groups vs baseline
      4) Geo notes via fuzzy Top-5 ↔ ODHF join (<= 3 bullets)
      5) Recommendations grounded in the same metric/groups

    Args:
        scenario_text: Scenario description; currently not used here.
        datasets: Named inputs.  Passed to ``_choose_df_and_metric`` and also
            scanned for an ODHF-like DataFrame (one having both
            ``facility_name`` and ``city`` columns, case-insensitively).
        structured_tables: Optional pre-computed tables; the keys read are
            ``"top_facilities"`` and ``"zone_summary"``.
        metric_hints: Metric-name hints; defaults to ``_HINT_METRICS_DEFAULT``.
        group_hints: Group-column hints; defaults to ``_HINT_GROUPS_DEFAULT``.
        min_sample: Threshold forwarded to ``_small_sample_note`` for every
            ranked group (structured and derived alike).
        baseline_band: Tolerance forwarded to ``_label_vs_baseline``.

    Returns:
        A Markdown string with Methodology, optional ranking / comparison /
        geographic sections, and Recommendations; or a one-line message when
        no usable table or metric is found.
    """
    metric_hints = metric_hints or _HINT_METRICS_DEFAULT
    group_hints = group_hints or _HINT_GROUPS_DEFAULT

    # ---------- 1) Pick dataset + metric ----------
    choice = _choose_df_and_metric(datasets, metric_hints)
    if not choice:
        return "No tabular data available. Unable to generate a narrative."
    _df_key, df, primary_metric = choice

    # Ensure numeric
    df = _nanlike_to_nan(df)
    if primary_metric not in df.columns:
        return "Chosen metric missing. Unable to generate a narrative."
    df[primary_metric] = _to_numeric(df[primary_metric])
    metric_key = str(primary_metric).lower()

    # Optional comparator metric (e.g., consult vs surgery): the first other
    # numeric column whose name contains one of the keywords below.
    comparator_keywords = ("consult", "median", "wait", "p90", "90th")
    comparator_metric = None
    for c in df.columns:
        if c == primary_metric:
            continue
        if _is_numeric_series(_to_numeric(df[c])):
            name = str(c).lower()
            if any(h in name for h in comparator_keywords):
                comparator_metric = c
                break

    # ---------- 2) Prefer structured tables if present ----------
    top_fac = None
    zone_tbl = None
    odhf_df = None

    if structured_tables:
        top_fac = structured_tables.get("top_facilities")
        zone_tbl = structured_tables.get("zone_summary")
        # try to detect ODHF-like table by column fingerprint
        # (only needed alongside top_fac, which comes from structured_tables)
        for v in datasets.values():
            if isinstance(v, pd.DataFrame) and {"facility_name", "city"}.issubset(
                {str(c).lower() for c in v.columns}
            ):
                odhf_df = v
                break

    # Compute baseline from the selected df/metric (not from ODHF)
    baseline = df[primary_metric].mean(skipna=True)

    # ---------- 3) Build sections ----------
    sections: List[str] = []

    # Methodology
    meth: List[str] = []
    meth.append(f"Primary metric: **{primary_metric}**; overall average: **{_fmt_num(baseline)}**.")
    if comparator_metric:
        meth.append(f"Comparator metric detected: **{comparator_metric}** (means shown when available).")
    # Missing value note
    if df.isna().sum().sum() > 0:
        meth.append("Missing values (blank/dash) were treated as nulls and excluded from means.")
    # Group hints (informative only)
    g1 = _find_group_col(df, group_hints, avoid=[primary_metric])
    if g1:
        meth.append(f"Primary grouping inferred: **{g1}**.")
    g2 = _find_group_col(
        df.drop(columns=[g1], errors="ignore") if g1 else df,
        group_hints,
        avoid=[primary_metric, g1 or ""],
    )
    if g2:
        meth.append(f"Secondary grouping inferred: **{g2}**.")

    sections.append("## Methodology (Auto-generated)")
    sections.extend(f"- {m}" for m in meth)
    sections.append("")

    # Highest averages by primary grouping (prefer structured Top-5 if given)
    top_lines: List[str] = []
    ranked_top: Optional[pd.DataFrame] = None  # sorted Top-5, reused for geo notes
    if isinstance(top_fac, pd.DataFrame) and not top_fac.empty:
        # Expect columns like: Facility, Zone, avg_Surgery_Median, count_*
        # Keep dynamic: find a metric column in top_fac aligned to primary_metric by hint matching
        metric_col = None
        for c in top_fac.columns:
            name = str(c).lower()
            if (metric_key in name or "avg_" in name or "mean" in name) and _is_numeric_series(
                _to_numeric(top_fac[c])
            ):
                metric_col = c
                break
        if metric_col is None:
            # fallback: first numeric col
            metric_col = next(
                (c for c in top_fac.columns if _is_numeric_series(_to_numeric(top_fac[c]))),
                None,
            )

        cnt_col = next(
            (c for c in top_fac.columns
             if "count" in str(c).lower() or str(c).lower() in {"n", "records"}),
            None,
        )
        lab_col = next((c for c in top_fac.columns if "facility" in str(c).lower()), None)

        if metric_col is not None and lab_col is not None:
            # already sorted in your executor; if not, sort desc
            tf = top_fac.copy()
            tf[metric_col] = _to_numeric(tf[metric_col])
            ranked_top = tf.sort_values(by=metric_col, ascending=False).head(5)
            # Iterate as dict records: itertuples() renames columns that are
            # not valid identifiers, which would break lookup by column name.
            for i, rec in enumerate(ranked_top.to_dict(orient="records"), 1):
                label = rec[lab_col]
                met = rec[metric_col]
                # None when there is no count column or the value is missing.
                cnt = _coerce_count(rec[cnt_col]) if cnt_col is not None else None
                dev = _label_vs_baseline(met, baseline, baseline_band)
                caution = _small_sample_note(cnt, min_sample) if cnt is not None else None
                msg = f"{i}. **{label}** — {primary_metric}: {_fmt_num(met)}"
                if cnt is not None:
                    msg += f"; {_pluralize('record', cnt)}: {cnt}"
                msg += f" → {dev}"
                if caution:
                    msg += f" ({caution})"
                top_lines.append(msg)

    elif g1:
        # No structured Top-5 provided: derive from g1
        tmp = df.copy()
        tmp[primary_metric] = _to_numeric(tmp[primary_metric])
        agg = (
            tmp.groupby(g1, dropna=False)
            .agg(metric=(primary_metric, "mean"), count=(primary_metric, "count"))
            .reset_index()
        ).sort_values(by="metric", ascending=False)
        for i, rec in enumerate(agg.head(5).to_dict(orient="records"), 1):
            label = rec[g1]
            met = rec["metric"]
            cnt = int(rec["count"])
            dev = _label_vs_baseline(met, baseline, baseline_band)
            caution = _small_sample_note(cnt, min_sample)
            msg = (
                f"{i}. **{label}** — {primary_metric}: {_fmt_num(met)}; "
                f"{_pluralize('record', cnt)}: {cnt} → {dev}"
            )
            if caution:
                msg += f" ({caution})"
            top_lines.append(msg)

    if top_lines:
        sections.append("## Highest average values by group")
        sections.extend(top_lines)
        sections.append("")

    # Zone comparison (prefer structured zone table if present)
    zone_lines: List[str] = []
    if isinstance(zone_tbl, pd.DataFrame) and not zone_tbl.empty:
        z = zone_tbl.copy()
        # find zone label & metric columns dynamically
        zone_col = next((c for c in z.columns if "zone" in str(c).lower()), None)
        zmet_col = next(
            (c for c in z.columns if metric_key in str(c).lower() or "avg" in str(c).lower()),
            None,
        )
        zcnt_col = next(
            (c for c in z.columns
             if "count" in str(c).lower() or str(c).lower() in {"n", "records"}),
            None,
        )

        if zone_col is not None and zmet_col is not None:
            # Drop truly missing zones; a literal "Total" row is non-null and
            # is therefore kept.
            z[zone_col] = z[zone_col].astype("string")
            z = z.loc[z[zone_col].notna()].copy()
            z[zmet_col] = _to_numeric(z[zmet_col])
            z = z.sort_values(by=zmet_col, ascending=False)

            for rec in z.to_dict(orient="records"):
                zone = rec[zone_col]
                met = rec[zmet_col]
                cnt = _coerce_count(rec[zcnt_col]) if zcnt_col is not None else None
                lab = _label_vs_baseline(met, baseline, baseline_band)
                msg = f"- **{zone}**: {_fmt_num(met)} (vs overall {_fmt_num(baseline)} → {lab})"
                if cnt is not None:
                    msg += f"; n={cnt}"
                zone_lines.append(msg)

    else:
        # Derive zones dynamically if a zone-like column exists
        zcol = _find_group_col(df, ["zone"])
        if zcol:
            z = df.copy()
            z[zcol] = z[zcol].astype("string").str.strip()
            # drop true NaN zones, but do NOT fabricate totals
            z = z[z[zcol].notna()]
            agg = (
                z.groupby(zcol, dropna=False)[primary_metric]
                .agg(["mean", "count"])
                .reset_index()
                .rename(columns={"mean": "metric"})
                .sort_values(by="metric", ascending=False)
            )
            for rec in agg.to_dict(orient="records"):
                zone = rec[zcol]
                met = rec["metric"]
                cnt = rec["count"]
                lab = _label_vs_baseline(met, baseline, baseline_band)
                msg = f"- **{zone}**: {_fmt_num(met)} (vs overall {_fmt_num(baseline)} → {lab}); n={cnt}"
                zone_lines.append(msg)

    if zone_lines:
        # Test each column name on its own so "zone" cannot match across the
        # boundary of two concatenated names, and non-string names are safe.
        has_zone_col = any("zone" in str(c).lower() for c in df.columns)
        sections.append(f"## {'Zone' if has_zone_col else 'Category'} comparison vs overall")
        sections.extend(zone_lines)
        sections.append("")

    # Geographic notes — map Top-5 facilities only (if we have both Top-5 and ODHF df)
    geo_lines: List[str] = []
    if (
        isinstance(top_fac, pd.DataFrame) and not top_fac.empty
        and isinstance(odhf_df, pd.DataFrame) and not odhf_df.empty
    ):
        fac_col = next((c for c in top_fac.columns if "facility" in str(c).lower()), None)
        if fac_col is not None:
            # Use the same sorted Top-5 as the ranking section when available,
            # so the geo notes describe the facilities actually listed above.
            top5 = ranked_top if ranked_top is not None else top_fac.head(5)
            name_col = next(
                (c for c in odhf_df.columns if str(c).lower() == "facility_name"),
                "facility_name",
            )
            mapped = _map_top_facilities_to_odhf(
                top5, odhf_df, fac_col=fac_col, odhf_name_col=name_col
            )
            if not mapped.empty:
                for r in mapped.head(3).to_dict(orient="records"):
                    f = r.get("Facility")
                    city = r.get("city")
                    geo_lines.append(
                        f"- **{f}** ({city}) is among the highest-average groups; "
                        "consider capacity and referral patterns."
                    )
    if geo_lines:
        sections.append("## Geographic notes")
        sections.extend(geo_lines)
        sections.append("")

    # Recommendations — grounded in the above
    recs: List[str] = []
    if top_lines:
        recs.append("Prioritize operating room time and staffing for the highest-average groups, especially those with substantial volume.")
    if comparator_metric:
        recs.append(f"Track **{comparator_metric}** alongside {primary_metric} to identify upstream bottlenecks (e.g., long consult waits driving surgical delays).")
    if zone_lines:
        recs.append("Address zones persistently above the provincial baseline; deploy targeted resources and load balancing across facilities.")
    recs.append("Apply small-sample caution; pool or validate categories with very few records before acting on outliers.")
    recs.append("Standardize specialty/facility naming to reduce coding-induced variance in aggregates.")

    sections.append("## Recommendations (Auto-generated)")
    sections.extend(f"- {r}" for r in recs)

    return "\n".join(sections).strip()