Update narrative_safetynet.py
narrative_safetynet.py  CHANGED  +278 -1
@@ -128,6 +128,283 @@ def _pluralize(word: str, n: int) -> str:
 
 # -------------------- geo join (Top-5 only) --------------------
 
-def _canon(s: str) ->
+def _canon(s: str) -> str:
+    return re.sub(r"[^a-z0-9]+", "", (s or "").lower())
 
+def _map_top_facilities_to_odhf(
+    top_facilities: pd.DataFrame,
+    odhf: pd.DataFrame,
+    fac_col: str = "Facility",
+    odhf_name_col: str = "facility_name"
+) -> pd.DataFrame:
+    if odhf is None or odhf.empty or top_facilities is None or top_facilities.empty:
+        return pd.DataFrame()
+    out_rows: List[Dict[str, Any]] = []
+    try:
+        idx = {_canon(n): i for i, n in odhf[odhf_name_col].dropna().items()}
+    except Exception:
+        return pd.DataFrame()
+    for fac in top_facilities[fac_col].dropna().astype(str).unique():
+        key = _canon(fac)
+        row = None
+        if key in idx:
+            row = odhf.loc[idx[key]]
+        else:
+            # contains fallback (case-insensitive)
+            cand = odhf[odhf[odhf_name_col].astype(str).str.contains(fac, case=False, na=False)]
+            if not cand.empty:
+                row = cand.iloc[0]
+        if row is not None:
+            out_rows.append({
+                "Facility": fac,
+                "city": row.get("city"),
+                "latitude": row.get("latitude"),
+                "longitude": row.get("longitude")
+            })
+    return pd.DataFrame(out_rows)
+
+# -------------------- main: narrative builder --------------------
+
+def build_narrative(
+    scenario_text: str,
+    datasets: Dict[str, Any],
+    structured_tables: Optional[Dict[str, pd.DataFrame]] = None,
+    metric_hints: Optional[List[str]] = None,
+    group_hints: Optional[List[str]] = None,
+    min_sample: int = _DEF_MIN_SAMPLE,
+    baseline_band: float = 0.05  # ±5% "about average"
+) -> str:
+    """
+    Scenario-agnostic narrative fallback:
+    1) Choose best (df, metric) dynamically using name hints + numeric sanity
+    2) Prefer structured tables (top facilities/specialties/zones) if provided
+    3) Compute overall baseline + label groups vs baseline
+    4) Geo notes via fuzzy Top-5 → ODHF join (<= 3 bullets)
+    5) Recommendations grounded in the same metric/groups
+    """
+
+    metric_hints = (metric_hints or _HINT_METRICS_DEFAULT)
+    group_hints = (group_hints or _HINT_GROUPS_DEFAULT)
+
+    # ---------- 1) Pick dataset + metric ----------
+    choice = _choose_df_and_metric(datasets, metric_hints)
+    if not choice:
+        return "No tabular data available. Unable to generate a narrative."
+    df_key, df, primary_metric = choice
+
+    # Ensure numeric
+    df = _nanlike_to_nan(df)
+    if primary_metric not in df.columns:
+        return "Chosen metric missing. Unable to generate a narrative."
+    df[primary_metric] = _to_numeric(df[primary_metric])
+
+    # Optional comparator metric (e.g., consult vs surgery)
+    comparator_metric = None
+    for c in df.columns:
+        if c == primary_metric:
+            continue
+        if _is_numeric_series(_to_numeric(df[c])):
+            name = c.lower()
+            if any(h in name for h in ["consult", "median", "wait", "p90", "90th"]):
+                comparator_metric = c
+                break
+
+    # ---------- 2) Prefer structured tables if present ----------
+    top_fac = None
+    top_spec = None
+    zone_tbl = None
+    odhf_df = None
+
+    if structured_tables:
+        top_fac = structured_tables.get("top_facilities")
+        top_spec = structured_tables.get("top_specialties")
+        zone_tbl = structured_tables.get("zone_summary")
+        # try to detect ODHF-like table by column fingerprint
+        for k, v in datasets.items():
+            if isinstance(v, pd.DataFrame) and {"facility_name", "city"}.issubset(set(map(str.lower, v.columns.str.lower()))):
+                odhf_df = v
+                break
+
+    # Compute baseline from the selected df/metric (not from ODHF)
+    baseline = df[primary_metric].mean(skipna=True)
+
+    # ---------- 3) Build sections ----------
+
+    sections: List[str] = []
+
+    # Methodology
+    meth: List[str] = []
+    meth.append(f"Primary metric: **{primary_metric}**; overall average: **{_fmt_num(baseline)}**.")
+    if comparator_metric:
+        meth.append(f"Comparator metric detected: **{comparator_metric}** (means shown when available).")
+    # Missing value note
+    if df.isna().sum().sum() > 0:
+        meth.append("Missing values (blank/dash) were treated as nulls and excluded from means.")
+    # Group hints (informative only)
+    g1 = _find_group_col(df, group_hints, avoid=[primary_metric])
+    if g1:
+        meth.append(f"Primary grouping inferred: **{g1}**.")
+    g2 = _find_group_col(df.drop(columns=[g1], errors="ignore") if g1 else df, group_hints, avoid=[primary_metric, g1 or ""])
+    if g2:
+        meth.append(f"Secondary grouping inferred: **{g2}**.")
+
+    sections.append("## Methodology (Auto-generated)")
+    for m in meth:
+        sections.append(f"- {m}")
+    sections.append("")
+
+    # Highest averages by primary grouping (prefer structured Top-5 if given)
+    top_lines: List[str] = []
+    if isinstance(top_fac, pd.DataFrame) and not top_fac.empty:
+        # Expect columns like: Facility, Zone, avg_Surgery_Median, count_*
+        # Keep dynamic: find a metric column in top_fac aligned to primary_metric by hint matching
+        metric_col = None
+        for c in top_fac.columns:
+            if primary_metric.lower() in c.lower() or any(h in c.lower() for h in ["avg_", "mean"]):
+                if _is_numeric_series(_to_numeric(top_fac[c])):
+                    metric_col = c
+                    break
+        if metric_col is None:
+            # fallback: first numeric col
+            for c in top_fac.columns:
+                if _is_numeric_series(_to_numeric(top_fac[c])):
+                    metric_col = c; break
+
+        cnt_col = next((c for c in top_fac.columns if "count" in c.lower() or c.lower() in {"n", "records"}), None)
+        lab_col = next((c for c in top_fac.columns if "facility" in c.lower()), None)
+
+        if metric_col and lab_col:
+            # already sorted in your executor; if not, sort desc
+            tf = top_fac.copy()
+            tf[metric_col] = _to_numeric(tf[metric_col])
+            tf = tf.sort_values(by=metric_col, ascending=False)
+            for i, row in enumerate(tf.head(5).itertuples(index=False), 1):
+                label = getattr(row, lab_col)
+                met = getattr(row, metric_col)
+                cnt = getattr(row, cnt_col) if cnt_col and hasattr(row, cnt_col) else np.nan
+                dev = _label_vs_baseline(met, baseline, baseline_band)
+                caution = _small_sample_note(int(cnt)) if (isinstance(cnt, (int, float)) and not pd.isna(cnt)) else None
+                msg = f"{i}. **{label}** → {primary_metric}: {_fmt_num(met)}"
+                if cnt_col and hasattr(row, cnt_col):
+                    msg += f"; {_pluralize('record', int(cnt))}: {int(cnt)}"
+                msg += f" → {dev}"
+                if caution:
+                    msg += f" ({caution})"
+                top_lines.append(msg)
+
+    else:
+        # No structured Top-5 provided: derive from g1
+        if g1:
+            tmp = df.copy()
+            tmp[primary_metric] = _to_numeric(tmp[primary_metric])
+            if comparator_metric in tmp.columns:
+                tmp[comparator_metric] = _to_numeric(tmp[comparator_metric])
+            agg = (
+                tmp.groupby(g1, dropna=False)
+                .agg(metric=(primary_metric, "mean"), count=(primary_metric, "count"))
+                .reset_index()
+            ).sort_values(by="metric", ascending=False)
+            for i, row in enumerate(agg.head(5).itertuples(index=False), 1):
+                label = getattr(row, g1)
+                met = getattr(row, "metric")
+                cnt = getattr(row, "count")
+                dev = _label_vs_baseline(met, baseline, baseline_band)
+                caution = _small_sample_note(int(cnt), min_sample)
+                msg = f"{i}. **{label}** → {primary_metric}: {_fmt_num(met)}; {_pluralize('record', int(cnt))}: {cnt} → {dev}"
+                if caution:
+                    msg += f" ({caution})"
+                top_lines.append(msg)
+
+    if top_lines:
+        sections.append("## Highest average values by group")
+        sections.extend(top_lines)
+        sections.append("")
+
+    # Zone comparison (prefer structured zone table if present)
+    zone_lines: List[str] = []
+    if isinstance(zone_tbl, pd.DataFrame) and not zone_tbl.empty:
+        z = zone_tbl.copy()
+        # find zone label & metric columns dynamically
+        zone_col = next((c for c in z.columns if "zone" in c.lower()), None)
+        zmet_col = next((c for c in z.columns if primary_metric.lower() in c.lower() or "avg" in c.lower()), None)
+        zcnt_col = next((c for c in z.columns if "count" in c.lower() or c.lower() in {"n", "records"}), None)
+
+        if zone_col and zmet_col:
+            # Clean truly missing zones but keep literal "Total" if present
+            z[zone_col] = z[zone_col].astype("string")
+            keep = (z[zone_col].notna()) | (z[zone_col].str.upper() == "TOTAL")
+            z = z[keep]
+            z[zmet_col] = _to_numeric(z[zmet_col])
+            z = z.sort_values(by=zmet_col, ascending=False)
+
+            for row in z.itertuples(index=False):
+                zone = getattr(row, zone_col)
+                met = getattr(row, zmet_col)
+                cnt = getattr(row, zcnt_col) if zcnt_col and hasattr(row, zcnt_col) else np.nan
+                lab = _label_vs_baseline(met, baseline, baseline_band)
+                msg = f"- **{zone}**: {_fmt_num(met)} (vs overall {_fmt_num(baseline)} → {lab})"
+                if zcnt_col and hasattr(row, zcnt_col) and not pd.isna(cnt):
+                    msg += f"; n={int(cnt)}"
+                zone_lines.append(msg)
+
+    else:
+        # Derive zones dynamically if a zone-like column exists
+        zcol = _find_group_col(df, ["zone"])
+        if zcol:
+            z = df.copy()
+            z[zcol] = z[zcol].astype("string").str.strip()
+            # drop true NaN zones, but do NOT fabricate totals
+            z = z[z[zcol].notna()]
+            agg = (
+                z.groupby(zcol, dropna=False)[primary_metric]
+                .agg(["mean", "count"]).reset_index()
+                .rename(columns={"mean": "metric", "count": "count"})
+                .sort_values(by="metric", ascending=False)
+            )
+            for row in agg.itertuples(index=False):
+                zone = getattr(row, zcol)
+                met = getattr(row, "metric")
+                cnt = getattr(row, "count")
+                lab = _label_vs_baseline(met, baseline, baseline_band)
+                msg = f"- **{zone}**: {_fmt_num(met)} (vs overall {_fmt_num(baseline)} → {lab}); n={cnt}"
+                zone_lines.append(msg)
+
+    if zone_lines:
+        sections.append(f"## {('Zone' if 'zone' in ''.join(df.columns).lower() else 'Category')} comparison vs overall")
+        sections.extend(zone_lines)
+        sections.append("")
+
+    # Geographic notes → map Top-5 facilities only (if we have both Top-5 and ODHF df)
+    geo_lines: List[str] = []
+    if isinstance(top_fac, pd.DataFrame) and not top_fac.empty and isinstance(odhf_df, pd.DataFrame) and not odhf_df.empty:
+        fac_col = next((c for c in top_fac.columns if "facility" in c.lower()), None)
+        if fac_col:
+            mapped = _map_top_facilities_to_odhf(top_fac.head(5), odhf_df, fac_col=fac_col, odhf_name_col=next(
+                (c for c in odhf_df.columns if c.lower() == "facility_name"), "facility_name"
+            ))
+            if not mapped.empty:
+                for r in mapped.head(3).to_dict(orient="records"):
+                    f = r.get("Facility")
+                    city = r.get("city")
+                    geo_lines.append(f"- **{f}** ({city}) is among the highest-average groups; consider capacity and referral patterns.")
+    if geo_lines:
+        sections.append("## Geographic notes")
+        sections.extend(geo_lines)
+        sections.append("")
+
+    # Recommendations → grounded in the above
+    recs: List[str] = []
+    if top_lines:
+        recs.append("Prioritize operating room time and staffing for the highest-average groups, especially those with substantial volume.")
+    if comparator_metric:
+        recs.append(f"Track **{comparator_metric}** alongside {primary_metric} to identify upstream bottlenecks (e.g., long consult waits driving surgical delays).")
+    if zone_lines:
+        recs.append("Address zones persistently above the provincial baseline; deploy targeted resources and load balancing across facilities.")
+    recs.append("Apply small-sample caution; pool or validate categories with very few records before acting on outliers.")
+    recs.append("Standardize specialty/facility naming to reduce coding-induced variance in aggregates.")
+
+    sections.append("## Recommendations (Auto-generated)")
+    for r in recs:
+        sections.append(f"- {r}")
 
+    return "\n".join(sections).strip()
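
A minimal usage sketch of the new entry point. The DataFrames, column names, and the expectation about which metric the chooser picks are illustrative assumptions, not part of the commit; the module-level helpers (_choose_df_and_metric, _DEF_MIN_SAMPLE, the default hint lists) are taken as defined elsewhere in the file.

import pandas as pd

from narrative_safetynet import build_narrative  # assumes the file is importable as a module

# Toy wait-time table; the "Wait"/"Consult" column names are chosen so the
# default hint matching is likely to select them (assumption, not guaranteed).
waits = pd.DataFrame({
    "Facility": ["Hospital A", "Hospital B", "Hospital C"],
    "Zone": ["North", "South", "North"],
    "Surgery Wait Median": [45.0, 60.0, 30.0],
    "Consult Wait Median": [20.0, 35.0, 15.0],
})

# ODHF-like lookup table; detected by its facility_name/city column fingerprint.
odhf = pd.DataFrame({
    "facility_name": ["Hospital A", "Hospital B"],
    "city": ["Halifax", "Sydney"],
    "latitude": [44.65, 46.14],
    "longitude": [-63.58, -60.19],
})

# Optional pre-aggregated Top-5 table, as an upstream executor might produce it
# (hypothetical column names that match the "avg_"/"count" heuristics above).
top_facilities = pd.DataFrame({
    "Facility": ["Hospital B", "Hospital A"],
    "avg_Surgery_Wait_Median": [60.0, 45.0],
    "count_records": [120, 85],
})

report = build_narrative(
    scenario_text="Where are surgical waits longest?",
    datasets={"waits": waits, "odhf": odhf},
    structured_tables={"top_facilities": top_facilities},
)
print(report)  # markdown narrative: methodology, top groups, zone comparison, geo notes, recommendations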