Update narrative_safetynet.py
narrative_safetynet.py  CHANGED  +278 -1
@@ -128,6 +128,283 @@ def _pluralize(word: str, n: int) -> str:
 
 # -------------------- geo join (Top-5 only) --------------------
 
-def _canon(s: str) ->
+def _canon(s: str) -> str:
+    return re.sub(r"[^a-z0-9]+", "", (s or "").lower())
 
+def _map_top_facilities_to_odhf(
+    top_facilities: pd.DataFrame,
+    odhf: pd.DataFrame,
+    fac_col: str = "Facility",
+    odhf_name_col: str = "facility_name"
+) -> pd.DataFrame:
+    if odhf is None or odhf.empty or top_facilities is None or top_facilities.empty:
+        return pd.DataFrame()
+    out_rows: List[Dict[str, Any]] = []
+    try:
+        idx = {_canon(n): i for i, n in odhf[odhf_name_col].dropna().items()}
+    except Exception:
+        return pd.DataFrame()
+    for fac in top_facilities[fac_col].dropna().astype(str).unique():
+        key = _canon(fac)
+        row = None
+        if key in idx:
+            row = odhf.loc[idx[key]]
+        else:
+            # contains fallback (case-insensitive)
+            cand = odhf[odhf[odhf_name_col].astype(str).str.contains(fac, case=False, na=False)]
+            if not cand.empty:
+                row = cand.iloc[0]
+        if row is not None:
+            out_rows.append({
+                "Facility": fac,
+                "city": row.get("city"),
+                "latitude": row.get("latitude"),
+                "longitude": row.get("longitude")
+            })
+    return pd.DataFrame(out_rows)
+
+# -------------------- main: narrative builder --------------------
+
+def build_narrative(
+    scenario_text: str,
+    datasets: Dict[str, Any],
+    structured_tables: Optional[Dict[str, pd.DataFrame]] = None,
+    metric_hints: Optional[List[str]] = None,
+    group_hints: Optional[List[str]] = None,
+    min_sample: int = _DEF_MIN_SAMPLE,
+    baseline_band: float = 0.05  # ±5% "about average"
+) -> str:
+    """
+    Scenario-agnostic narrative fallback:
+    1) Choose best (df, metric) dynamically using name hints + numeric sanity
+    2) Prefer structured tables (top facilities/specialties/zones) if provided
+    3) Compute overall baseline + label groups vs baseline
+    4) Geo notes via fuzzy Top-5 → ODHF join (<= 3 bullets)
+    5) Recommendations grounded in the same metric/groups
+    """
+
+    metric_hints = (metric_hints or _HINT_METRICS_DEFAULT)
+    group_hints = (group_hints or _HINT_GROUPS_DEFAULT)
+
+    # ---------- 1) Pick dataset + metric ----------
+    choice = _choose_df_and_metric(datasets, metric_hints)
+    if not choice:
+        return "No tabular data available. Unable to generate a narrative."
+    df_key, df, primary_metric = choice
+
+    # Ensure numeric
+    df = _nanlike_to_nan(df)
+    if primary_metric not in df.columns:
+        return "Chosen metric missing. Unable to generate a narrative."
+    df[primary_metric] = _to_numeric(df[primary_metric])
+
+    # Optional comparator metric (e.g., consult vs surgery)
+    comparator_metric = None
+    for c in df.columns:
+        if c == primary_metric:
+            continue
+        if _is_numeric_series(_to_numeric(df[c])):
+            name = c.lower()
+            if any(h in name for h in ["consult", "median", "wait", "p90", "90th"]):
+                comparator_metric = c
+                break
+
+    # ---------- 2) Prefer structured tables if present ----------
+    top_fac = None
+    top_spec = None
+    zone_tbl = None
+    odhf_df = None
+
+    if structured_tables:
+        top_fac = structured_tables.get("top_facilities")
+        top_spec = structured_tables.get("top_specialties")
+        zone_tbl = structured_tables.get("zone_summary")
+        # try to detect ODHF-like table by column fingerprint
+        for k, v in datasets.items():
+            if isinstance(v, pd.DataFrame) and {"facility_name", "city"}.issubset(set(map(str.lower, v.columns.str.lower()))):
+                odhf_df = v
+                break
+
+    # Compute baseline from the selected df/metric (not from ODHF)
+    baseline = df[primary_metric].mean(skipna=True)
+
+    # ---------- 3) Build sections ----------
+
+    sections: List[str] = []
+
+    # Methodology
+    meth: List[str] = []
+    meth.append(f"Primary metric: **{primary_metric}**; overall average: **{_fmt_num(baseline)}**.")
+    if comparator_metric:
+        meth.append(f"Comparator metric detected: **{comparator_metric}** (means shown when available).")
+    # Missing value note
+    if df.isna().sum().sum() > 0:
+        meth.append("Missing values (blank/dash) were treated as nulls and excluded from means.")
+    # Group hints (informative only)
+    g1 = _find_group_col(df, group_hints, avoid=[primary_metric])
+    if g1:
+        meth.append(f"Primary grouping inferred: **{g1}**.")
+    g2 = _find_group_col(df.drop(columns=[g1], errors="ignore") if g1 else df, group_hints, avoid=[primary_metric, g1 or ""])
+    if g2:
+        meth.append(f"Secondary grouping inferred: **{g2}**.")
+
+    sections.append("## Methodology (Auto-generated)")
+    for m in meth:
+        sections.append(f"- {m}")
+    sections.append("")
+
+    # Highest averages by primary grouping (prefer structured Top-5 if given)
+    top_lines: List[str] = []
+    if isinstance(top_fac, pd.DataFrame) and not top_fac.empty:
+        # Expect columns like: Facility, Zone, avg_Surgery_Median, count_*
+        # Keep dynamic: find a metric column in top_fac aligned to primary_metric by hint matching
+        metric_col = None
+        for c in top_fac.columns:
+            if primary_metric.lower() in c.lower() or any(h in c.lower() for h in ["avg_", "mean"]):
+                if _is_numeric_series(_to_numeric(top_fac[c])):
+                    metric_col = c
+                    break
+        if metric_col is None:
+            # fallback: first numeric col
+            for c in top_fac.columns:
+                if _is_numeric_series(_to_numeric(top_fac[c])):
+                    metric_col = c; break
+
+        cnt_col = next((c for c in top_fac.columns if "count" in c.lower() or c.lower() in {"n", "records"}), None)
+        lab_col = next((c for c in top_fac.columns if "facility" in c.lower()), None)
+
+        if metric_col and lab_col:
+            # already sorted in your executor; if not, sort desc
+            tf = top_fac.copy()
+            tf[metric_col] = _to_numeric(tf[metric_col])
+            tf = tf.sort_values(by=metric_col, ascending=False)
+            for i, row in enumerate(tf.head(5).itertuples(index=False), 1):
+                label = getattr(row, lab_col)
+                met = getattr(row, metric_col)
+                cnt = getattr(row, cnt_col) if cnt_col and hasattr(row, cnt_col) else np.nan
+                dev = _label_vs_baseline(met, baseline, baseline_band)
+                caution = _small_sample_note(int(cnt)) if (isinstance(cnt, (int, float)) and not pd.isna(cnt)) else None
+                msg = f"{i}. **{label}** → {primary_metric}: {_fmt_num(met)}"
+                if cnt_col and hasattr(row, cnt_col):
+                    msg += f"; {_pluralize('record', int(cnt))}: {int(cnt)}"
+                msg += f" → {dev}"
+                if caution:
+                    msg += f" ({caution})"
+                top_lines.append(msg)
+
+    else:
+        # No structured Top-5 provided: derive from g1
+        if g1:
+            tmp = df.copy()
+            tmp[primary_metric] = _to_numeric(tmp[primary_metric])
+            if comparator_metric in tmp.columns:
+                tmp[comparator_metric] = _to_numeric(tmp[comparator_metric])
+            agg = (
+                tmp.groupby(g1, dropna=False)
+                .agg(metric=(primary_metric, "mean"), count=(primary_metric, "count"))
+                .reset_index()
+            ).sort_values(by="metric", ascending=False)
+            for i, row in enumerate(agg.head(5).itertuples(index=False), 1):
+                label = getattr(row, g1)
+                met = getattr(row, "metric")
+                cnt = getattr(row, "count")
+                dev = _label_vs_baseline(met, baseline, baseline_band)
+                caution = _small_sample_note(int(cnt), min_sample)
+                msg = f"{i}. **{label}** → {primary_metric}: {_fmt_num(met)}; {_pluralize('record', int(cnt))}: {cnt} → {dev}"
+                if caution:
+                    msg += f" ({caution})"
+                top_lines.append(msg)
+
+    if top_lines:
+        sections.append("## Highest average values by group")
+        sections.extend(top_lines)
+        sections.append("")
+
+    # Zone comparison (prefer structured zone table if present)
+    zone_lines: List[str] = []
+    if isinstance(zone_tbl, pd.DataFrame) and not zone_tbl.empty:
+        z = zone_tbl.copy()
+        # find zone label & metric columns dynamically
+        zone_col = next((c for c in z.columns if "zone" in c.lower()), None)
+        zmet_col = next((c for c in z.columns if primary_metric.lower() in c.lower() or "avg" in c.lower()), None)
+        zcnt_col = next((c for c in z.columns if "count" in c.lower() or c.lower() in {"n", "records"}), None)
+
+        if zone_col and zmet_col:
+            # Clean truly missing zones but keep literal "Total" if present
+            z[zone_col] = z[zone_col].astype("string")
+            keep = (z[zone_col].notna()) | (z[zone_col].str.upper() == "TOTAL")
+            z = z[keep]
+            z[zmet_col] = _to_numeric(z[zmet_col])
+            z = z.sort_values(by=zmet_col, ascending=False)
+
+            for row in z.itertuples(index=False):
+                zone = getattr(row, zone_col)
+                met = getattr(row, zmet_col)
+                cnt = getattr(row, zcnt_col) if zcnt_col and hasattr(row, zcnt_col) else np.nan
+                lab = _label_vs_baseline(met, baseline, baseline_band)
+                msg = f"- **{zone}**: {_fmt_num(met)} (vs overall {_fmt_num(baseline)} → {lab})"
+                if zcnt_col and hasattr(row, zcnt_col) and not pd.isna(cnt):
+                    msg += f"; n={int(cnt)}"
+                zone_lines.append(msg)
+
+    else:
+        # Derive zones dynamically if a zone-like column exists
+        zcol = _find_group_col(df, ["zone"])
+        if zcol:
+            z = df.copy()
+            z[zcol] = z[zcol].astype("string").str.strip()
+            # drop true NaN zones, but do NOT fabricate totals
+            z = z[z[zcol].notna()]
+            agg = (
+                z.groupby(zcol, dropna=False)[primary_metric]
+                .agg(["mean", "count"]).reset_index()
+                .rename(columns={"mean": "metric", "count": "count"})
+                .sort_values(by="metric", ascending=False)
+            )
+            for row in agg.itertuples(index=False):
+                zone = getattr(row, zcol)
+                met = getattr(row, "metric")
+                cnt = getattr(row, "count")
+                lab = _label_vs_baseline(met, baseline, baseline_band)
+                msg = f"- **{zone}**: {_fmt_num(met)} (vs overall {_fmt_num(baseline)} → {lab}); n={cnt}"
+                zone_lines.append(msg)
+
+    if zone_lines:
+        sections.append(f"## {('Zone' if 'zone' in ''.join(df.columns).lower() else 'Category')} comparison vs overall")
+        sections.extend(zone_lines)
+        sections.append("")
+
+    # Geographic notes → map Top-5 facilities only (if we have both Top-5 and ODHF df)
+    geo_lines: List[str] = []
+    if isinstance(top_fac, pd.DataFrame) and not top_fac.empty and isinstance(odhf_df, pd.DataFrame) and not odhf_df.empty:
+        fac_col = next((c for c in top_fac.columns if "facility" in c.lower()), None)
+        if fac_col:
+            mapped = _map_top_facilities_to_odhf(top_fac.head(5), odhf_df, fac_col=fac_col, odhf_name_col=next(
+                (c for c in odhf_df.columns if c.lower() == "facility_name"), "facility_name"
+            ))
+            if not mapped.empty:
+                for r in mapped.head(3).to_dict(orient="records"):
+                    f = r.get("Facility")
+                    city = r.get("city")
+                    geo_lines.append(f"- **{f}** ({city}) is among the highest-average groups; consider capacity and referral patterns.")
+    if geo_lines:
+        sections.append("## Geographic notes")
+        sections.extend(geo_lines)
+        sections.append("")
+
+    # Recommendations → grounded in the above
+    recs: List[str] = []
+    if top_lines:
+        recs.append("Prioritize operating room time and staffing for the highest-average groups, especially those with substantial volume.")
+    if comparator_metric:
+        recs.append(f"Track **{comparator_metric}** alongside {primary_metric} to identify upstream bottlenecks (e.g., long consult waits driving surgical delays).")
+    if zone_lines:
+        recs.append("Address zones persistently above the provincial baseline; deploy targeted resources and load balancing across facilities.")
+    recs.append("Apply small-sample caution; pool or validate categories with very few records before acting on outliers.")
+    recs.append("Standardize specialty/facility naming to reduce coding-induced variance in aggregates.")
+
+    sections.append("## Recommendations (Auto-generated)")
+    for r in recs:
+        sections.append(f"- {r}")
 
+    return "\n".join(sections).strip()
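
A minimal usage sketch of the new entry point. The DataFrames, column names, and the expectation about which metric the chooser picks are illustrative assumptions, not part of the commit; the module-level helpers (_choose_df_and_metric, _DEF_MIN_SAMPLE, the default hint lists) are taken as defined elsewhere in the file.

import pandas as pd

from narrative_safetynet import build_narrative  # assumes the file is importable as a module

# Toy wait-time table; the "Wait"/"Consult" column names are chosen so the
# default hint matching is likely to select them (assumption, not guaranteed).
waits = pd.DataFrame({
    "Facility": ["Hospital A", "Hospital B", "Hospital C"],
    "Zone": ["North", "South", "North"],
    "Surgery Wait Median": [45.0, 60.0, 30.0],
    "Consult Wait Median": [20.0, 35.0, 15.0],
})

# ODHF-like lookup table; detected by its facility_name/city column fingerprint.
odhf = pd.DataFrame({
    "facility_name": ["Hospital A", "Hospital B"],
    "city": ["Halifax", "Sydney"],
    "latitude": [44.65, 46.14],
    "longitude": [-63.58, -60.19],
})

# Optional pre-aggregated Top-5 table, as an upstream executor might produce it
# (hypothetical column names that match the "avg_"/"count" heuristics above).
top_facilities = pd.DataFrame({
    "Facility": ["Hospital B", "Hospital A"],
    "avg_Surgery_Wait_Median": [60.0, 45.0],
    "count_records": [120, 85],
})

report = build_narrative(
    scenario_text="Where are surgical waits longest?",
    datasets={"waits": waits, "odhf": odhf},
    structured_tables={"top_facilities": top_facilities},
)
print(report)  # markdown narrative: methodology, top groups, zone comparison, geo notes, recommendations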