Spaces:

causalscience
/

Impact_Analysis_Tools

Sleeping

App Files Files Community

causalscience committed on Aug 20

Commit

9bf8127

verified ·

1 Parent(s): 3551cf7

Aug 2025 Bug Fixes

Browse files

Files changed (1) hide show

models/propensity.py +736 -127

models/propensity.py CHANGED Viewed

@@ -1,127 +1,736 @@
-# causalscience/models/propensity.py
-import numpy as np
-import pandas as pd
-from sklearn.linear_model import LogisticRegression
-import seaborn as sns
-import matplotlib.pyplot as plt
-from io import BytesIO
-from PIL import Image
-from scipy.special import logit  # MODIFIED: import for logit transform
-# no imports from models.propensity here!  # MODIFIED
-from utils.helpers import detect_column_type
-from utils.plotting import calculate_standardized_differences, love_plot
-def fit_propensity_score(
-    df,
-    treatment_col,
-    feature_cols=None,
-    C=1e6,
-    max_iter=1000
-):
-    """
-    Fit a logistic regression model to estimate propensity scores.
-    # MODIFIED: C (regularization strength) and max_iter are now configurable parameters.
-    """
-    if feature_cols is None:
-        feature_cols = [col for col in df.columns if col != treatment_col]
-    X = df[feature_cols].copy()
-    for col in X.columns:
-        if not pd.api.types.is_numeric_dtype(X[col]):
-            try:
-                X[col] = pd.to_numeric(X[col], errors='coerce')
-            except:
-                X = X.drop(columns=[col])
-                if feature_cols:
-                    feature_cols.remove(col)
-    y = df[treatment_col].astype(int)
-    model = LogisticRegression(C=C, max_iter=max_iter)
-    model.fit(X, y)
-    df_scores = df.copy()
-    df_scores['propensity_score'] = model.predict_proba(X)[:, 1]
-    return model, df_scores
-def match_with_caliper(
-    df,
-    treatment_col,
-    caliper_width=0.2,
-    with_replacement=True,
-    use_logit_caliper=False  # MODIFIED: option to compute caliper on logit scale
-):
-    """
-    Perform 1:1 nearest neighbor matching on propensity scores within a caliper.
-    """
-    # select scale for caliper
-    if use_logit_caliper:
-        scores = logit(df['propensity_score'])  # MODIFIED
-    else:
-        scores = df['propensity_score']
-    ps_std = scores.std()
-    caliper = caliper_width * ps_std
-    treated = df[df[treatment_col] == 1].copy()
-    control = df[df[treatment_col] == 0].copy()
-    matches = []
-    pair_id = 0
-    for idx, row in treated.iterrows():
-        if use_logit_caliper:
-            tgt = logit(row['propensity_score'])  # MODIFIED
-            diffs = (logit(control['propensity_score']) - tgt).abs()  # MODIFIED
-        else:
-            diffs = (control['propensity_score'] - row['propensity_score']).abs()
-        eligible = control[diffs <= caliper]
-        if eligible.empty:
-            continue
-        best_idx = diffs[diffs <= caliper].idxmin()  # MODIFIED
-        match = control.loc[best_idx].copy()
-        match['pair_id'] = pair_id
-        treated.at[idx, 'pair_id'] = pair_id
-        matches.append(match)
-        pair_id += 1
-        if not with_replacement:
-            control = control.drop(best_idx)
-    matched_controls = pd.DataFrame(matches)
-    matched_df = pd.concat([
-        treated.dropna(subset=['pair_id']),
-        matched_controls
-    ], ignore_index=True)
-    return matched_df
-def assess_balance(
-    df,
-    matched_df,
-    treatment_col,
-    covariate_cols,
-    threshold=0.1
-):
-    """
-    Compute standardized differences and create a Love plot.
-    """
-    covariates = covariate_cols  # MODIFIED: use explicit covariates list
-    std_unadj = calculate_standardized_differences(df, covariates, treatment_col)
-    std_matched = calculate_standardized_differences(matched_df, covariates, treatment_col)
-    love_img = love_plot(
-        [std_unadj, std_matched],
-        labels=['Unadjusted', 'Matched'],
-        threshold=threshold,
-        abs_val=True
-    )
-    return love_img, std_unadj, std_matched

+# Runtime-safe installs
+# Probe the third-party packages this module relies on. If any probe import
+# fails, attempt a quiet `pip install` of all of them with the interpreter
+# that is currently running.
+# NOTE(review): this is an import-time side effect that may reach the network.
+try:
+    import numpy  # noqa
+    import pandas  # noqa
+    import sklearn  # noqa
+    import matplotlib  # noqa
+    import PIL  # noqa
+except Exception:
+    import sys, subprocess
+    # check=False: a failed install does not raise here; the regular imports
+    # that follow will raise instead if a package is still missing.
+    subprocess.run(
+        [sys.executable, "-m", "pip", "install", "-q",
+         "numpy", "pandas", "scikit-learn", "matplotlib", "pillow"],
+        check=False
+    )
+# models/propensity.py
+from dataclasses import dataclass
+from typing import List, Tuple, Optional, Union, Dict
+import io
+import numpy as np
+import pandas as pd
+from PIL import Image
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+from sklearn.linear_model import LogisticRegression
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+# -----------------------------
+# Helpers
+# -----------------------------
+def _ensure_binary(series: pd.Series) -> pd.Series:
+    """Coerce a treatment column to integer 0/1 values.
+
+    Accepted inputs:
+      * boolean columns (True -> 1, False -> 0);
+      * object columns whose entries are common yes/no spellings
+        (case-insensitive, surrounding whitespace ignored) or numeric
+        strings that parse to 0/1;
+      * other columns whose values are all 0 or 1.
+
+    Args:
+        series: The raw treatment column.
+
+    Returns:
+        A new integer Series of 0/1 values; the input is not modified.
+
+    Raises:
+        ValueError: If any value is missing or cannot be mapped to 0/1.
+    """
+    if series.dtype == bool:
+        return series.astype(int)
+    if series.dtype == object:
+        mapping = {"t":1,"true":1,"yes":1,"y":1,"1":1,"f":0,"false":0,"no":0,"n":0,"0":0}
+        mapped = series.astype(str).str.strip().str.lower().map(mapping)
+        if not mapped.isna().any():
+            return mapped.astype(int)
+        # Some labels were not recognised (e.g. "1.0"); fall back to numeric parsing.
+        try:
+            numeric = pd.to_numeric(series, errors="raise")
+        except (ValueError, TypeError):
+            numeric = None
+        if numeric is not None and set(pd.unique(numeric)) <= {0, 1}:
+            return numeric.astype(int)
+        raise ValueError("Treatment column cannot be coerced to binary 0/1.")
+    # Missing values used to slip through the 0/1 check (they were dropped
+    # first) and then crash inside astype(int) with an opaque cast error.
+    if series.isna().any():
+        raise ValueError(
+            "Treatment column contains missing values; drop or impute them before analysis."
+        )
+    # 0 == 0.0 and 1 == 1.0 hash alike, so one subset test covers ints and floats.
+    if set(pd.unique(series)) <= {0, 1}:
+        return series.astype(int)
+    raise ValueError("Treatment column must contain values that map to 0/1.")
+def _select_features(df: pd.DataFrame, feature_cols: List[str], outcome_col: Optional[str]) -> List[str]:
+    """Return the requested feature columns that exist in `df`, plus the outcome.
+
+    Requested names that are absent from `df` are silently skipped. When
+    `outcome_col` is given, exists in `df`, and is not already selected, it is
+    appended as the last entry.
+    NOTE(review): the result therefore may contain the outcome; callers that
+    fit a treatment model should check whether that is intended.
+    """
+    selected: List[str] = []
+    for name in feature_cols:
+        if name in df.columns:
+            selected.append(name)
+    if not outcome_col or outcome_col not in df.columns or outcome_col in selected:
+        return selected
+    return selected + [outcome_col]
+def _split_num_cat(df: pd.DataFrame, cols: List[str]) -> Tuple[List[str], List[str]]:
+    """Partition `cols` into (numeric, non-numeric) lists, preserving input order.
+
+    A column counts as numeric when pandas reports a numeric dtype for it.
+    """
+    numeric_flags = [pd.api.types.is_numeric_dtype(df[name]) for name in cols]
+    numeric = [name for name, flag in zip(cols, numeric_flags) if flag]
+    categorical = [name for name, flag in zip(cols, numeric_flags) if not flag]
+    return numeric, categorical
+# -----------------------------
+# Propensity model
+# -----------------------------
+def _fit_propensity(df: pd.DataFrame, treatment_col: str, features: List[str]) -> Tuple[np.ndarray, Pipeline]:
+    """Fit a logistic-regression propensity model and score every row of `df`.
+
+    Numeric features are standardised and non-numeric features are one-hot
+    encoded (first level dropped) before the classifier.
+
+    Returns:
+        (ps, pipeline): the predicted probability of treatment == 1 for each
+        row, and the fitted preprocessing + classifier pipeline.
+    """
+    target = _ensure_binary(df[treatment_col])
+    numeric_cols, categorical_cols = _split_num_cat(df, features)
+    steps = []
+    if numeric_cols:
+        steps.append(("num", StandardScaler(), numeric_cols))
+    if categorical_cols:
+        encoder_kwargs = {"handle_unknown": "ignore", "drop": "first"}
+        try:
+            # Newer scikit-learn spells the dense-output switch `sparse_output`.
+            encoder = OneHotEncoder(sparse_output=False, **encoder_kwargs)  # sklearn>=1.2
+        except TypeError:
+            encoder = OneHotEncoder(sparse=False, **encoder_kwargs)
+        steps.append(("cat", encoder, categorical_cols))
+    pipeline = Pipeline([
+        ("pre", ColumnTransformer(steps, remainder="drop", verbose_feature_names_out=False)),
+        ("clf", LogisticRegression(max_iter=2000, solver="liblinear")),
+    ])
+    design = df[features]
+    pipeline.fit(design, target.values)
+    scores = pipeline.predict_proba(design)[:, 1]
+    return scores, pipeline
+# -----------------------------
+# Matching
+# -----------------------------
+@dataclass
+class MatchSummary:
+    """Counts and settings describing the result of one matching or stratification run."""
+    # Label of the producing procedure: "nearest", "caliper" or "stratification".
+    method: str
+    # Treated / control row counts as reported by the producing function.
+    treated_rows: int
+    control_rows: int
+    # Number of distinct control units used.
+    unique_controls: int
+    # Settings echoed back from the call that produced this summary.
+    min_controls: int
+    max_controls: int
+    replacement: bool
+    caliper: Optional[float]
+    n_strata: Optional[int] = None
+    # Treated units skipped for having too few candidates while a caliper was active.
+    caliper_dropped_treated: int = 0  # diagnostic
+def _greedy_nearest(
+    df: pd.DataFrame,
+    ps_col: str,
+    treatment_col: str,
+    min_controls: int,
+    max_controls: int,
+    replacement: bool,
+    caliper: Optional[float] = None,
+) -> Tuple[pd.DataFrame, MatchSummary]:
+    """Greedily match each treated row to its nearest controls on `ps_col`.
+
+    Treated rows are processed in frame order. Each takes up to `max_controls`
+    controls with the smallest absolute score difference, optionally limited to
+    those within `caliper`, and (when `replacement` is False) excluding controls
+    already consumed. A treated row with fewer than `min_controls` candidates
+    is skipped.
+
+    Returns:
+        (matched_stack, summary). `matched_stack` holds one treated row and one
+        control row per pair, tagged with `__role__` and `__match_group__`; it
+        is an empty DataFrame when no pairs were formed.
+    """
+    work = df.copy()
+    # Positional row id, used to refer to rows regardless of the caller's index.
+    work["__rowid__"] = np.arange(len(work))
+    treated = work[work[treatment_col] == 1]
+    control = work[work[treatment_col] == 0].copy()
+    if control.empty or treated.empty:
+        return pd.DataFrame(), MatchSummary("nearest", 0, 0, 0, min_controls, max_controls, replacement, caliper)
+    used = set()
+    pairs: List[Tuple[int, int]] = []
+    dropped_due_to_caliper = 0
+    for _, t in treated.iterrows():
+        diffs = control.copy()
+        diffs["__dist__"] = (diffs[ps_col] - t[ps_col]).abs()
+        if caliper is not None and caliper >= 0:
+            diffs = diffs[diffs["__dist__"] <= caliper]
+        if not replacement:
+            diffs = diffs[~diffs["__rowid__"].isin(used)]
+        diffs = diffs.sort_values("__dist__", ascending=True).head(max_controls)
+        if len(diffs) < min_controls:
+            # Counted whenever a caliper is active, whatever caused the shortfall.
+            if caliper is not None and caliper >= 0:
+                dropped_due_to_caliper += 1
+            continue
+        for _, c in diffs.iterrows():
+            pairs.append((int(t["__rowid__"]), int(c["__rowid__"])))
+            if not replacement:
+                used.add(int(c["__rowid__"]))
+    if not pairs:
+        return pd.DataFrame(), MatchSummary("nearest", 0, 0, 0, min_controls, max_controls, replacement, caliper, caliper_dropped_treated=dropped_due_to_caliper)
+    # Parallel lists: idx_t[i] is the treated row id paired with control idx_c[i].
+    idx_t = [p[0] for p in pairs]
+    idx_c = [p[1] for p in pairs]
+    wsi = work.set_index("__rowid__")
+    # Build a UNIQUE mapping of treated rowid -> group id (in first-seen order)
+    # This avoids Pandas "Reindexing only valid with uniquely valued Index objects" during map().
+    treated_seq = idx_t  # sequence with duplicates, one per pair
+    seen = set()
+    unique_treated_in_order = []
+    for t_id in treated_seq:
+        if t_id not in seen:
+            unique_treated_in_order.append(t_id)
+            seen.add(t_id)
+    group_map = {t_id: gid for gid, t_id in enumerate(unique_treated_in_order)}
+    # The treated row is repeated once per pair, so it appears k times for k controls.
+    mt = wsi.loc[idx_t].copy()
+    mt["__role__"] = "treated"
+    mt["__match_group__"] = [group_map[t_id] for t_id in mt.index]
+    mc = wsi.loc[idx_c].copy()
+    mc["__role__"] = "control"
+    # align groups to each pair order (idx_t and idx_c are parallel)
+    mc["__match_group__"] = [group_map[t_id] for t_id in idx_t]
+    matched_stack = pd.concat([mt, mc], ignore_index=True)
+    # treated_rows / control_rows count rows of the stack (one per pair).
+    summary = MatchSummary(
+        method="nearest",
+        treated_rows=mt.shape[0],
+        control_rows=mc.shape[0],
+        unique_controls=len(set(idx_c)),
+        min_controls=min_controls, max_controls=max_controls,
+        replacement=replacement, caliper=caliper,
+        n_strata=None,
+        caliper_dropped_treated=dropped_due_to_caliper,
+    )
+    return matched_stack, summary
+def _caliper_matching(
+    df: pd.DataFrame,
+    ps_col: str,
+    treatment_col: str,
+    min_controls: int,
+    max_controls: int,
+    replacement: bool,
+    caliper: float,
+) -> Tuple[pd.DataFrame, MatchSummary]:
+    """Run greedy nearest-neighbour matching restricted to a caliper.
+
+    Delegates the matching to `_greedy_nearest` and re-labels the summary with
+    method "caliper".
+
+    Raises:
+        ValueError: If `caliper` is None or negative.
+    """
+    if caliper is None or caliper < 0:
+        raise ValueError("`caliper` must be a non-negative float for caliper matching.")
+    matched, nearest_summary = _greedy_nearest(
+        df=df,
+        ps_col=ps_col,
+        treatment_col=treatment_col,
+        min_controls=min_controls,
+        max_controls=max_controls,
+        replacement=replacement,
+        caliper=caliper,
+    )
+    # Same counts as the nearest-neighbour run; only the method label differs.
+    relabelled = MatchSummary(
+        "caliper",
+        nearest_summary.treated_rows,
+        nearest_summary.control_rows,
+        nearest_summary.unique_controls,
+        min_controls,
+        max_controls,
+        replacement,
+        caliper,
+        None,
+        nearest_summary.caliper_dropped_treated,
+    )
+    return matched, relabelled
+def _stratification(
+    df: pd.DataFrame,
+    ps_col: str,
+    treatment_col: str,
+    n_strata: int,
+) -> Tuple[pd.DataFrame, MatchSummary]:
+    """Bin rows into propensity-score quantile strata and attach per-row weights.
+
+    Adds the helper columns `__stratum__`, `__role__`, `__match_group__`,
+    `__weight__` and `__balanced_stratum__` to a copy of `df`. In a stratum
+    that contains both groups, controls are weighted n_treated / n_control and
+    treated rows keep weight 1.0; in every other stratum all rows keep 1.0.
+
+    Returns:
+        (annotated copy of `df`, summary). No rows are dropped.
+    """
+    work = df.copy()
+    work["__stratum__"] = pd.qcut(work[ps_col], q=n_strata, labels=False, duplicates="drop")
+    work["__role__"] = work[treatment_col].apply(lambda flag: "treated" if int(flag) == 1 else "control")
+    work["__match_group__"] = work["__stratum__"]
+    work["__weight__"] = 1.0
+    is_treated = work[treatment_col] == 1
+    is_control = work[treatment_col] == 0
+    for label in sorted(work["__stratum__"].dropna().unique()):
+        members = work["__stratum__"] == label
+        controls_here = members & is_control
+        n_t = int((members & is_treated).sum())
+        n_c = int(controls_here.sum())
+        if n_t == 0 or n_c == 0:
+            continue  # one-sided stratum: every row keeps the neutral weight 1.0
+        work.loc[controls_here, "__weight__"] = n_t / n_c
+    # Diagnostics: a stratum is "balanced" when it holds both treated and control rows.
+    per_stratum = work.groupby("__stratum__")[treatment_col].agg(["sum", "count"])
+    has_both = (per_stratum["sum"] > 0) & (per_stratum["sum"] < per_stratum["count"])
+    work["__balanced_stratum__"] = work["__stratum__"].isin(per_stratum[has_both].index)
+    n_treated_total = int(is_treated.sum())
+    n_control_total = int(is_control.sum())
+    summary = MatchSummary(
+        method="stratification",
+        treated_rows=n_treated_total,
+        control_rows=n_control_total,
+        unique_controls=n_control_total,  # all controls retained
+        min_controls=0,
+        max_controls=0,
+        replacement=True,
+        caliper=None,
+        n_strata=n_strata,
+    )
+    return work, summary
+# -----------------------------
+# Balance + plotting
+# -----------------------------
+def _standardized_mean_differences(df: pd.DataFrame, treatment_col: str, covariates: List[str]) -> pd.DataFrame:
+    """Compute the standardized mean difference (treated minus control) per covariate.
+
+    Covariates missing from `df` are skipped; values are coerced to numeric and
+    non-numeric entries become NaN. If `df` has a `__weight__` column, weighted
+    means and weighted (population-form) variances are used; otherwise plain
+    means and sample variances (ddof=1). The denominator is the square root of
+    the mean of the two group variances, ignoring a NaN variance.
+
+    Returns:
+        DataFrame with columns "variable", "smd", "abs_smd" (empty when `df`
+        is None or has no rows).
+    """
+    result_columns = ["variable", "smd", "abs_smd"]
+    if df is None or len(df) == 0:
+        return pd.DataFrame(columns=result_columns)
+    def weighted_mean(values, weights):
+        total = float(weights.sum())
+        if total == 0:
+            return np.nan
+        return float(np.sum(weights * values) / total)
+    def weighted_var(values, weights, center):
+        total = float(weights.sum())
+        if total == 0:
+            return np.nan
+        return float(np.sum(weights * (values - center) ** 2) / total)
+    treated_rows = df[treatment_col] == 1
+    control_rows = df[treatment_col] == 0
+    weighted = "__weight__" in df.columns
+    records = []
+    for name in covariates:
+        if name not in df.columns:  # guard for missing cols
+            continue
+        x_t = pd.to_numeric(df.loc[treated_rows, name], errors="coerce")
+        x_c = pd.to_numeric(df.loc[control_rows, name], errors="coerce")
+        if weighted:
+            w_t = pd.to_numeric(df["__weight__"].loc[treated_rows], errors="coerce")
+            w_c = pd.to_numeric(df["__weight__"].loc[control_rows], errors="coerce")
+            # Keep only rows where both the value and its weight are present.
+            keep_t = x_t.notna() & w_t.notna()
+            keep_c = x_c.notna() & w_c.notna()
+            x_t, w_t = x_t[keep_t], w_t[keep_t]
+            x_c, w_c = x_c[keep_c], w_c[keep_c]
+            mean_t = weighted_mean(x_t, w_t)
+            mean_c = weighted_mean(x_c, w_c)
+            var_t = weighted_var(x_t, w_t, mean_t)
+            var_c = weighted_var(x_c, w_c, mean_c)
+        else:
+            mean_t, mean_c = x_t.mean(), x_c.mean()
+            var_t, var_c = x_t.var(ddof=1), x_c.var(ddof=1)
+        if np.isnan(var_t) and np.isnan(var_c):
+            denom = np.nan
+        else:
+            denom = np.sqrt(np.nanmean([var_t, var_c]))
+        if denom == 0 or np.isnan(denom):
+            smd = np.nan
+        else:
+            smd = (mean_t - mean_c) / float(denom)
+        records.append((name, smd, abs(smd) if pd.notna(smd) else np.nan))
+    return pd.DataFrame(records, columns=result_columns)
+def _plot_love_before_after(
+    smd_pre: pd.DataFrame,
+    smd_post: pd.DataFrame,
+    title: str,
+    *,
+    empty_msg: Optional[str] = None,
+    xmax: Optional[float] = None,
+    fixed_order: Optional[List[str]] = None,
+    threshold: float = 0.10,
+) -> Image.Image:
+    """Render a Love plot of |SMD| before vs. after adjustment as a PIL image.
+
+    Args:
+        smd_pre: Balance table before adjustment ("variable", "abs_smd" columns).
+        smd_post: Balance table after adjustment; may be None or empty.
+        title: Plot title.
+        empty_msg: Text shown when there is nothing to plot.
+        xmax: Upper x-axis limit; left to matplotlib when None.
+        fixed_order: Variable order for the y-axis. When omitted, variables are
+            sorted by descending "before" value with missing values last.
+        threshold: Position of the dashed reference line (default 0.10, the
+            value that was previously hard-coded).
+
+    Returns:
+        The rendered figure as a PNG-backed PIL image.
+    """
+    def frame(df: Optional[pd.DataFrame], key: str) -> pd.DataFrame:
+        if df is None or df.empty:
+            return pd.DataFrame(columns=["variable", key])
+        return df[["variable", "abs_smd"]].rename(columns={"abs_smd": key})
+    def to_image(fig) -> Image.Image:
+        # Close the figure even if rendering fails so it is never leaked.
+        buf = io.BytesIO()
+        try:
+            fig.savefig(buf, format="png", dpi=150, bbox_inches="tight")
+        finally:
+            plt.close(fig)
+        buf.seek(0)
+        return Image.open(buf)
+    m = pd.merge(frame(smd_pre, "before"), frame(smd_post, "after"), on="variable", how="outer")
+    m = m[~(m["before"].isna() & m["after"].isna())]
+    if m.empty:
+        fig, ax = plt.subplots(figsize=(6.5, 3.0))
+        ax.text(0.5, 0.5, empty_msg or "No balance data to plot.", ha="center", va="center", transform=ax.transAxes)
+        ax.axis("off")
+        return to_image(fig)
+    # Stable ordering across methods; both branches build a new frame instead
+    # of mutating the filtered one in place.
+    if fixed_order:
+        cat = pd.Categorical(m["variable"], categories=fixed_order, ordered=True)
+        m = m.assign(_ord=cat).sort_values("_ord").drop(columns=["_ord"])
+    else:
+        m = m.sort_values("before", ascending=False, na_position="last")
+    m = m.reset_index(drop=True)
+    y = np.arange(len(m))
+    fig, ax = plt.subplots(figsize=(7.5, max(3.0, 0.6 * len(m))))
+    # Connector line only where both the before and after values exist.
+    for i, (bi, ai) in enumerate(zip(m["before"], m["after"])):
+        if pd.notna(bi) and pd.notna(ai):
+            ax.plot([bi, ai], [i, i], linewidth=1)
+    ax.scatter(m["before"], y, label="Before", zorder=3)
+    ax.scatter(m["after"],  y, label="After",  zorder=3, marker="s")
+    ax.set_yticks(y)
+    ax.set_yticklabels(m["variable"].tolist())
+    ax.invert_yaxis()
+    ax.set_xlabel("|SMD|")
+    ax.set_title(title)
+    ax.axvline(threshold, linestyle="--")
+    ax.grid(axis="x", linestyle=":", alpha=0.4)
+    if xmax is not None:
+        ax.set_xlim(0, xmax)
+    ax.legend(loc="center left", bbox_to_anchor=(1.02, 0.5), frameon=False)
+    fig.tight_layout(rect=[0.0, 0.0, 0.82, 1.0])
+    return to_image(fig)
+# -----------------------------
+# Public pipeline (legacy)
+# -----------------------------
+def run_propensity_analysis(
+    data: Union[pd.DataFrame, str],
+    treatment_col: str,
+    feature_cols: List[str],
+    outcome_col: str = "",
+    matching_method: str = "nearest",
+    caliper: Optional[float] = None,
+    min_controls: int = 1,
+    max_controls: int = 1,
+    replacement: bool = True,
+    n_strata: int = 5,
+    include_balance: bool = True,
+) -> Tuple[str, Optional[Image.Image]]:
+    """Fit a propensity model, match or stratify, and summarise the result.
+
+    Args:
+        data: A DataFrame or a path to a CSV file.
+        treatment_col: Column coercible to binary 0/1.
+        feature_cols: Covariates for the propensity model; names missing from
+            the data are ignored.
+        outcome_col: Optional outcome column. It is shown in the balance view
+            but is NOT used to fit the propensity model unless it is also
+            listed in `feature_cols`.
+        matching_method: "nearest", "caliper" or "stratification".
+        caliper: Maximum score distance; required for "caliper".
+        min_controls, max_controls, replacement: Matching settings.
+        n_strata: Number of quantile strata for "stratification".
+        include_balance: Whether to compute SMDs and the Love plot.
+
+    Returns:
+        (report, love_plot). Any error is caught and reported as
+        ("An unexpected error occurred: ...", None) rather than raised.
+    """
+    try:
+        # Load data
+        if isinstance(data, str):
+            if not data.lower().endswith(".csv"):
+                raise ValueError("Only CSV paths are supported when passing a string to `data`.")
+            df = pd.read_csv(data)
+        elif isinstance(data, pd.DataFrame):
+            df = data.copy()
+        else:
+            raise ValueError("`data` must be a pandas DataFrame or a CSV file path.")
+        if treatment_col not in df.columns:
+            raise ValueError(f"Treatment column '{treatment_col}' not found in data.")
+        # Balance view: requested features plus the outcome (if present).
+        covariates_used = _select_features(df, feature_cols, outcome_col if outcome_col else None)
+        # Propensity model: requested features only. The outcome was appended
+        # for the balance view and must not leak into the treatment model.
+        model_features = [c for c in covariates_used if c in feature_cols]
+        if not model_features:
+            raise ValueError("None of the requested feature columns were found in the data.")
+        # Fit propensity model
+        df[treatment_col] = _ensure_binary(df[treatment_col])
+        ps, _ = _fit_propensity(df, treatment_col, model_features)
+        df["__ps__"] = ps
+        # Matching
+        method = (matching_method or "nearest").lower()
+        if method == "nearest":
+            # Pure nearest-neighbour matching: any caliper argument is ignored.
+            matched, summary = _greedy_nearest(
+                df, "__ps__", treatment_col, min_controls, max_controls, replacement, caliper=None
+            )
+        elif method == "caliper":
+            if caliper is None or caliper < 0:
+                raise ValueError("Caliper matching requires a non-negative `caliper`.")
+            matched, summary = _caliper_matching(
+                df, "__ps__", treatment_col, min_controls, max_controls, replacement, caliper
+            )
+        elif method == "stratification":
+            matched, summary = _stratification(df, "__ps__", treatment_col, n_strata)
+        else:
+            raise ValueError("matching_method must be one of {'nearest','caliper','stratification'}.")
+        # Build report (every branch above either set `summary` or raised).
+        report_lines = [
+            f"Matching Method: {summary.method}",
+            f"Knobs -> min_controls={summary.min_controls}, "
+            f"max_controls={summary.max_controls}, "
+            f"replacement={summary.replacement}, "
+            f"caliper={summary.caliper}, "
+            f"n_strata={summary.n_strata}",
+            f"Matched counts -> treated_rows={summary.treated_rows}, "
+            f"control_rows={summary.control_rows}, "
+            f"unique_controls={summary.unique_controls}",
+        ]
+        if summary.caliper is not None:
+            binding = bool(summary.caliper_dropped_treated > 0)
+            report_lines.append(f"caliper_dropped_treated={summary.caliper_dropped_treated}")
+            report_lines.append(f"caliper_binding={'True' if binding else 'False'}")
+        love_img = None
+        if include_balance:
+            smd_pre = _standardized_mean_differences(df, treatment_col, covariates_used)
+            smd_post = _standardized_mean_differences(matched, treatment_col, covariates_used) if not matched.empty else pd.DataFrame()
+            # Unified x-axis from the PRE table (identical across methods) + small headroom.
+            if not smd_pre.empty and smd_pre["abs_smd"].notna().any():
+                max_pre = float(np.nanmax(smd_pre["abs_smd"]))
+                xmax = max(0.10, max_pre) * 1.1
+            else:
+                xmax = 0.5  # safe fallback
+            # Fixed order = by PRE imbalance, so plots from all methods align.
+            fixed_order = smd_pre.sort_values("abs_smd", ascending=False)["variable"].tolist()
+            love_img = _plot_love_before_after(
+                smd_pre, smd_post,
+                title=f"Love Plot — {summary.method.title()} Matching",
+                empty_msg="No matched sample to assess." if matched.empty else None,
+                xmax=xmax,
+                fixed_order=fixed_order,
+            )
+            preview = (smd_post if not smd_post.empty else smd_pre).sort_values("abs_smd", ascending=False).head(10)
+            report_lines.append("\nBalance (|SMD|) summary (first 10):")
+            for _, r in preview.iterrows():
+                val = (np.round(r["abs_smd"], 4) if pd.notna(r["abs_smd"]) else "NaN")
+                report_lines.append(f"  {r['variable']}: {val}")
+        return "\n".join(report_lines), love_img
+    except Exception as e:
+        # Deliberate boundary: the caller receives the failure as report text.
+        return f"An unexpected error occurred: {e}", None
+# -----------------------------
+# MODIFIED: Data export helpers for v2 (Edges & Units)
+# -----------------------------
+# Column order of the exported edges table (one row per treated-control pair).
+# Also used to build an empty edges frame with the same schema.
+_EDGES_COLUMNS = [
+    "method", "group_id", "treated_unit_id", "control_unit_id",
+    "neighbor_rank", "distance", "edge_weight",
+    "replacement", "min_controls", "max_controls", "caliper", "n_strata"
+]  # MODIFIED
+def _build_edges_units_nearest_caliper(
+    df: pd.DataFrame,
+    ps_col: str,
+    treatment_col: str,
+    min_controls: int,
+    max_controls: int,
+    replacement: bool,
+    caliper: Optional[float],
+    method: str,
+) -> Tuple[pd.DataFrame, pd.DataFrame, MatchSummary]:
+    """Build exportable units/edges tables for nearest or caliper matching.
+
+    Each treated row (in frame order) is linked to up to `max_controls` nearest
+    controls on `ps_col`, honouring `caliper` and `replacement`; a treated row
+    with fewer than `min_controls` candidates is skipped.
+
+    Returns:
+        (units_df, edges_df, summary).
+        * edges_df: one row per treated-control pair (`_EDGES_COLUMNS`), with
+          `edge_weight` = 1/k for a treated unit that received k controls.
+        * units_df: one row per matched treated unit followed by one row per
+          edge for its control, carrying the original non-helper columns.
+        Both frames are empty (with schema) when nothing was matched.
+    """
+    work = df.copy()
+    work["__unit_id__"] = np.arange(len(work))  # stable integer id == row position
+    treated = work[work[treatment_col] == 1]
+    control = work[work[treatment_col] == 0]
+    export_cols = ["unit_id", "role", "ps", "treatment", "group_id", "method",
+                   "replacement", "min_controls", "max_controls", "caliper", "n_strata"]
+    caliper_value = float(caliper) if caliper is not None else np.nan
+    caliper_active = caliper is not None and caliper >= 0
+    used = set()
+    edges_records: List[Dict] = []
+    dropped_due_to_caliper = 0
+    # Build edges per treated -> top-K nearest controls (respect replacement & optional caliper)
+    for _, t in treated.iterrows():
+        diffs = control.copy()
+        diffs["__dist__"] = (diffs[ps_col] - t[ps_col]).abs()
+        if caliper_active:
+            diffs = diffs[diffs["__dist__"] <= caliper]
+        if not replacement:
+            diffs = diffs[~diffs["__unit_id__"].isin(used)]
+        diffs = diffs.sort_values("__dist__", ascending=True).head(max_controls)
+        if len(diffs) < min_controls:
+            if caliper_active:
+                dropped_due_to_caliper += 1
+            continue
+        # neighbor_rank assigned in sorted order
+        for rank, (_, crow) in enumerate(diffs.iterrows(), start=1):
+            edges_records.append({
+                "method": method,
+                "group_id": int(t["__unit_id__"]),
+                "treated_unit_id": int(t["__unit_id__"]),
+                "control_unit_id": int(crow["__unit_id__"]),
+                "neighbor_rank": int(rank),
+                "distance": float(crow["__dist__"]),
+                # edge_weight filled later after we know k-per-group
+                "edge_weight": np.nan,
+                "replacement": bool(replacement),
+                "min_controls": int(min_controls),
+                "max_controls": int(max_controls),
+                "caliper": caliper_value,
+                "n_strata": np.nan,
+            })
+            if not replacement:
+                used.add(int(crow["__unit_id__"]))
+    if not edges_records:
+        # Empty frames with proper schema
+        edges_df = pd.DataFrame(columns=_EDGES_COLUMNS)
+        units_df = pd.DataFrame(columns=export_cols)
+        summary = MatchSummary(method=method, treated_rows=0, control_rows=0, unique_controls=0,
+                               min_controls=min_controls, max_controls=max_controls,
+                               replacement=replacement, caliper=caliper, n_strata=None,
+                               caliper_dropped_treated=dropped_due_to_caliper)
+        return units_df, edges_df, summary
+    edges_df = pd.DataFrame.from_records(edges_records)[_EDGES_COLUMNS]
+    # Fill edge_weight = 1/k within each treated group (synthetic control equal weights)
+    sizes = edges_df.groupby("group_id")["control_unit_id"].transform("count")
+    edges_df["edge_weight"] = 1.0 / sizes
+    base_cols = [c for c in work.columns if not c.startswith("__")]
+    settings = {
+        "method": method,
+        "replacement": bool(replacement),
+        "min_controls": int(min_controls),
+        "max_controls": int(max_controls),
+        "caliper": caliper_value,
+        "n_strata": np.nan,
+    }
+    # Treated rows (one per group)
+    tre = work[work["__unit_id__"].isin(edges_df["group_id"].unique())]
+    tre_df = tre.assign(
+        unit_id=tre["__unit_id__"].astype(int),
+        role="treated",
+        ps=tre[ps_col].astype(float),
+        treatment=1,
+        group_id=tre["__unit_id__"].astype(int),
+        **settings,
+    )[base_cols + export_cols]
+    # Controls (one row per edge, allows replacement across groups).
+    # `__unit_id__` is the row position, so a single positional take replaces
+    # the former per-edge scan of the whole frame and keeps column dtypes.
+    ctrl = work.iloc[edges_df["control_unit_id"].to_numpy()]
+    ctrl_df = ctrl.assign(
+        unit_id=ctrl["__unit_id__"].astype(int),
+        role="control",
+        ps=ctrl[ps_col].astype(float),
+        treatment=0,
+        group_id=edges_df["group_id"].to_numpy(),
+        **settings,
+    )[base_cols + export_cols]
+    units_df = pd.concat([tre_df, ctrl_df], ignore_index=True)
+    summary = MatchSummary(
+        method=method,
+        treated_rows=tre_df.shape[0],
+        control_rows=ctrl_df.shape[0],
+        unique_controls=int(edges_df["control_unit_id"].nunique()),
+        min_controls=min_controls,
+        max_controls=max_controls,
+        replacement=replacement,
+        caliper=caliper,
+        n_strata=None,
+        caliper_dropped_treated=dropped_due_to_caliper,
+    )
+    return units_df, edges_df, summary
+def _build_units_stratification(
+    df: pd.DataFrame,
+    ps_col: str,
+    treatment_col: str,
+    n_strata: int,
+) -> Tuple[pd.DataFrame, pd.DataFrame, MatchSummary]:
+    """Build the exportable units table for stratification.
+
+    Runs `_stratification` and exposes its helper columns under public names
+    (stratum, weight, balanced_stratum, ...). Stratification defines no
+    treated-control pairs, so the edges frame is empty with the standard schema.
+
+    Returns:
+        (units_df, edges_df, summary).
+    """
+    indexed = df.copy()
+    indexed["__unit_id__"] = np.arange(len(indexed))
+    strat_df, summary = _stratification(indexed, ps_col, treatment_col, n_strata)
+    # Original (non-helper) columns come first, followed by the export columns.
+    original_cols = [c for c in strat_df.columns if not c.startswith("__")]
+    export_cols = [
+        "unit_id", "role", "ps", "treatment", "group_id", "method",
+        "replacement", "min_controls", "max_controls", "caliper", "n_strata",
+        "weight", "balanced_stratum", "stratum"
+    ]
+    stratum_ids = strat_df["__stratum__"].astype(int)
+    units = strat_df.assign(
+        unit_id=strat_df["__unit_id__"].astype(int),
+        role=strat_df["__role__"],
+        ps=strat_df[ps_col].astype(float),
+        treatment=strat_df[treatment_col].astype(int),
+        group_id=stratum_ids,
+        method="stratification",
+        replacement=True,
+        min_controls=0,
+        max_controls=0,
+        caliper=np.nan,
+        n_strata=int(n_strata),
+        weight=strat_df["__weight__"].astype(float),
+        balanced_stratum=strat_df["__balanced_stratum__"].astype(bool),
+        stratum=stratum_ids,
+    )
+    units_df = units[original_cols + export_cols]
+    edges_df = pd.DataFrame(columns=_EDGES_COLUMNS)  # no combinatorial pairings for stratification
+    return units_df, edges_df, summary
+# -----------------------------
+# API returning exportable DataFrames
+# -----------------------------
+def run_propensity_analysis_v2(
+    data: Union[pd.DataFrame, str],
+    treatment_col: str,
+    feature_cols: List[str],
+    outcome_col: str = "",
+    matching_method: str = "nearest",
+    caliper: Optional[float] = None,
+    min_controls: int = 1,
+    max_controls: int = 1,
+    replacement: bool = True,
+    n_strata: int = 5,
+    include_balance: bool = True,
+    return_dataframes: bool = True,
+) -> Tuple[str, Optional[Image.Image], Optional[pd.DataFrame], Optional[pd.DataFrame]]:
+    """Run the propensity analysis and also return exportable DataFrames.
+
+    The propensity model used for the export frames is fitted on the requested
+    `feature_cols` only; `outcome_col` is never used as a predictor unless it
+    is also listed in `feature_cols`. The report and Love plot are produced by
+    `run_propensity_analysis`, which loads nothing new but refits its own model
+    on the same data.
+
+    Returns:
+        report (str), love_plot (PIL.Image or None), units_df (or None), edges_df (or None)
+
+    Raises:
+        ValueError: For unsupported `data`, a missing treatment column, no
+            usable feature columns, an invalid caliper, or an unknown method.
+    """
+    # Load data (same behavior as legacy)
+    if isinstance(data, str):
+        if not data.lower().endswith(".csv"):
+            raise ValueError("Only CSV paths are supported when passing a string to `data`.")
+        df = pd.read_csv(data)
+    elif isinstance(data, pd.DataFrame):
+        df = data.copy()
+    else:
+        raise ValueError("`data` must be a pandas DataFrame or a CSV file path.")
+    if treatment_col not in df.columns:
+        raise ValueError(f"Treatment column '{treatment_col}' not found in data.")
+    # Prepare covariates and PS. `_select_features` appends the outcome for the
+    # balance view; drop it again so it cannot leak into the treatment model.
+    covariates_used = _select_features(df, feature_cols, outcome_col if outcome_col else None)
+    model_features = [c for c in covariates_used if c in feature_cols]
+    if not model_features:
+        raise ValueError("None of the requested feature columns were found in the data.")
+    df[treatment_col] = _ensure_binary(df[treatment_col])
+    ps, _ = _fit_propensity(df, treatment_col, model_features)
+    df["__ps__"] = ps
+    method = (matching_method or "nearest").lower()
+    units_df: Optional[pd.DataFrame] = None
+    edges_df: Optional[pd.DataFrame] = None
+    # Build export frames by method (correct multi-match semantics)
+    if method == "nearest":
+        units_df, edges_df, _ = _build_edges_units_nearest_caliper(
+            df, "__ps__", treatment_col, min_controls, max_controls, replacement, caliper=None, method="nearest"
+        )
+    elif method == "caliper":
+        if caliper is None or caliper < 0:
+            raise ValueError("Caliper matching requires a non-negative `caliper`.")
+        units_df, edges_df, _ = _build_edges_units_nearest_caliper(
+            df, "__ps__", treatment_col, min_controls, max_controls, replacement, caliper=caliper, method="caliper"
+        )
+    elif method == "stratification":
+        units_df, edges_df, _ = _build_units_stratification(df, "__ps__", treatment_col, n_strata=n_strata)
+    else:
+        raise ValueError("matching_method must be one of {'nearest','caliper','stratification'}.")
+    # Produce report + plot using the legacy function to preserve visuals/diagnostics.
+    # Settings that do not apply to the chosen method are passed as neutral values.
+    pairwise = method in ("nearest", "caliper")
+    report, love_img = run_propensity_analysis(
+        data=df,  # already a DataFrame with __ps__
+        treatment_col=treatment_col,
+        feature_cols=feature_cols or [],
+        outcome_col=outcome_col or "",
+        matching_method=method,
+        caliper=(caliper if method == "caliper" else None),
+        min_controls=min_controls if pairwise else 0,
+        max_controls=max_controls if pairwise else 0,
+        replacement=replacement if pairwise else True,
+        n_strata=n_strata if method == "stratification" else 5,
+        include_balance=include_balance,
+    )
+    if not return_dataframes:
+        return report, love_img, None, None
+    return report, love_img, units_df, edges_df