Spaces:
Sleeping
Sleeping
"""
Missing Value Analyzer — Statistically Rigorous Pipeline
=========================================================
Phases:
1 Upload CSV & Train/Test Split
2 Missing Value Overview (train set only)
3 Per-Column Diagnostics (Tables for all tests)
4 Imputation Feasibility Gate (KDE plots, Variance %, New Outliers)
5 Final Report & Recommendations
"""
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib | |
| matplotlib.use("Agg") | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from scipy import stats | |
| from scipy.stats import chi2_contingency, ttest_ind, norm, chi2 | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.impute import KNNImputer | |
| from sklearn.experimental import enable_iterative_imputer | |
| from sklearn.impute import IterativeImputer | |
| import warnings | |
| warnings.filterwarnings("ignore") | |
# ─────────────────────────── Page config ───────────────────────────
# Browser-tab title/icon plus a wide layout with the sidebar opened by default.
_PAGE_SETTINGS = {
    "page_title": "Missing Value Analyzer",
    "page_icon": "π¬",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)
# ─────────────────────────── CSS ───────────────────────────────────
# App-wide stylesheet: sidebar colours, metric/stat boxes, the mechanism
# verdict cards (MCAR / MAR / MNAR) and the theory/test-header helpers that
# the render functions reference through class names in raw HTML.
_APP_STYLESHEET = """
<style>
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
html,body,[class*="css"]{font-family:'Inter',sans-serif;}
section[data-testid="stSidebar"]{background:#17172b;}
section[data-testid="stSidebar"] *{color:#ffffff !important;}
section[data-testid="stSidebar"] hr{border-color:#ffffff33 !important;}
.main-title{font-size:2rem;font-weight:700;color:#17172b;margin-bottom:.2rem;}
.main-sub{font-size:1rem;color:#6060a0;margin-bottom:1.5rem;}
.metric-box{background:#f5f3ee;border-radius:8px;padding:12px 16px;text-align:center;margin-bottom:8px;}
.metric-val{font-size:1.4rem;font-weight:700;color:#17172b !important;}
.metric-lbl{font-size:.78rem;color:#6060a0 !important;margin-top:2px;}
.big-stat-box{border-radius:12px;padding:20px 24px;text-align:center;margin-bottom:8px;min-height:110px;}
.big-stat-val{font-size:2.0rem;font-weight:800;margin-bottom:4px;line-height:1.1;}
.big-stat-lbl{font-size:.80rem;font-weight:600;opacity:0.85;text-transform:uppercase;letter-spacing:.05em;}
.big-stat-sub{font-size:.76rem;opacity:0.65;margin-top:6px;}
.stat-ok{background:#edfaf3;border:2px solid #89d9ac;}
.stat-ok .big-stat-val,.stat-ok .big-stat-lbl{color:#0a5c30 !important;}
.stat-warn{background:#fffaeb;border:2px solid #f0cc7a;}
.stat-warn .big-stat-val,.stat-warn .big-stat-lbl{color:#7a4f00 !important;}
.stat-fail{background:#fff0ed;border:2px solid #f5a898;}
.stat-fail .big-stat-val,.stat-fail .big-stat-lbl{color:#900000 !important;}
.card-mcar{background:#edfaf3;border:2px solid #89d9ac;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
.card-mar {background:#fffaeb;border:2px solid #f0cc7a;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
.card-mnar{background:#fff0ed;border:2px solid #f5a898;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
.card-info{background:#eef2ff;border:2px solid #bdc8f5;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
.card-warn{background:#fff8e1;border:2px solid #ffe082;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
.card-danger{background:#fde8e8;border:2px solid #f5a8a8;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
.card-ok{background:#edfaf3;border:2px solid #89d9ac;border-radius:10px;padding:14px 18px;margin-bottom:10px; color:#1a1a2e !important;}
.card-mcar *, .card-mar *, .card-mnar *, .card-info *, .card-warn *, .card-danger *, .card-ok * {color: #1a1a2e !important;}
.verdict-label{font-size:1.1rem;font-weight:700;margin-bottom:4px;}
.verdict-desc{font-size:.88rem;color:#333 !important;}
code{background:#e8e8eb;padding:2px 6px;border-radius:4px;font-size:.85rem; color:#d6336c !important;}
hr.divider{border:none;border-top:2px solid #e0ddd8;margin:1.5rem 0;}
.theory-box {background:#fafafa; border-left:4px solid #4f8ef7; border-radius:4px; padding:12px 18px; margin-bottom:16px;}
.theory-box h4 {color:#17172b; margin-bottom:6px; font-size:1.05rem;}
.theory-box p {color:#444; font-size:0.92rem; line-height:1.5;}
.stat-highlight { font-size: 1.2rem; font-weight: bold; color: #d6336c; background: #ffe4e1; padding: 2px 8px; border-radius: 4px;}
.test-header{font-size:1.05rem;font-weight:700;color:#17172b;margin:18px 0 8px;}
</style>
"""
st.markdown(_APP_STYLESHEET, unsafe_allow_html=True)
# ────────────────────────────────────────────────────────────────────
# SESSION STATE INIT
# ────────────────────────────────────────────────────────────────────
# Seed every session key the app reads; keys that already hold a value
# (from an earlier rerun of the script) are left untouched.
defaults = {
    "df_full": None,
    "df_train": None,
    "df_test": None,
    "target_col": None,
    "split_ratio": 0.8,
    "col_diagnostics": {},
}
for k, v in defaults.items():
    if k in st.session_state:
        continue
    st.session_state[k] = v
# ────────────────────────────────────────────────────────────────────
# STATISTICAL TEST HELPERS
# ────────────────────────────────────────────────────────────────────
def littles_mcar_test(df: pd.DataFrame, cols_with_missing: list) -> dict:
    """Run a simplified, mean-comparison stand-in for Little's MCAR test.

    For each numeric column in ``cols_with_missing`` the rows are split by
    whether that column is missing. For every *other* numeric column, the
    between-group sum of squares around the column's overall mean, divided by
    its overall variance, is added to a pooled chi-square statistic; each such
    comparison contributes one degree of freedom.

    Args:
        df: Data to analyse.
        cols_with_missing: Column names whose missingness pattern is tested.
            Non-numeric names are ignored.

    Returns:
        A dict that always contains ``chi2``, ``p_value``, ``verdict`` and
        ``reject_mcar``. When at least one comparison was possible it also
        contains ``df`` (degrees of freedom). If no comparison was possible,
        ``chi2`` and ``p_value`` are ``None`` and ``reject_mcar`` is ``False``.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    chi2_total, df_total = 0.0, 0

    for col in cols_with_missing:
        if col not in numeric_cols:
            continue
        missing_mask = df[col].isnull()
        # Both groups need a handful of rows for the group means to be usable.
        if missing_mask.sum() < 5 or (~missing_mask).sum() < 5:
            continue
        for other in numeric_cols:
            if other == col:
                continue
            g1 = df.loc[missing_mask, other].dropna()
            g2 = df.loc[~missing_mask, other].dropna()
            if len(g1) < 3 or len(g2) < 3:
                continue
            grand_mean, grand_var = df[other].mean(), df[other].var()
            # A (near-)constant column carries no information and would
            # blow up the ratio below.
            if grand_var < 1e-12:
                continue
            between_ss = (
                len(g1) * (g1.mean() - grand_mean) ** 2
                + len(g2) * (g2.mean() - grand_mean) ** 2
            )
            chi2_total += between_ss / grand_var
            df_total += 1

    if df_total == 0:
        # Same keys as the success path (minus "df") so callers can index
        # "reject_mcar" without special-casing this branch.
        return {
            "chi2": None,
            "p_value": None,
            "verdict": "Insufficient numeric data",
            "reject_mcar": False,
        }

    # sf() is the upper-tail probability; it avoids the cancellation that
    # 1 - cdf() suffers for very large statistics.
    p_val = float(chi2.sf(chi2_total, df_total))
    reject = p_val < 0.05
    return {
        "chi2": round(float(chi2_total), 4),
        "df": df_total,
        "p_value": round(p_val, 4),
        "verdict": "Reject MCAR" if reject else "Fail to reject MCAR",
        "reject_mcar": reject,
    }
def feature_dependency_tests(df: pd.DataFrame, col: str) -> dict:
    """Test whether other columns differ between rows where ``col`` is missing vs. observed.

    Numeric columns are compared with a two-sided z-test when both groups have
    at least 30 values, otherwise with Welch's t-test. Non-numeric columns are
    compared with a chi-square test on the missing-indicator x category table.
    Comparisons whose p-value is undefined (NaN/inf, e.g. both groups are
    constant) are skipped so they do not dilute the share of significant
    features.

    Args:
        df: Data to analyse.
        col: Column whose missingness defines the two groups.

    Returns:
        Dict with ``results`` (per-feature dicts: ``test``, ``stat``,
        ``p_value``, ``significant``, ``type``), ``n_significant``,
        ``total_tested``, ``sig_pct`` and a human-readable ``signal``.
    """
    missing_mask = df[col].isnull()
    if missing_mask.sum() < 5 or (~missing_mask).sum() < 5:
        return {
            "results": {},
            "n_significant": 0,
            "total_tested": 0,
            "sig_pct": 0.0,
            "signal": "Insufficient data",
        }

    missing_flag = missing_mask.astype(int)
    results = {}
    for other in df.columns:
        if other == col:
            continue
        g_miss = df.loc[missing_mask, other].dropna()
        g_obs = df.loc[~missing_mask, other].dropna()
        if len(g_miss) < 3 or len(g_obs) < 3:
            continue
        try:
            if pd.api.types.is_numeric_dtype(df[other]):
                n1, n2 = len(g_miss), len(g_obs)
                if min(n1, n2) >= 30:
                    se = np.sqrt(g_miss.var() / n1 + g_obs.var() / n2)
                    if se < 1e-12:
                        continue
                    stat = (g_miss.mean() - g_obs.mean()) / se
                    p_val = 2 * norm.sf(abs(stat))
                    test_name = "z-test"
                else:
                    stat, p_val = ttest_ind(g_miss, g_obs, equal_var=False)
                    test_name = "Welch t-test"
                feature_type = "numeric"
            else:
                ct = pd.crosstab(missing_flag, df[other])
                if ct.shape[0] < 2 or ct.shape[1] < 2:
                    continue
                stat, p_val, _, _ = chi2_contingency(ct)
                test_name, feature_type = "chiΒ²", "categorical"

            # An undefined p-value is not evidence either way; drop it,
            # mirroring the zero-standard-error skip in the z-test branch.
            if not np.isfinite(p_val):
                continue
            results[other] = {
                "test": test_name,
                "stat": round(float(stat), 4),
                "p_value": round(float(p_val), 4),
                "significant": bool(p_val < 0.05),
                "type": feature_type,
            }
        except Exception:
            # Best effort by design: one column that cannot be tested must
            # not abort the diagnostics for every other column.
            continue

    n_sig = sum(1 for r in results.values() if r["significant"])
    sig_pct = n_sig / max(len(results), 1) * 100
    if sig_pct == 0:
        signal = "No features differ significantly"
    else:
        signal = f"{n_sig}/{len(results)} features differ (p<0.05)"
    return {
        "results": results,
        "n_significant": n_sig,
        "total_tested": len(results),
        "sig_pct": round(sig_pct, 1),
        "signal": signal,
    }
def target_dependency_test(df: pd.DataFrame, col: str, target_col: str) -> dict:
    """Test whether the target differs between rows where ``col`` is missing vs. observed.

    Numeric targets use a two-sided z-test (both groups >= 30 values) or
    Welch's t-test; ``diff_pct`` is the absolute difference of group means
    relative to the observed-group mean. Non-numeric targets use a chi-square
    test; ``diff_pct`` is the largest gap, in percentage points, between the
    two groups' share of any single class.

    Args:
        df: Data to analyse.
        col: Column whose missingness defines the two groups.
        target_col: Name of the target column.

    Returns:
        Dict with ``p_value``, ``significant`` and ``signal``; ``diff_pct`` is
        included whenever a test was actually run. Failures are reported in
        ``signal`` rather than raised.
    """
    insufficient = {"p_value": None, "signal": "Insufficient data", "significant": False}
    missing_mask = df[col].isnull()
    if missing_mask.sum() < 5 or (~missing_mask).sum() < 5:
        return insufficient
    try:
        g_miss = df.loc[missing_mask, target_col].dropna()
        g_obs = df.loc[~missing_mask, target_col].dropna()
        # The target itself may be missing in one whole group; nothing can
        # be compared in that case.
        if g_miss.empty or g_obs.empty:
            return insufficient

        if pd.api.types.is_numeric_dtype(df[target_col]):
            n1, n2 = len(g_miss), len(g_obs)
            if min(n1, n2) >= 30:
                se = np.sqrt(g_miss.var() / n1 + g_obs.var() / n2)
                if se < 1e-12:
                    return {"p_value": None, "signal": "Zero variance", "significant": False}
                z_stat = (g_miss.mean() - g_obs.mean()) / se
                p_val = 2 * norm.sf(abs(z_stat))
            else:
                _, p_val = ttest_ind(g_miss, g_obs, equal_var=False)
            diff_pct = abs(g_miss.mean() - g_obs.mean()) / max(abs(g_obs.mean()), 1e-9) * 100
        else:
            ct = pd.crosstab(missing_mask.astype(int), df[target_col])
            _, p_val, _, _ = chi2_contingency(ct)
            # Compare the SAME class in both groups. Comparing each group's
            # own most frequent class hides the difference whenever the two
            # groups have different modal classes.
            share_miss = g_miss.value_counts(normalize=True)
            share_obs = g_obs.value_counts(normalize=True)
            diff_pct = share_miss.sub(share_obs, fill_value=0.0).abs().max() * 100

        p_val = float(p_val)
        diff_pct = float(diff_pct)
        sig = p_val < 0.05
        if sig:
            signal = f"Significant β target differs by {diff_pct:.1f}%"
        else:
            signal = f"Not significant (p={p_val:.4f})"
        return {
            "p_value": round(p_val, 4),
            "significant": sig,
            "diff_pct": round(diff_pct, 2),
            "signal": signal,
        }
    except Exception as e:
        # Deliberate best effort: the caller displays the message instead of
        # crashing the page.
        return {"p_value": None, "signal": f"Error: {e}", "significant": False}
def classify_mechanism(t_feat, t_target, little):
    """Map the three test results to a (mechanism, confidence, explanation) triple.

    Precedence: a significant target difference of at least 5% wins (MNAR),
    then any share of significant features (MAR), then a rejected MCAR test
    (MAR, low confidence); otherwise the column is labelled MCAR.
    """
    target_significant = t_target.get("significant", False)
    target_gap = t_target.get("diff_pct", 0)
    feature_share = t_feat.get("sig_pct", 0)

    # Target-driven evidence is checked first and only when significant.
    if target_significant:
        if target_gap >= 10:
            return "MNAR", "High", "Missingness strongly correlates with the outcome."
        if target_gap >= 5:
            return "MNAR", "Moderate", "Moderate dependency on target. Treat conservatively as MNAR."

    if feature_share > 30:
        return "MAR", "High", "Strong dependency on observed features detected."
    if feature_share > 0:
        return "MAR", "Moderate", "Weak but present dependency on observed features."

    if little.get("reject_mcar"):
        return "MAR", "Low", "Little's test rejects MCAR, but feature tests show weak dependency."

    return "MCAR", "High", "No statistical evidence of systematic missingness."
def run_single_diagnostic(df, col, target_col):
    """Run all three tests for ``col`` and store the verdict in session state.

    The result is written to ``st.session_state["col_diagnostics"][col]``.
    The target-dependency test is skipped (with a placeholder result) when
    ``col`` is the target column itself.
    """
    little_result = littles_mcar_test(df, [col])
    feature_result = feature_dependency_tests(df, col)

    if col == target_col:
        target_result = {
            "p_value": None,
            "significant": False,
            "signal": "Skipped (Is Target)",
            "diff_pct": 0,
        }
    else:
        target_result = target_dependency_test(df, col, target_col)

    mechanism, confidence, explanation = classify_mechanism(
        feature_result, target_result, little_result
    )

    column = df[col]
    record = {
        "mechanism": mechanism,
        "confidence": confidence,
        "explanation": explanation,
        "miss_pct": round(column.isnull().mean() * 100, 2),
        "dtype": str(column.dtype),
        "little": little_result,
        "t_feat": feature_result,
        "t_target": target_result,
    }
    st.session_state["col_diagnostics"][col] = record
# ────────────────────────────────────────────────────────────────────
# IMPUTATION SIMULATION HELPERS
# ────────────────────────────────────────────────────────────────────
def feasibility_checks(df: pd.DataFrame, col: str, target_col: str, impute_method: str) -> dict:
    """Simulate one imputation method on ``col`` and measure how it distorts the column.

    Args:
        df: Data to analyse.
        col: Numeric column to impute.
        target_col: Target column; excluded from the predictor matrix and
            from the correlation comparison.
        impute_method: ``"Mean"``, ``"Median"``, ``"KNN"``; any other value
            selects the iterative (MICE-style) imputer.

    Returns:
        ``{"applicable": False}`` when the column is non-numeric or has fewer
        than five observed values. Otherwise a dict with ``imputed_series``
        and the ``skewness``, ``outliers``, ``variance`` and ``correlation``
        sub-reports, each carrying a ``verdict`` of ``"ok"``, ``"warn"`` or
        ``"fail"``. ``reasons`` lists any fallback that happened.
    """
    series = df[col].dropna()
    if len(series) < 5 or not pd.api.types.is_numeric_dtype(df[col]):
        return {"applicable": False}
    results = {"applicable": True, "escalate_to_knn": False, "reasons": []}

    # -- 1. Impute --
    if impute_method == "Mean":
        imputed_series = df[col].fillna(series.mean())
    elif impute_method == "Median":
        imputed_series = df[col].fillna(series.median())
    else:
        # The column being imputed must be part of the matrix even when it is
        # the target; otherwise it can never be imputed from the others.
        numeric_cols = [
            c for c in df.select_dtypes(include=[np.number]).columns
            if c != target_col or c == col
        ]
        try:
            scaler = StandardScaler()
            scaled = scaler.fit_transform(df[numeric_cols])
            if impute_method == "KNN":
                imputer = KNNImputer(n_neighbors=5)
            else:
                imputer = IterativeImputer(random_state=42, max_iter=10)
            restored = scaler.inverse_transform(imputer.fit_transform(scaled))
            # Keep df's index: the correlation step below aligns on index
            # labels, so a fresh 0..n-1 index would pair the wrong rows.
            imputed_series = pd.Series(
                restored[:, numeric_cols.index(col)], index=df.index, name=col
            )
        except Exception as exc:
            # Best effort: report the fallback instead of hiding it.
            results["reasons"].append(
                f"{impute_method} imputation failed ({exc}); fell back to median fill"
            )
            imputed_series = df[col].fillna(series.median())
    results["imputed_series"] = imputed_series

    # -- 2. Skewness & outliers (1.5 * IQR rule, before vs. after) --
    skew = series.skew()
    q1_b, q3_b = series.quantile(0.25), series.quantile(0.75)
    iqr_b = q3_b - q1_b
    outliers_before = int(((series < q1_b - 1.5 * iqr_b) | (series > q3_b + 1.5 * iqr_b)).sum())
    q1_a, q3_a = imputed_series.quantile(0.25), imputed_series.quantile(0.75)
    iqr_a = q3_a - q1_a
    outliers_after = int(
        ((imputed_series < q1_a - 1.5 * iqr_a) | (imputed_series > q3_a + 1.5 * iqr_a)).sum()
    )
    new_outliers = max(0, outliers_after - outliers_before)

    if impute_method == "Mean":
        skew_verdict = "fail" if abs(skew) > 1 else "ok"
    elif impute_method == "Median":
        skew_verdict = "warn" if abs(skew) > 3 else "ok"
    else:
        skew_verdict = "ok"
    results["skewness"] = {"verdict": skew_verdict, "value": skew, "msg": f"Skewness = {skew:.3f}"}

    out_verdict = "warn" if new_outliers > (len(series) * 0.05) else "ok"
    results["outliers"] = {
        "verdict": out_verdict,
        "new_outliers": new_outliers,
        "outliers_before": outliers_before,
        "outliers_after": outliers_after,
    }

    # -- 3. Variance impact (positive = variance shrank) --
    var_before = series.var()
    var_after = imputed_series.var()
    var_drop_pct = (var_before - var_after) / var_before * 100 if var_before > 1e-12 else 0
    if var_drop_pct <= 10:
        var_verdict = "ok"
    elif var_drop_pct <= 20:
        var_verdict = "warn"
    else:
        var_verdict = "fail"
    var_msg = f"Variance Change: {var_drop_pct:.1f}%"
    results["variance"] = {"verdict": var_verdict, "msg": var_msg, "var_drop_pct": var_drop_pct}

    # -- 4. Correlation preservation (first ten other numeric columns) --
    numeric_others = [
        c for c in df.select_dtypes(include=[np.number]).columns
        if c != col and c != target_col
    ]
    corr_results, max_corr_shift, sign_flip = {}, 0.0, False
    for other in numeric_others[:10]:
        s_before = df[[col, other]].dropna()
        if len(s_before) < 5:
            continue
        r_before = s_before[col].corr(s_before[other])
        r_after = imputed_series.corr(df[other])
        delta = abs(r_before - r_after)
        # A flip only counts when the original correlation was non-trivial.
        flipped = bool((r_before * r_after < 0) and (abs(r_before) > 0.1))
        corr_results[other] = {
            "r_before": round(r_before, 4),
            "r_after": round(r_after, 4),
            "delta": round(delta, 4),
            "sign_flip": flipped,
        }
        max_corr_shift = max(max_corr_shift, delta)
        if flipped:
            sign_flip = True

    if max_corr_shift <= 0.05 and not sign_flip:
        corr_verdict, corr_msg = "ok", f"Max Ξ = {max_corr_shift:.3f} β Correlation well preserved"
    elif sign_flip:
        corr_verdict, corr_msg = "fail", "Sign flip detected! Correlation direction reversed."
    elif max_corr_shift <= 0.10:
        corr_verdict, corr_msg = "warn", f"Max Ξ = {max_corr_shift:.3f} β Moderate correlation shift"
    else:
        corr_verdict, corr_msg = "fail", f"Max Ξ = {max_corr_shift:.3f} β Large correlation shift detected"
    results["correlation"] = {
        "details": corr_results,
        "verdict": corr_verdict,
        "msg": corr_msg,
        "max_shift": round(max_corr_shift, 4),
    }
    return results
def get_auto_recommendation(df, col, target, mechanism, miss_pct, dtype):
    """Pick an imputation strategy label for ``col``.

    Returns one of "Drop Column", "Drop Rows", or a "<method> Imputation" /
    "<method> Imputer" label, suffixed with " + Missing Indicator" for MNAR
    columns and for MAR columns with at least 10% missing. ``dtype`` is
    accepted but not consulted; the column's dtype is read from ``df``.
    """
    # Cheap exits first: mostly-empty columns and tiny random gaps.
    if miss_pct > 70:
        return "Drop Column"
    if mechanism == "MCAR" and miss_pct <= 5:
        return "Drop Rows"

    flag_missing = mechanism == "MNAR" or (mechanism == "MAR" and miss_pct >= 10)
    suffix = " + Missing Indicator" if flag_missing else ""

    # Categorical / non-numeric columns can only take the mode.
    if not pd.api.types.is_numeric_dtype(df[col]):
        return "Mode Imputation" + suffix

    # Numeric: probe a median fill and see how much damage it does.
    median_probe = feasibility_checks(df, col, target, "Median")
    if not median_probe.get("applicable"):
        return "Median Imputation" + suffix

    variance_fine = median_probe["variance"]["var_drop_pct"] <= 20
    correlation_fine = median_probe["correlation"]["verdict"] != "fail"
    abs_skew = abs(median_probe["skewness"].get("value", 0))

    if not (variance_fine and correlation_fine):
        # Simple fills distort the column; use a model-based imputer.
        model_based = "MICE Imputer" if miss_pct > 30 else "KNN Imputer"
        return model_based + suffix

    simple_fill = "Mean Imputation" if abs_skew <= 1 else "Median Imputation"
    return simple_fill + suffix
# ────────────────────────────────────────────────────────────────────
# SIDEBAR NAVIGATION
# ────────────────────────────────────────────────────────────────────
# Labels of the five wizard pages, in navigation order.
STEPS = [
    "1 Β· Upload & Split",
    "2 Β· Overview",
    "3 Β· Column Diagnostics",
    "4 Β· Feasibility Gate",
    "5 Β· Final Report",
]

# Sidebar: page selector plus a short status summary once a split exists.
with st.sidebar:
    st.markdown("## π¬ Missing Value Analyzer")
    st.markdown("---")
    step = st.radio("Navigate:", STEPS, label_visibility="collapsed")
    st.markdown("---")
    _sidebar_train = st.session_state.get("df_train")
    if _sidebar_train is not None:
        _n_rows, _n_cols = _sidebar_train.shape[0], _sidebar_train.shape[1]
        _n_diagnosed = len(st.session_state['col_diagnostics'])
        st.markdown(f"**Train set:** {_n_rows} rows Γ {_n_cols} cols")
        st.markdown(f"**Diagnosed:** {_n_diagnosed} columns")
    st.markdown(
        "<small style='color:#9090c0'>Analysis runs on TRAIN SET only to prevent data leakage.</small>",
        unsafe_allow_html=True,
    )
# ────────────────────────────────────────────────────────────────────
# STEP 1 — UPLOAD & SPLIT
# ────────────────────────────────────────────────────────────────────
def render_step1():
    """Render Step 1: upload a CSV, choose the target column and split train/test.

    On "Confirm & Split" the full frame, both splits (with reset indexes),
    the target column name and the chosen train fraction are stored in
    ``st.session_state``; previously stored column diagnostics are cleared
    because they refer to the old split.
    """
    st.markdown('<div class="main-title">π Step 1 β Upload CSV & Train/Test Split</div>', unsafe_allow_html=True)
    uploaded = st.file_uploader("Choose a CSV file", type=["csv"])
    if not uploaded:
        return st.info("π Upload a CSV file to begin.")

    # A malformed upload should produce a message, not a stack trace.
    try:
        df = pd.read_csv(uploaded)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        return st.error(f"Could not read the uploaded CSV: {exc}")
    # The target selector needs a column and the split needs a row on each side.
    if df.shape[1] == 0 or len(df) < 2:
        return st.error("The CSV needs at least one column and two rows to be split.")

    st.success(f"β Loaded **{uploaded.name}**")
    col1, col2 = st.columns(2)
    target = col1.selectbox("Target column (Y):", df.columns.tolist(), index=len(df.columns) - 1)
    split_pct = col2.slider("Train size:", 50, 95, 80, 5, format="%d%%")

    if st.button("β Confirm & Split", type="primary"):
        train_fraction = split_pct / 100.0
        df_train, df_test = train_test_split(df, train_size=train_fraction, random_state=42)
        st.session_state.update({
            "df_full": df,
            "df_train": df_train.reset_index(drop=True),
            "df_test": df_test.reset_index(drop=True),
            "target_col": target,
            # Keep the stored ratio in step with the split actually made.
            "split_ratio": train_fraction,
            "col_diagnostics": {},
        })
        st.success("β Split complete!")
        st.dataframe(df_train.head(), use_container_width=True)
# ────────────────────────────────────────────────────────────────────
# STEP 2 — OVERVIEW
# ────────────────────────────────────────────────────────────────────
def render_step2():
    """Render Step 2: missing-value summary, pattern heatmap and missingness correlation.

    Works on the training split stored in ``st.session_state["df_train"]``.
    Shows (1) a per-column count/percentage table, (2) a row-by-column
    missingness heatmap (at most 2000 sampled rows), and (3) the pairwise
    correlation of missing indicators, with a callout for the pair that has
    the largest absolute correlation.
    """
    st.markdown('<div class="main-title">π Step 2 β Missing Value Overview</div>', unsafe_allow_html=True)
    df = st.session_state.get("df_train")
    if df is None:
        return st.warning("β οΈ Please complete Step 1.")
    miss_cols = [c for c in df.columns if df[c].isnull().any()]
    if not miss_cols:
        return st.success("π No missing values!")

    # 0/1 indicator per cell; shared by the table, the heatmap and the
    # correlation matrix so it is computed only once.
    miss_indicator = df[miss_cols].isnull().astype(int)

    # -- Summary table --
    miss_counts = miss_indicator.sum()
    summary = pd.DataFrame({
        "Missing Count": miss_counts,
        "Missing %": (miss_counts / len(df) * 100).round(2),
    }).sort_values("Missing %", ascending=False)
    st.dataframe(
        summary.style.background_gradient(cmap="YlOrRd", subset=["Missing %"]),
        use_container_width=True,
    )
    st.markdown("---")

    # -- Missingness heatmap --
    st.markdown("### πΊοΈ Missingness Heatmap")
    st.caption("Each dark stripe = a missing value in that row. Aligned stripes across columns = rows missing together (MAR signal).")
    fig_h, ax_h = plt.subplots(figsize=(14, max(3, len(miss_cols) * 0.6)))
    fig_h.patch.set_facecolor('#f8f8f8')
    ax_h.set_facecolor('#f0f0f0')
    miss_matrix = miss_indicator
    # Subsample rows for performance if large.
    if len(miss_matrix) > 2000:
        miss_matrix = miss_matrix.sample(2000, random_state=42).reset_index(drop=True)
    # Build the two-colour map explicitly: imshow needs a Colormap object (or
    # a registered name), which a plain list of colours is not.
    binary_cmap = matplotlib.colors.ListedColormap(["#f0f0f0", "#17172b"])
    ax_h.imshow(
        miss_matrix.T.values,
        aspect='auto',
        cmap=binary_cmap,
        interpolation='none',
        # Pin the limits so 0 is always light and 1 always dark, even when
        # the (sampled) matrix happens to contain only one of the two values.
        vmin=0,
        vmax=1,
    )
    ax_h.set_yticks(range(len(miss_cols)))
    ax_h.set_yticklabels(miss_cols, fontsize=10)
    ax_h.set_xlabel("Row index (sampled)" if len(df) > 2000 else "Row index", fontsize=10)
    ax_h.set_title("Missing Value Pattern (dark = missing)", fontsize=12, fontweight='bold', pad=10)
    ax_h.spines[['top', 'right', 'bottom', 'left']].set_visible(False)
    fig_h.tight_layout()
    st.pyplot(fig_h, use_container_width=True)
    plt.close(fig_h)
    st.markdown("---")

    # -- Missingness correlation heatmap --
    st.markdown("### π Missingness Correlation")
    st.caption("Correlation between missing patterns of columns. Values near 1.0 = these columns tend to be missing in the same rows β strong MAR signal.")
    if len(miss_cols) < 2:
        st.info("Only one column with missing values β correlation requires at least two.")
        return

    corr_matrix = miss_indicator.corr()
    fig_c, ax_c = plt.subplots(figsize=(max(6, len(miss_cols) * 1.2), max(5, len(miss_cols) * 1.0)))
    fig_c.patch.set_facecolor('#f8f8f8')
    mask = np.zeros_like(corr_matrix, dtype=bool)
    mask[np.triu_indices_from(mask, k=1)] = True  # show lower triangle only
    sns.heatmap(
        corr_matrix,
        mask=mask,
        annot=True,
        fmt=".2f",
        cmap="RdYlGn",
        vmin=-1, vmax=1,
        center=0,
        ax=ax_c,
        square=True,
        linewidths=0.5,
        linecolor='white',
        annot_kws={"size": 10, "weight": "bold"},
        cbar_kws={"shrink": 0.8},
    )
    ax_c.set_title("Pairwise Missingness Correlation", fontsize=12, fontweight='bold', pad=12)
    ax_c.tick_params(axis='x', rotation=45, labelsize=10)
    ax_c.tick_params(axis='y', rotation=0, labelsize=10)
    fig_c.tight_layout()
    st.pyplot(fig_c, use_container_width=True)
    plt.close(fig_c)

    # Interpretation callout for the pair with the largest |correlation|.
    # NOTE(review): the pair is chosen by absolute value but the thresholds
    # below use the signed value, so a strongly negative pair is reported
    # through the "low correlation" card — confirm this is intended.
    max_corr_pair = None
    max_val = 0
    for i in range(len(miss_cols)):
        for j in range(i):
            val = abs(corr_matrix.iloc[i, j])
            if val > max_val:
                max_val = val
                max_corr_pair = (miss_cols[i], miss_cols[j], corr_matrix.iloc[i, j])
    if max_corr_pair:
        c1, c2, v = max_corr_pair
        if v >= 0.9:
            st.markdown(f'<div class="card-danger">π¨ <b>Very high missingness correlation ({v:.2f})</b> between <code>{c1}</code> and <code>{c2}</code> β these rows go missing together. Strong MAR signal; consider joint imputation (KNN/MICE).</div>', unsafe_allow_html=True)
        elif v >= 0.5:
            st.markdown(f'<div class="card-warn">β οΈ <b>Moderate missingness correlation ({v:.2f})</b> between <code>{c1}</code> and <code>{c2}</code> β partial co-occurrence of missingness detected.</div>', unsafe_allow_html=True)
        else:
            st.markdown(f'<div class="card-ok">β <b>Low missingness correlation (max {v:.2f})</b> β columns appear to be missing independently.</div>', unsafe_allow_html=True)
# ────────────────────────────────────────────────────────────────────
# STEP 3 — DIAGNOSTICS
# ────────────────────────────────────────────────────────────────────
def render_step3():
    """Render Step 3: run and display the per-column missingness diagnostics.

    Lets the user diagnose one selected column or every column with missing
    values, then shows, for the selected column, the mechanism verdict card
    and one table per test plus a summary of the decision rules.
    """
    st.markdown('<div class="main-title">π§ͺ Step 3 β Per-Column Diagnostics</div>', unsafe_allow_html=True)
    df = st.session_state.get("df_train")
    target = st.session_state.get("target_col")
    if df is None:
        return st.warning("β οΈ Please complete Step 1.")
    miss_cols = [name for name in df.columns if df[name].isnull().any()]
    if not miss_cols:
        return st.success("π No missing values.")

    left, right = st.columns([1, 4])
    selected_col = left.selectbox("Select column to view:", miss_cols)
    run_single = left.button("βΆ Run Diagnostics")
    run_all = right.button("βΆ Run ALL columns", type="primary")

    if run_single:
        run_single_diagnostic(df, selected_col, target)
    if run_all:
        total = len(miss_cols)
        progress = st.progress(0, text="Running diagnostics...")
        for done, name in enumerate(miss_cols, start=1):
            run_single_diagnostic(df, name, target)
            progress.progress(done / total, text=f"Diagnosing: {name}")
        progress.empty()
        st.success(f"β Diagnosed {total} columns.")

    # Nothing to show until the selected column has been diagnosed.
    if selected_col not in st.session_state["col_diagnostics"]:
        return

    res = st.session_state["col_diagnostics"][selected_col]
    little = res["little"]
    t_feat = res["t_feat"]
    t_target = res["t_target"]
    st.markdown("---")

    # -- Mechanism verdict card --
    verdict_styles = {
        "MCAR": ("card-mcar", "π’"),
        "MAR": ("card-mar", "π "),
        "MNAR": ("card-mnar", "π΄"),
    }
    card_class, emoji = verdict_styles[res["mechanism"]]
    st.markdown(
        f'<div class="{card_class}">'
        f'<div class="verdict-label">{emoji} Mechanism: {res["mechanism"]} β {res["confidence"]} Confidence</div>'
        f'<div class="verdict-desc">{res["explanation"]}</div>'
        f'<div class="verdict-desc" style="margin-top:6px">Missing: <b>{res["miss_pct"]}%</b> | dtype: <b>{res["dtype"]}</b></div>'
        f'</div>',
        unsafe_allow_html=True,
    )

    # -- TEST 1: Little's MCAR --
    st.markdown('<div class="test-header">π¬ Test 1 β Little\'s MCAR Test</div>', unsafe_allow_html=True)
    with st.expander("βΉοΈ What does this test measure?", expanded=False):
        st.markdown("""
        **Little's MCAR test** checks if missingness is completely random.
        - **Hβ (null):** Data is Missing Completely At Random (MCAR)
        - **p β₯ 0.05:** Fail to reject β data may be MCAR
        - **p < 0.05:** Reject β systematic missingness detected
        """)
    if little.get("reject_mcar"):
        reject_label = "β Yes β systematic"
    else:
        reject_label = "β No β may be MCAR"
    little_table = pd.DataFrame([{
        "Test": "Little's MCAR",
        "ΟΒ² Statistic": little.get("chi2", "N/A"),
        "Degrees of Freedom": little.get("df", "N/A"),
        "p-value": little.get("p_value", "N/A"),
        "Verdict": little.get("verdict", "N/A"),
        "Reject MCAR?": reject_label,
    }])
    st.dataframe(little_table, use_container_width=True, hide_index=True)

    # -- TEST 2: Target dependency --
    st.markdown('<div class="test-header">π― Test 2 β Target Dependency Test</div>', unsafe_allow_html=True)
    with st.expander("βΉοΈ What does this test measure?", expanded=False):
        st.markdown("""
        Tests if the **target variable** has different values when this column is missing vs. observed.
        - **Numeric target:** z-test or Welch t-test
        - **Categorical target:** Chi-squared test
        - **Significant (p<0.05) + large diff % β MNAR** (missingness depends on outcome)
        """)
    if t_target.get("diff_pct") is not None:
        target_diff_label = f'{t_target.get("diff_pct", 0):.1f}%'
    else:
        target_diff_label = "N/A"
    target_table = pd.DataFrame([{
        "Test Applied": "z-test / Welch t-test / ChiΒ²",
        "p-value": t_target.get("p_value", "N/A"),
        "Target Diff %": target_diff_label,
        "Significant (p<0.05)?": "β Yes" if t_target.get("significant") else "β No",
        "Interpretation": t_target.get("signal", "N/A"),
    }])
    st.dataframe(target_table, use_container_width=True, hide_index=True)

    # -- TEST 3: Feature dependency --
    st.markdown('<div class="test-header">π Test 3 β Feature Dependency Tests</div>', unsafe_allow_html=True)
    with st.expander("βΉοΈ What does this test measure?", expanded=False):
        st.markdown("""
        For each other feature, tests if values differ **significantly** between rows where this column is missing vs. observed.
        - **Numeric features:** z-test (nβ₯30) or Welch t-test
        - **Categorical features:** Chi-squared test
        - **Many significant features (>30%) β MAR** (missingness explained by observed data)
        """)
    # Summary row first
    tested_box, significant_box, share_box = st.columns(3)
    tested_box.metric("Features Tested", t_feat.get("total_tested", 0))
    significant_box.metric("Significant (p<0.05)", t_feat.get("n_significant", 0))
    share_box.metric("% Significant", f'{t_feat.get("sig_pct", 0):.1f}%')

    if t_feat["results"]:
        feature_rows = [
            {
                "Feature": feature,
                "Data Type": outcome["type"].capitalize(),
                "Test Used": outcome["test"],
                "Test Statistic": outcome["stat"],
                "p-value": outcome["p_value"],
                "p < 0.05?": "β Significant" if outcome["significant"] else "β",
            }
            for feature, outcome in t_feat["results"].items()
        ]
        feat_df = pd.DataFrame(feature_rows).sort_values("p-value")

        def highlight_sig(row):
            # Tint every cell of a row whose test came out significant.
            width = len(row)
            if row["p < 0.05?"] == "β Significant":
                return ["background-color:#ffe4e1; color:#900000"] * width
            return [""] * width

        st.dataframe(
            feat_df.style.apply(highlight_sig, axis=1),
            use_container_width=True,
            hide_index=True,
        )
    else:
        st.info("No feature dependency results available (insufficient data or no other columns).")

    # -- Decision logic summary --
    st.markdown('<div class="test-header">π§ Decision Logic Summary</div>', unsafe_allow_html=True)
    rule_checks = [
        ("Little's test rejects MCAR?", "β Yes" if little.get("reject_mcar") else "β No"),
        ("Target differs significantly?", "β Yes" if t_target.get("significant") else "β No"),
        ("Target diff magnitude", f'{t_target.get("diff_pct", 0):.1f}% difference'),
        ("% of features with significant diff", f'{t_feat.get("sig_pct", 0):.1f}%'),
        ("β Final Mechanism", f'{res["mechanism"]} ({res["confidence"]} confidence)'),
    ]
    logic_rows = [{"Rule Check": rule, "Result": outcome} for rule, outcome in rule_checks]
    st.dataframe(pd.DataFrame(logic_rows), use_container_width=True, hide_index=True)
# ────────────────────────────────────────────────────────────────────
# STEP 4 — FEASIBILITY GATE (Interactive)
# ────────────────────────────────────────────────────────────────────
def _step4_stat_box(css_class, value, label, sub):
    """Return the HTML for one headline tile of the Step 4 statistics banner."""
    return (
        f'<div class="big-stat-box {css_class}">'
        f'<div class="big-stat-val">{value}</div>'
        f'<div class="big-stat-lbl">{label}</div>'
        f'<div class="big-stat-sub">{sub}</div>'
        f'</div>'
    )


def _step4_draw_distributions(axes, series, imputed, selected_col, impute_choice,
                              miss_pct_col, var_change, new_out):
    """Draw the before/after density plot and box plot onto the two given axes.

    ``series`` holds the observed (non-missing) values, ``imputed`` the column
    after imputation. ``var_change`` is the signed variance change in percent
    (negative means the variance shrank) and is only used for the title.
    """
    from scipy.stats import gaussian_kde

    # -- Plot 1: overlapping KDE --
    ax = axes[0]
    ax.set_facecolor('#f8f8f8')
    try:
        # Evaluate BOTH densities before drawing anything, so that a failure on
        # the second one cannot leave a half-drawn KDE underneath the fallback.
        x_range = np.linspace(min(series.min(), imputed.min()), max(series.max(), imputed.max()), 300)
        dens_orig = gaussian_kde(series.values, bw_method='scott')(x_range)
        dens_imp = gaussian_kde(imputed.values, bw_method='scott')(x_range)
    except Exception:
        # Deliberate best-effort: if a KDE cannot be fitted, show histograms instead.
        ax.hist(series.values, bins=25, alpha=0.5, color='#17172b', label='Original', density=True)
        ax.hist(imputed.values, bins=25, alpha=0.4, color='#d6336c', label=f'After {impute_choice}', density=True)
    else:
        ax.fill_between(x_range, dens_orig, alpha=0.35, color='#17172b', label='Original (observed only)')
        ax.plot(x_range, dens_orig, color='#17172b', lw=2.5)
        ax.fill_between(x_range, dens_imp, alpha=0.35, color='#d6336c', label=f'After {impute_choice}')
        ax.plot(x_range, dens_imp, color='#d6336c', lw=2.5, linestyle='--')
    ax.set_title(f'KDE: Original vs After {impute_choice}\n({miss_pct_col:.1f}% was missing)', fontsize=13, fontweight='bold', pad=12)
    ax.set_xlabel(selected_col, fontsize=11)
    ax.set_ylabel('Density', fontsize=11)
    ax.legend(fontsize=10)
    ax.grid(axis='y', alpha=0.3)
    ax.spines[['top', 'right']].set_visible(False)

    # -- Plot 2: box plots side by side --
    ax2 = axes[1]
    ax2.set_facecolor('#f8f8f8')
    bp = ax2.boxplot(
        [series.values, imputed.values],
        patch_artist=True,
        widths=0.5,
        medianprops=dict(color='#d6336c', linewidth=2.5),
        flierprops=dict(marker='o', markerfacecolor='#d6336c', markersize=5, alpha=0.5),
        whiskerprops=dict(linewidth=1.5),
        capprops=dict(linewidth=1.5),
    )
    # Label the ticks afterwards instead of passing `labels=` to boxplot: that
    # keyword is deprecated in Matplotlib 3.9, while this works on old and new versions.
    ax2.set_xticklabels(['Original\n(non-missing)', f'After\n{impute_choice}'])
    bp['boxes'][0].set_facecolor('#c8d8f0')
    bp['boxes'][1].set_facecolor('#f5c6d0')
    ax2.set_title(
        f'Spread & Outliers\nVariance Change: {var_change:+.1f}% | New Outliers: +{new_out}',
        fontsize=13, fontweight='bold', pad=12
    )
    ax2.set_ylabel('Value', fontsize=11)
    ax2.grid(axis='y', alpha=0.3)
    ax2.spines[['top', 'right']].set_visible(False)


def render_step4():
    """Render Step 4: simulate one imputation method on one numeric column.

    Reads ``df_train``, ``target_col`` and ``col_diagnostics`` from
    ``st.session_state``. When the button is pressed it calls
    ``feasibility_checks`` and shows a four-tile statistics banner, a
    KDE / box-plot comparison and the per-feature correlation shifts.

    Returns the Streamlit element of the message when it exits early
    (missing diagnostics, no numeric columns, column not applicable);
    otherwise returns ``None``.
    """
    st.markdown('<div class="main-title">βοΈ Step 4 β Imputation Feasibility Gate</div>', unsafe_allow_html=True)
    with st.expander("π Theory & Guide: Why test imputation mathematically? (Click to expand)"):
        st.markdown("""
<div class="theory-box">
<h4>Why test imputation mathematically?</h4>
<p>Single-value imputations (like filling blanks with Mean or Median) are dangerous if overused. They can:</p>
<ul>
<li><b>Collapse Variance:</b> If you fill 20% of the data with the same number, the spread of your data shrinks unnaturally.</li>
<li><b>Create Artificial Outliers:</b> Because the variance (IQR) shrank, real valid data points at the edges suddenly look like outliers!</li>
<li><b>Destroy Correlation:</b> Assigning a median weight to someone without considering their height breaks the natural relationship between features.</li>
</ul>
<p><b>KNN and MICE</b> solve this by acting like mini machine-learning models β they look at other features to make an educated guess, preserving variance and correlations.</p>
</div>
""", unsafe_allow_html=True)

    df, target = st.session_state.get("df_train"), st.session_state.get("target_col")
    col_diag = st.session_state.get("col_diagnostics", {})
    # Also bail out when the training frame itself is missing, otherwise the
    # numeric filter below would fail on `df[c]`.
    if df is None or not col_diag:
        return st.warning("β οΈ Please run diagnostics in Step 3 first.")

    numeric_diag = {
        c: v for c, v in col_diag.items()
        if c in df.columns and pd.api.types.is_numeric_dtype(df[c])
    }
    if not numeric_diag:
        return st.info("No numeric columns available.")

    col1, col2 = st.columns([1, 2])
    selected_col = col1.selectbox("Select numeric column:", list(numeric_diag.keys()))
    impute_choice = col2.radio("Simulate impact of:", ["Mean", "Median", "KNN", "MICE"], horizontal=True)

    if not st.button(f"βΆ Simulate {impute_choice} Imputation", type="primary"):
        return None

    with st.spinner(f"Running {impute_choice} simulation (may take a moment for KNN/MICE)..."):
        feas = feasibility_checks(df, selected_col, target, impute_choice)
    if not feas.get("applicable"):
        return st.error("Column not applicable for numeric feasibility checks.")

    ICONS = {"ok": "β ", "warn": "β οΈ", "fail": "β"}
    COLORS = {"ok": "stat-ok", "warn": "stat-warn", "fail": "stat-fail"}

    # -- Big stats banner --
    st.markdown("### π Imputation Impact β Key Statistics")
    m1, m2, m3, m4 = st.columns(4)
    var_pct = feas["variance"]["var_drop_pct"]
    var_verd = feas["variance"]["verdict"]
    new_out = feas["outliers"]["new_outliers"]
    out_verd = feas["outliers"]["verdict"]
    corr_verd = feas["correlation"]["verdict"]
    corr_max = feas["correlation"]["max_shift"]
    skew_val = feas["skewness"]["value"]
    skew_verd = feas["skewness"]["verdict"]

    # `var_drop_pct` is a drop, so the signed *change* is its negation. Formatting
    # with an explicit sign keeps the banner and the plot title consistent and
    # avoids "--x%" when the variance actually grew.
    var_change = 0.0 - var_pct

    var_text = "Safe" if var_verd == "ok" else "Caution" if var_verd == "warn" else "High Risk"
    skew_text = "Low" if abs(skew_val) <= 1 else "Moderate" if abs(skew_val) <= 3 else "High"

    m1.markdown(
        _step4_stat_box(
            COLORS[var_verd],
            f'{var_change:+.1f}%',
            'Variance Change',
            f'{ICONS[var_verd]} {var_text}',
        ),
        unsafe_allow_html=True
    )
    m2.markdown(
        _step4_stat_box(
            COLORS[out_verd],
            f'+{new_out}',
            'New Outliers Created',
            f'{ICONS[out_verd]} Before: {feas["outliers"]["outliers_before"]} β After: {feas["outliers"]["outliers_after"]}',
        ),
        unsafe_allow_html=True
    )
    m3.markdown(
        _step4_stat_box(
            COLORS[corr_verd],
            f'Ξ{corr_max:.3f}',
            'Max Corr. Shift',
            f'{ICONS[corr_verd]} {corr_verd.capitalize()}',
        ),
        unsafe_allow_html=True
    )
    m4.markdown(
        _step4_stat_box(
            COLORS[skew_verd],
            f'{skew_val:.3f}',
            'Skewness',
            f'{ICONS[skew_verd]} {skew_text} skew',
        ),
        unsafe_allow_html=True
    )
    st.markdown("---")

    # -- KDE and box plots: two separate charts --
    st.markdown("### π Distribution Comparison (KDE)")
    series = df[selected_col].dropna()
    imputed = feas["imputed_series"]
    miss_pct_col = df[selected_col].isnull().mean() * 100

    fig, axes = plt.subplots(1, 2, figsize=(16, 5))
    try:
        fig.patch.set_facecolor('#fafafa')
        _step4_draw_distributions(
            axes, series, imputed, selected_col, impute_choice,
            miss_pct_col, var_change, new_out,
        )
        fig.tight_layout(pad=2.5)
        st.pyplot(fig, use_container_width=True)
    finally:
        # Close this specific figure, even if drawing or rendering failed.
        plt.close(fig)

    # -- Correlation details --
    st.markdown("---")
    st.markdown("#### π Correlation Preservation Details")
    st.markdown(f'<div class="card-{"ok" if corr_verd=="ok" else "warn" if corr_verd=="warn" else "danger"}">{ICONS[corr_verd]} <b>{feas["correlation"]["msg"]}</b></div>', unsafe_allow_html=True)
    if feas["correlation"]["details"]:
        rows = [{
            "Feature": f,
            "r (before)": r["r_before"],
            "r (after)": r["r_after"],
            "Ξ (shift)": r["delta"],
            "Sign Flip?": "π¨ YES" if r["sign_flip"] else "No"
        } for f, r in feas["correlation"]["details"].items()]
        corr_df = pd.DataFrame(rows).sort_values("Ξ (shift)", ascending=False)

        def highlight_corr(row):
            # Sign flips get the strongest highlight, then shifts above 0.10.
            if row["Sign Flip?"] == "π¨ YES":
                return ["background-color:#fde8e8; color:#900000"] * len(row)
            if row["Ξ (shift)"] > 0.10:
                return ["background-color:#fff0ed; color:#900000"] * len(row)
            return [""] * len(row)

        st.dataframe(corr_df.style.apply(highlight_corr, axis=1), use_container_width=True, hide_index=True)
    return None
# ════════════════════════════════════════════════════════════════════
# STEP 5 — FINAL REPORT
# ════════════════════════════════════════════════════════════════════
def render_step5():
    """Render Step 5: the per-column report table and mechanism counts.

    Reads ``df_train``, ``target_col`` and ``col_diagnostics`` from
    ``st.session_state``. For every diagnosed column it asks
    ``get_auto_recommendation`` for a strategy and lists the columns from the
    highest to the lowest missing percentage, with each row coloured by its
    missingness mechanism. Below the table it shows how many columns were
    classified as MCAR, MAR and MNAR.

    Returns the Streamlit warning element when no diagnostics are stored;
    otherwise returns ``None``.
    """
    st.markdown('<div class="main-title">π Step 5 β Final Diagnostic Report</div>', unsafe_allow_html=True)
    df, target = st.session_state.get("df_train"), st.session_state.get("target_col")
    col_diag = st.session_state.get("col_diagnostics", {})
    if not col_diag:
        return st.warning("β οΈ Run diagnostics in Step 3 first.")

    # -- Legend --
    with st.expander("π How to read the Recommended Strategy column"):
        st.markdown("""
| Label | Meaning |
|-------|---------|
| **Drop Rows** | MCAR + <5% missing β safe to delete affected rows |
| **Drop Column** | >70% missing β too little data to impute reliably |
| **Mean Imputation** | Low-skew numeric, variance loss is acceptable |
| **Median Imputation** | Skewed numeric; median is more robust than mean |
| **Mode Imputation** | Categorical / non-numeric columns |
| **KNN Imputer** | Moderate missingness; feature relationships preserved |
| **MICE Imputer** | High missingness (>30%); multiple-imputation approach |
| **+ Missing Indicator** | Added when mechanism is MNAR, or MAR β₯ 10% missing β add a binary flag column `col_missing` alongside imputed values |
""")

    # Order by the NUMERIC missing percentage before it is formatted for display.
    # Sorting the formatted "x.x%" strings compares them character by character,
    # which would rank "9.5%" above "10.2%".
    ranked = sorted(col_diag.items(), key=lambda item: item[1]["miss_pct"], reverse=True)
    table_rows = [
        {
            "Column": col,
            "dtype": res["dtype"],
            "Missing %": f'{res["miss_pct"]:.1f}%',
            "Mechanism": res["mechanism"],
            "Confidence": res["confidence"],
            "Recommended Strategy": get_auto_recommendation(
                df, col, target, res["mechanism"], res["miss_pct"], res["dtype"]
            ),
        }
        for col, res in ranked
    ]
    report_df = pd.DataFrame(table_rows)

    def color_rows(row):
        # One background colour per mechanism; unknown mechanisms stay unstyled.
        mech_colors = {
            "MNAR": "background-color:#fff0ed; color:#000",
            "MAR": "background-color:#fffaeb; color:#000",
            "MCAR": "background-color:#edfaf3; color:#000"
        }
        return [mech_colors.get(row["Mechanism"], "")] * len(row)

    st.dataframe(
        report_df.style.apply(color_rows, axis=1),
        use_container_width=True,
        hide_index=True
    )

    # -- Summary counts --
    st.markdown("---")
    c1, c2, c3 = st.columns(3)
    mechanisms = [r["mechanism"] for r in col_diag.values()]
    mcar_n = mechanisms.count("MCAR")
    mar_n = mechanisms.count("MAR")
    mnar_n = mechanisms.count("MNAR")
    c1.markdown(f'<div class="metric-box"><div class="metric-val" style="color:#0a5c30">π’ {mcar_n}</div><div class="metric-lbl">MCAR columns</div></div>', unsafe_allow_html=True)
    c2.markdown(f'<div class="metric-box"><div class="metric-val" style="color:#7a4f00">π {mar_n}</div><div class="metric-lbl">MAR columns</div></div>', unsafe_allow_html=True)
    c3.markdown(f'<div class="metric-box"><div class="metric-val" style="color:#900000">π΄ {mnar_n}</div><div class="metric-lbl">MNAR columns</div></div>', unsafe_allow_html=True)
    return None
# Dispatch the selected `step` to its page renderer. The renderers are listed
# in the same order as the entries of STEPS; the first matching entry wins and
# nothing is rendered when `step` matches none of the first five entries.
_STEP_RENDERERS = (render_step1, render_step2, render_step3, render_step4, render_step5)
for _step_idx, _render_page in enumerate(_STEP_RENDERERS):
    if step == STEPS[_step_idx]:
        _render_page()
        break