from __future__ import annotations import re, math, unicodedata import pandas as pd import numpy as np try: import phonenumbers HAVE_PHONENUM = True except Exception: HAVE_PHONENUM = False def _norm_colname(c: str) -> str: c = c.strip().lower() c = re.sub(r"\s+", "_", c) c = re.sub(r"[^\w]+", "_", c) return c.strip("_") def _nfkc(s: str) -> str: return unicodedata.normalize("NFKC", s) def _collapse_ws(s: str) -> str: return re.sub(r"\s+", " ", s).strip() def _clean_str(x): if pd.isna(x): return x return _collapse_ws(_nfkc(str(x))) def _is_email(s: str) -> bool: return bool(re.match(r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$", s or "")) def _clean_phone(s: str, default_region: str = "IN"): if s is None or str(s).strip() == "": return None, "missing_phone" raw = re.sub(r"[^\d+]", "", str(s)) if HAVE_PHONENUM: try: pn = phonenumbers.parse(raw, default_region) if phonenumbers.is_possible_number(pn) and phonenumbers.is_valid_number(pn): return phonenumbers.format_number(pn, phonenumbers.PhoneNumberFormat.E164), None return raw, "invalid_phone" except Exception: return raw, "invalid_phone" digits = re.sub(r"\D", "", raw) return (digits, None) if 8 <= len(digits) <= 15 else (digits, "invalid_phone") def _parse_datetime(s): try: return pd.to_datetime(s, errors="coerce", utc=True) except Exception: return pd.NaT def _to_numeric(series: pd.Series): coerced = pd.to_numeric(series, errors="coerce") return coerced, (coerced.isna() & series.notna()) def _read_csv_any(file_obj) -> pd.DataFrame: if file_obj is None: raise ValueError("No file uploaded.") if hasattr(file_obj, "name"): p = file_obj.name try: return pd.read_csv(p) except Exception: return pd.read_csv(p, encoding="latin-1") try: return pd.read_csv(file_obj) except Exception: file_obj.seek(0) return pd.read_csv(file_obj, encoding="latin-1") def _standardize_df(df: pd.DataFrame) -> pd.DataFrame: df = df.copy() df.columns = [_norm_colname(c) for c in df.columns] for c in df.select_dtypes(include=["object"]).columns: df[c] = df[c].apply(_clean_str) return df def _prepare_generic(df: pd.DataFrame, expected: dict[str, list[str]]): issues = [] df0 = _standardize_df(df) colmap = {} cols = set(df0.columns) for canon, syns in expected.items(): found = None for s in [canon] + syns: s = _norm_colname(s) if s in cols: found = s; break if found: colmap[canon] = found for c in list(df0.columns): if "email" in c: df0[c] = df0[c].apply(lambda x: str(x).lower().strip() if pd.notna(x) else x) for idx, v in df0[c].items(): if pd.isna(v) or str(v).strip()=="": issues.append({"row": idx, "field": c, "issue":"missing_email","value":""}) elif not _is_email(v): issues.append({"row": idx, "field": c, "issue":"invalid_email","value":str(v)}) if "phone" in c or "mobile" in c: vals = [] for idx, v in df0[c].items(): e164, prob = _clean_phone(v) vals.append(e164) if prob: issues.append({"row": idx, "field": c, "issue":prob, "value":str(v)}) df0[c] = vals for c in df0.columns: if any(k in c for k in ["date","time","timestamp","created_at","updated_at"]): parsed = _parse_datetime(df0[c]) bad = parsed.isna() & df0[c].notna() for idx in df0.index[bad]: issues.append({"row": int(idx), "field": c, "issue":"unparseable_timestamp", "value":str(df0.loc[idx, c])}) df0[c] = parsed for nc in ["amount","credit_score","utilization","dti","recent_defaults","income"]: for c in df0.columns: if c == nc or c.endswith("_"+nc) or nc in c: coerced, badmask = _to_numeric(df0[c]) for idx in df0.index[badmask]: issues.append({"row": int(idx), "field": c, "issue":"non_numeric", "value":str(df0.loc[idx, c])}) df0[c] = coerced import pandas as pd issues_df = pd.DataFrame(issues, columns=["row","field","issue","value"]) missing = [k for k in expected.keys() if k not in colmap] quality_summary = f"Rows={len(df0)}, Cols={len(df0.columns)}; Missing required fields: {missing if missing else 'None'}" return df0, issues_df, quality_summary, colmap