# tools/csv_parser.py
# ------------------------------------------------------------
# Reads a CSV / Excel file (sampling ultra‑large CSVs), then
# returns a Markdown report:
#   ▸ dimensions         ▸ schema & dtypes
#   ▸ missing‑value map  ▸ numeric describe()
#   ▸ memory footprint
# If the optional dependency **tabulate** is unavailable,
# it falls back to a plain‑text table wrapped in Markdown
# code fences, so no ImportError ever reaches the UI.

from __future__ import annotations

import os
from typing import Union

import numpy as np
import pandas as pd


# ╭──────────────────────────────────────────────────────────╮
# │  Helper: efficient reader with sampling for huge CSVs    │
# ╰──────────────────────────────────────────────────────────╯
def _safe_read(path: Union[str, bytes], sample_rows: int = 1_000_000) -> pd.DataFrame:
    """Load a CSV or Excel source into a DataFrame, sampling oversized CSVs.

    Args:
        path: Source to read.  A ``str`` is treated as a filesystem path whose
            extension selects the reader (``.xls`` / ``.xlsx`` -> Excel,
            anything else -> CSV).  Any non-``str`` value is handed to
            ``pandas.read_csv`` unchanged and is never sampled.
        sample_rows: Maximum number of data rows (header excluded) to load
            from a CSV path.  When the file holds more, a seeded uniform
            sample without replacement of this many rows is read instead.

    Returns:
        The loaded (possibly sampled) DataFrame.  Sampled rows keep their
        original file order.

    Raises:
        Any exception raised by ``open`` or by the underlying pandas reader
        (missing file, empty file, missing Excel engine, ...).
    """
    is_str = isinstance(path, str)
    ext = os.path.splitext(path)[1].lower() if is_str else ".csv"

    if ext in (".xls", ".xlsx"):
        # openpyxl cannot open legacy .xls workbooks, so pin it only for
        # .xlsx and let pandas choose the engine for .xls.
        engine = "openpyxl" if ext == ".xlsx" else None
        return pd.read_excel(path, engine=engine)

    # --- CSV branch --------------------------------------------------------
    if not is_str:
        # No filesystem path to pre-scan, so the row count is unknown.
        return pd.read_csv(path)

    # Count physical lines by streaming the file in binary mode; nothing is
    # parsed or kept in memory.  NOTE: a quoted field containing embedded
    # newlines spans several physical lines and would be over-counted.
    with open(path, "rb") as fh:
        n_lines = sum(1 for _ in fh)

    # Line 0 is the header, so only n_lines - 1 lines carry data.
    n_data = n_lines - 1
    if n_data <= sample_rows:
        return pd.read_csv(path)

    # Choose which data lines to drop so that exactly `sample_rows` remain.
    # The header (line 0) is excluded from the candidates, and the fixed seed
    # keeps the sample reproducible across calls.
    rng = np.random.default_rng(seed=42)
    skip = rng.choice(
        np.arange(1, n_lines), size=n_data - sample_rows, replace=False
    )
    return pd.read_csv(path, skiprows=np.sort(skip).tolist())


# ╭──────────────────────────────────────────────────────────╮
# │               Main public helper                         │
# ╰──────────────────────────────────────────────────────────╯
def parse_csv_tool(path: Union[str, bytes]) -> str:
    """
    Build a Markdown summary of a tabular file for rendering in the UI.

    The data is loaded through ``_safe_read``.  If loading raises, a single
    line error message is returned instead of a report.

    Report layout:
    • Overview table (row count, column count, deep memory usage in MB)
    • Schema: one bullet per column with its dtype
    • Missing values: count and percentage for every column that has any,
      or the word "None"
    • ``describe()`` of the numeric columns, as a Markdown table when
      ``to_markdown`` is usable, otherwise as fenced plain text
    """
    try:
        frame = _safe_read(path)
    except Exception as exc:
        return f"❌ Failed to load data: {exc}"

    n_rows, n_cols = frame.shape
    megabytes = frame.memory_usage(deep=True).sum() / (1024 * 1024)

    # ── Schema -------------------------------------------------------------
    schema_lines = [
        f"- **{name}** – `{kind}`" for name, kind in frame.dtypes.items()
    ]
    schema_section = "\n".join(schema_lines)

    # ── Missing map --------------------------------------------------------
    na_counts = frame.isna().sum()
    na_share = (na_counts / len(frame) * 100).round(1)
    missing_lines = []
    for name in frame.columns:
        if na_counts[name] > 0:
            missing_lines.append(
                f"- **{name}**: {na_counts[name]} ({na_share[name]} %)"
            )
    # An empty list joins to "", which falls through to the "None" marker.
    missing_section = "\n".join(missing_lines) or "None"

    # ── Numeric describe() -------------------------------------------------
    numbers = frame.select_dtypes("number")
    if numbers.empty:
        stats_section = "_No numeric columns_"
    else:
        # One summary table feeds both the Markdown and plain-text renderers.
        summary = numbers.describe().T.round(2)
        try:
            # to_markdown() needs the optional 'tabulate' package
            stats_section = summary.to_markdown()
        except ImportError:
            # dependency-free fallback: fixed-width text inside a code fence
            stats_section = "```text\n" + summary.to_string() + "\n```"

    # ── Assemble markdown --------------------------------------------------
    report_lines = [
        "# 📊 Dataset Overview",
        "",
        "| metric | value |",
        "| ------ | ----- |",
        f"| Rows   | {n_rows:,} |",
        f"| Columns| {n_cols} |",
        f"| Memory | {megabytes:.2f} MB |",
        "",
        "## 🗂 Schema & Dtypes",
        schema_section,
        "",
        "## 🛠 Missing Values",
        missing_section,
        "",
        "## 📈 Descriptive Statistics (numeric)",
        stats_section,
    ]
    return "\n".join(report_lines).strip()