# tools/csv_parser.py
# ------------------------------------------------------------
# Reads a CSV / Excel file (sampling ultra‑large CSVs), then
# returns a Markdown report:
#   ▸ dimensions            ▸ schema & dtypes
#   ▸ missing‑value map     ▸ numeric describe()
#   ▸ memory footprint
# If the optional dependency **tabulate** is unavailable,
# it falls back to a plain‑text table wrapped in Markdown
# code fences, so no ImportError ever reaches the UI.
from __future__ import annotations

import os
from typing import Union

import numpy as np
import pandas as pd


# ╭──────────────────────────────────────────────────────────╮
# │ Helper: efficient reader with sampling for huge CSVs     │
# ╰──────────────────────────────────────────────────────────╯
def _safe_read(path: Union[str, bytes], sample_rows: int = 1_000_000) -> pd.DataFrame:
    """Load a CSV / Excel source into a DataFrame.

    Args:
        path: A filesystem path (``str`` or ``os.PathLike``) or any other
            object ``pandas.read_csv`` accepts. Only filesystem paths are
            inspected for their extension and line count; everything else
            is handed to ``pandas.read_csv`` unchanged.
        sample_rows: Maximum number of data rows to load from a CSV path.
            Larger files are down-sampled with a fixed seed, so repeated
            calls on the same file return the same rows.

    Returns:
        The loaded (possibly sampled) DataFrame.

    Raises:
        Whatever ``open``, ``pandas.read_csv`` or ``pandas.read_excel``
        raise (missing file, parse errors, missing Excel engine, ...).
    """
    # Accept pathlib.Path and friends: without this they would skip both the
    # extension check and the sampling logic below.
    if isinstance(path, os.PathLike):
        path = os.fspath(path)

    is_str = isinstance(path, str)
    ext = os.path.splitext(path)[1].lower() if is_str else ".csv"

    if ext in (".xls", ".xlsx"):
        # openpyxl cannot read legacy .xls workbooks; leaving engine=None for
        # those lets pandas pick an engine suited to the format.
        engine = "openpyxl" if ext == ".xlsx" else None
        return pd.read_excel(path, engine=engine)

    # --- CSV branch --------------------------------------------------------
    if not is_str:
        # Buffers / other inputs cannot be cheaply pre-counted: read in full.
        return pd.read_csv(path)

    # Streaming count of physical lines; the file is never held in memory.
    with open(path, "rb") as fh:
        n_total = sum(1 for _ in fh)

    # Line 0 is the header, so the remaining lines are data rows.
    # NOTE: this counts physical lines; quoted fields containing newlines or
    # trailing blank lines make it an over-estimate of the true row count.
    n_rows = n_total - 1
    if n_rows <= sample_rows:
        return pd.read_csv(path)

    # Choose the data lines to drop (1-based so the header is always kept),
    # leaving exactly `sample_rows` lines for pandas to parse.
    rng = np.random.default_rng(seed=42)
    skip = rng.choice(n_rows, size=n_rows - sample_rows, replace=False) + 1
    return pd.read_csv(path, skiprows=skip.tolist())


# ╭──────────────────────────────────────────────────────────╮
# │ Main public helper                                       │
# ╰──────────────────────────────────────────────────────────╯
def parse_csv_tool(path: Union[str, bytes]) -> str:
    """
    Return a Markdown report that Streamlit can render.

    Sections:
      • Dimensions
      • Schema & dtypes
      • Missing‑value counts (+%)
      • Numeric describe()
      • Memory usage

    Args:
        path: Source passed straight to ``_safe_read``.

    Returns:
        The Markdown report, or a single-line message starting with
        "❌ Failed to load data:" when the source cannot be read. Load
        errors are reported in the return value and never raised.
    """
    try:
        df = _safe_read(path)
    except Exception as exc:  # UI boundary: surface the error as text
        return f"❌ Failed to load data: {exc}"

    rows, cols = df.shape
    mem_mb = df.memory_usage(deep=True).sum() / 1024**2

    # ── Schema -------------------------------------------------------------
    schema_md = "\n".join(
        f"- **{col}** – `{dtype}`" for col, dtype in df.dtypes.items()
    )

    # ── Missing map --------------------------------------------------------
    miss_ct = df.isna().sum()
    miss_pct = (miss_ct / len(df) * 100).round(1)
    # Only columns that actually have gaps are listed; "None" if there are none.
    missing_md = (
        "\n".join(
            f"- **{col}**: {count} ({pct} %)"
            for (col, count), pct in zip(miss_ct.items(), miss_pct)
            if count > 0
        )
        or "None"
    )

    # ── Numeric describe() -------------------------------------------------
    numeric_df = df.select_dtypes("number")
    if numeric_df.empty:
        desc_md = "_No numeric columns_"
    else:
        summary = numeric_df.describe().T.round(2)
        try:
            # requires the optional 'tabulate' package
            desc_md = summary.to_markdown()
        except ImportError:
            # graceful fallback without extra dependency
            desc_md = "```text\n" + summary.to_string() + "\n```"

    # ── Assemble markdown --------------------------------------------------
    return f"""
# 📊 Dataset Overview
| metric | value |
| ------ | ----- |
| Rows | {rows:,} |
| Columns| {cols} |
| Memory | {mem_mb:.2f} MB |

## 🗂 Schema & Dtypes
{schema_md}

## 🛠 Missing Values
{missing_md}

## 📈 Descriptive Statistics (numeric)
{desc_md}
""".strip()