mgbam committed on
Commit
7453b19
·
verified ·
1 Parent(s): ed51f1d

Update tools/csv_parser.py

Browse files
Files changed (1) hide show
  1. tools/csv_parser.py +75 -52
tools/csv_parser.py CHANGED
@@ -1,99 +1,122 @@
1
  # tools/csv_parser.py
2
  # ------------------------------------------------------------
3
- # Reads CSVโ€ฏ/โ€ฏExcel, samples for very large files, and returns a
4
- # Markdownโ€‘formatted โ€œquickโ€‘scanโ€ report: dimensions, schema,
5
- # missingโ€‘value profile, numericย describe(), and memory footprint.
 
 
 
 
 
6
 
7
  from __future__ import annotations
8
 
9
  import os
10
  from typing import Union
11
 
 
12
  import pandas as pd
13
 
14
 
15
- def _safe_read(path_or_buf: Union[str, bytes], sample_rows: int = 1_000_000) -> pd.DataFrame:
16
- """Read CSV or Excel. If the file has >ย sample_rows, read only a sample."""
17
- # Determine extension (bestโ€‘effort)
18
- ext = ".csv"
19
- if isinstance(path_or_buf, str):
20
- ext = os.path.splitext(path_or_buf)[1].lower()
 
21
 
22
  if ext in (".xls", ".xlsx"):
23
- # Excel โ€” read first sheet
24
- df = pd.read_excel(path_or_buf, engine="openpyxl")
25
- else: # CSV family
26
- # First rowโ€‘count check: pandasย 1.5+ uses memory map โ‡’ cheap for header only
27
- nrows_total = sum(1 for _ in open(path_or_buf, "rb")) if isinstance(path_or_buf, str) else None
28
- if nrows_total and nrows_total > sample_rows:
29
- # sample uniformly without loading everything
30
- skip = sorted(
31
- pd.np.random.choice(range(1, nrows_total), nrows_total - sample_rows, replace=False)
32
- )
33
- df = pd.read_csv(path_or_buf, skiprows=skip)
34
- else:
35
- df = pd.read_csv(path_or_buf)
36
 
37
- return df
 
 
 
 
38
 
 
39
 
40
- def parse_csv_tool(file: Union[str, bytes]) -> str:
 
 
 
 
41
  """
42
- Return a **Markdown** report describing the dataset.
43
 
44
  Sections:
45
  โ€ข Dimensions
46
- โ€ข Schema (+ dtypes)
47
- โ€ข Missingโ€‘value counts + %
48
- โ€ข Numeric descriptive statistics
49
  โ€ข Memory usage
50
  """
51
  try:
52
- df = _safe_read(file)
53
  except Exception as exc:
54
  return f"โŒ Failed to load data: {exc}"
55
 
56
- n_rows, n_cols = df.shape
 
57
 
58
- # ---------- schema ----------
59
  schema_md = "\n".join(
60
- f"- **{col}** โ€“ `{dtype}`"
61
- for col, dtype in df.dtypes.items()
62
  )
63
 
64
- # ---------- missing ----------
65
- miss_ct = df.isna().sum()
66
  miss_pct = (miss_ct / len(df) * 100).round(1)
67
- missing_md = "\n".join(
68
- f"- **{c}**: {miss_ct[c]}ย ({miss_pct[c]}โ€ฏ%)"
69
- for c in df.columns if miss_ct[c] > 0
70
- ) or "None"
71
-
72
- # ---------- descriptive stats (numeric only) ----------
73
- if df.select_dtypes("number").shape[1]:
74
- desc_md = df.describe().T.round(2).to_markdown()
75
- else:
76
- desc_md = "_No numeric columns_"
77
 
78
- # ---------- memory ----------
79
- mem_mb = df.memory_usage(deep=True).sum() / 1024**2
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
- # ---------- assemble ----------
82
  return f"""
83
  # ๐Ÿ“Šย Dataset Overview
84
 
85
  | metric | value |
86
  | ------ | ----- |
87
- | Rows | {n_rows:,} |
88
- | Columns| {n_cols} |
89
  | Memory | {mem_mb:.2f}ย MB |
90
 
91
- ## ๐Ÿ—‚ย Schema
92
  {schema_md}
93
 
94
  ## ๐Ÿ› ย Missing Values
95
  {missing_md}
96
 
97
- ## ๐Ÿ“ˆย Descriptiveย Statistics (numeric)
98
  {desc_md}
99
  """.strip()
 
1
  # tools/csv_parser.py
2
  # ------------------------------------------------------------
3
+ # Reads a CSVโ€ฏ/โ€ฏExcel file (sampling ultraโ€‘large CSVs), then
4
+ # returns a Markdown report:
5
+ # โ–ธ dimensions โ–ธ schema & dtypes
6
+ # โ–ธ missingโ€‘value map โ–ธ numeric describe()
7
+ # โ–ธ memory footprint
8
+ # If the optional dependency **tabulate** is unavailable,
9
+ # it falls back to a plainโ€‘text table wrapped in Markdown
10
+ # code fences, so no ImportError ever reaches the UI.
11
 
12
  from __future__ import annotations
13
 
14
  import os
15
  from typing import Union
16
 
17
+ import numpy as np
18
  import pandas as pd
19
 
20
 
21
+ # โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ
22
+ # โ”‚ Helper: efficient reader with sampling for huge CSVs โ”‚
23
+ # โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ
24
+ def _safe_read(path: Union[str, bytes], sample_rows: int = 1_000_000) -> pd.DataFrame:
25
+ """Load CSV / Excel. If CSV has >โ€ฏsample_rows, read a uniform sample."""
26
+ is_str = isinstance(path, str)
27
+ ext = os.path.splitext(path)[1].lower() if is_str else ".csv"
28
 
29
  if ext in (".xls", ".xlsx"):
30
+ return pd.read_excel(path, engine="openpyxl")
31
+
32
+ # --- CSV branch --------------------------------------------------------
33
+ if is_str:
34
+ # fast line count (memoryโ€‘map); falls back to full read for nonโ€‘files
35
+ with open(path, "rb") as fh:
36
+ n_total = sum(1 for _ in fh)
37
+ else:
38
+ n_total = None
 
 
 
 
39
 
40
+ if n_total and n_total > sample_rows:
41
+ # sample without reading entire file
42
+ rng = np.random.default_rng(seed=42)
43
+ skip = sorted(rng.choice(range(1, n_total), n_total - sample_rows, replace=False))
44
+ return pd.read_csv(path, skiprows=skip)
45
 
46
+ return pd.read_csv(path)
47
 
48
+
49
+ # โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ
50
+ # โ”‚ Main public helper โ”‚
51
+ # โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ
52
+ def parse_csv_tool(path: Union[str, bytes]) -> str:
53
  """
54
+ Return a Markdown report that Streamlit can render.
55
 
56
  Sections:
57
  โ€ข Dimensions
58
+ โ€ข Schema & dtypes
59
+ โ€ข Missingโ€‘value counts (+%)
60
+ โ€ข Numeric describe()
61
  โ€ข Memory usage
62
  """
63
  try:
64
+ df = _safe_read(path)
65
  except Exception as exc:
66
  return f"โŒ Failed to load data: {exc}"
67
 
68
+ rows, cols = df.shape
69
+ mem_mb = df.memory_usage(deep=True).sum() / 1024**2
70
 
71
+ # โ”€โ”€ Schema -------------------------------------------------------------
72
  schema_md = "\n".join(
73
+ f"- **{col}** โ€“ `{dtype}`" for col, dtype in df.dtypes.items()
 
74
  )
75
 
76
+ # โ”€โ”€ Missing map --------------------------------------------------------
77
+ miss_ct = df.isna().sum()
78
  miss_pct = (miss_ct / len(df) * 100).round(1)
79
+ missing_md = (
80
+ "\n".join(
81
+ f"- **{c}**: {miss_ct[c]}ย ({miss_pct[c]}โ€ฏ%)"
82
+ for c in df.columns
83
+ if miss_ct[c] > 0
84
+ )
85
+ or "None"
86
+ )
 
 
87
 
88
+ # โ”€โ”€ Numeric describe() -------------------------------------------------
89
+ numeric_df = df.select_dtypes("number")
90
+ if numeric_df.empty:
91
+ desc_md = "_No numeric columns_"
92
+ else:
93
+ try:
94
+ # requires the optional 'tabulate' package
95
+ desc_md = numeric_df.describe().T.round(2).to_markdown()
96
+ except ImportError:
97
+ # graceful fallback without extra dependency
98
+ desc_md = (
99
+ "```text\n"
100
+ + numeric_df.describe().T.round(2).to_string()
101
+ + "\n```"
102
+ )
103
 
104
+ # โ”€โ”€ Assemble markdown --------------------------------------------------
105
  return f"""
106
  # ๐Ÿ“Šย Dataset Overview
107
 
108
  | metric | value |
109
  | ------ | ----- |
110
+ | Rows | {rows:,} |
111
+ | Columns| {cols} |
112
  | Memory | {mem_mb:.2f}ย MB |
113
 
114
+ ## ๐Ÿ—‚ย Schema & Dtypes
115
  {schema_md}
116
 
117
  ## ๐Ÿ› ย Missing Values
118
  {missing_md}
119
 
120
+ ## ๐Ÿ“ˆย Descriptive Statisticsย (numeric)
121
  {desc_md}
122
  """.strip()