Spaces:
Running
Running
Update tools/csv_parser.py
Browse files- tools/csv_parser.py +75 -52
tools/csv_parser.py
CHANGED
@@ -1,99 +1,122 @@
|
|
1 |
# tools/csv_parser.py
|
2 |
# ------------------------------------------------------------
|
3 |
-
# Reads CSVโฏ/โฏExcel
|
4 |
-
# Markdown
|
5 |
-
#
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
from __future__ import annotations
|
8 |
|
9 |
import os
|
10 |
from typing import Union
|
11 |
|
|
|
12 |
import pandas as pd
|
13 |
|
14 |
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
|
|
21 |
|
22 |
if ext in (".xls", ".xlsx"):
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
)
|
33 |
-
df = pd.read_csv(path_or_buf, skiprows=skip)
|
34 |
-
else:
|
35 |
-
df = pd.read_csv(path_or_buf)
|
36 |
|
37 |
-
|
|
|
|
|
|
|
|
|
38 |
|
|
|
39 |
|
40 |
-
|
|
|
|
|
|
|
|
|
41 |
"""
|
42 |
-
Return a
|
43 |
|
44 |
Sections:
|
45 |
โข Dimensions
|
46 |
-
โข Schema
|
47 |
-
โข Missingโvalue counts
|
48 |
-
โข Numeric
|
49 |
โข Memory usage
|
50 |
"""
|
51 |
try:
|
52 |
-
df = _safe_read(
|
53 |
except Exception as exc:
|
54 |
return f"โ Failed to load data: {exc}"
|
55 |
|
56 |
-
|
|
|
57 |
|
58 |
-
#
|
59 |
schema_md = "\n".join(
|
60 |
-
f"- **{col}** โ `{dtype}`"
|
61 |
-
for col, dtype in df.dtypes.items()
|
62 |
)
|
63 |
|
64 |
-
#
|
65 |
-
miss_ct
|
66 |
miss_pct = (miss_ct / len(df) * 100).round(1)
|
67 |
-
missing_md =
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
else:
|
76 |
-
desc_md = "_No numeric columns_"
|
77 |
|
78 |
-
#
|
79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
|
81 |
-
#
|
82 |
return f"""
|
83 |
# ๐ย Dataset Overview
|
84 |
|
85 |
| metric | value |
|
86 |
| ------ | ----- |
|
87 |
-
| Rows | {
|
88 |
-
| Columns| {
|
89 |
| Memory | {mem_mb:.2f}ย MB |
|
90 |
|
91 |
-
## ๐ย Schema
|
92 |
{schema_md}
|
93 |
|
94 |
## ๐ ย Missing Values
|
95 |
{missing_md}
|
96 |
|
97 |
-
## ๐ย Descriptiveย
|
98 |
{desc_md}
|
99 |
""".strip()
|
|
|
1 |
# tools/csv_parser.py
|
2 |
# ------------------------------------------------------------
|
3 |
+
# Reads a CSV / Excel file (sampling ultra-large CSVs), then
|
4 |
+
# returns a Markdown report:
|
5 |
+
# ▸ dimensions ▸ schema & dtypes
|
6 |
+
# ▸ missing-value map ▸ numeric describe()
|
7 |
+
# ▸ memory footprint
|
8 |
+
# If the optional dependency **tabulate** is unavailable,
|
9 |
+
# it falls back to a plain-text table wrapped in Markdown
|
10 |
+
# code fences, so no ImportError ever reaches the UI.
|
11 |
|
12 |
from __future__ import annotations
|
13 |
|
14 |
import os
|
15 |
from typing import Union
|
16 |
|
17 |
+
import numpy as np
|
18 |
import pandas as pd
|
19 |
|
20 |
|
21 |
+
# ╭──────────────────────────────────────────────────────────╮
|
22 |
+
# │ Helper: efficient reader with sampling for huge CSVs     │
|
23 |
+
# ╰──────────────────────────────────────────────────────────╯
|
24 |
+
def _safe_read(path: Union[str, bytes], sample_rows: int = 1_000_000) -> pd.DataFrame:
    """Load a CSV or Excel source into a DataFrame, sampling very large CSVs.

    Args:
        path: Filesystem path (``str``) or the raw content of a CSV document
            (``bytes``). Any other object is handed to ``pd.read_csv``
            unchanged, so file-like buffers keep working.
        sample_rows: Maximum number of data rows kept when a CSV is given by
            path. Files with more data rows are down-sampled to exactly this
            many rows using a fixed seed, so repeated calls on the same file
            return the same sample.

    Returns:
        The loaded (possibly sampled) DataFrame.

    Raises:
        OSError: If a path cannot be opened for line counting.
        Exception: Any parsing error raised by ``pd.read_csv`` or
            ``pd.read_excel`` is propagated unchanged.
    """
    import io  # local import keeps this helper self-contained

    is_str = isinstance(path, str)
    ext = os.path.splitext(path)[1].lower() if is_str else ".csv"

    if ext in (".xls", ".xlsx"):
        # openpyxl only understands the XML-based .xlsx format; for legacy
        # .xls let pandas pick a suitable engine instead of failing outright.
        engine = "openpyxl" if ext == ".xlsx" else None
        return pd.read_excel(path, engine=engine)

    # --- CSV branch --------------------------------------------------------
    if not is_str:
        # pd.read_csv rejects a bare bytes object, so wrap it in a buffer.
        # NOTE(review): assumes bytes are file *content*, not an encoded
        # path -- confirm against callers.
        if isinstance(path, (bytes, bytearray)):
            return pd.read_csv(io.BytesIO(path))
        return pd.read_csv(path)

    # Count physical lines in one streaming pass (no full parse needed).
    # NOTE(review): a quoted field containing newlines spans several physical
    # lines, so this can over-count records for such files.
    with open(path, "rb") as fh:
        n_lines = sum(1 for _ in fh)

    n_rows = n_lines - 1  # line 0 is the header and must never be skipped
    if n_rows <= sample_rows:
        return pd.read_csv(path)

    # Choose which data lines (1..n_rows) to drop so that exactly
    # ``sample_rows`` rows survive; the fixed seed makes the sample stable.
    rng = np.random.default_rng(seed=42)
    skip = np.sort(rng.choice(n_rows, size=n_rows - sample_rows, replace=False) + 1)
    return pd.read_csv(path, skiprows=skip.tolist())
|
47 |
|
48 |
+
|
49 |
+
# ╭──────────────────────────────────────────────────────────╮
|
50 |
+
# │ Main public helper                                       │
|
51 |
+
# ╰──────────────────────────────────────────────────────────╯
|
52 |
+
def parse_csv_tool(path: Union[str, bytes]) -> str:
    """Return a Markdown report describing a CSV / Excel dataset.

    The report is meant to be rendered by Streamlit and contains:
      * dimensions (rows, columns) and in-memory size
      * schema with the dtype of every column
      * missing-value counts with percentages (only columns that have gaps)
      * ``describe()`` statistics for numeric columns

    Args:
        path: Source handed to ``_safe_read`` (a path or raw bytes).

    Returns:
        The Markdown report, or a one-line failure message if the data could
        not be loaded. Loading errors are reported as text, never raised.
    """
    try:
        df = _safe_read(path)
    except Exception as exc:  # UI boundary: surface the failure as text
        return f"❌ Failed to load data: {exc}"

    rows, cols = df.shape
    mem_mb = df.memory_usage(deep=True).sum() / 1024**2

    # -- Schema -------------------------------------------------------------
    schema_md = "\n".join(
        f"- **{col}** — `{dtype}`" for col, dtype in df.dtypes.items()
    )

    # -- Missing map --------------------------------------------------------
    miss_ct = df.isna().sum()
    miss_pct = (miss_ct / len(df) * 100).round(1)
    # Non-breaking spaces are written as escapes so they stay visible in code.
    missing_md = (
        "\n".join(
            f"- **{c}**: {miss_ct[c]}\u00a0({miss_pct[c]}\u202f%)"
            for c in df.columns
            if miss_ct[c] > 0
        )
        or "None"
    )

    # -- Numeric describe() -------------------------------------------------
    numeric_df = df.select_dtypes("number")
    # Test the column count, not ``.empty``: a frame with numeric columns but
    # zero rows is "empty" too, and must not be reported as having none.
    if numeric_df.shape[1] == 0:
        desc_md = "_No numeric columns_"
    else:
        stats = numeric_df.describe().T.round(2)
        try:
            # requires the optional 'tabulate' package
            desc_md = stats.to_markdown()
        except ImportError:
            # graceful fallback without the extra dependency
            desc_md = "```text\n" + stats.to_string() + "\n```"

    # -- Assemble markdown --------------------------------------------------
    return f"""
# 📊\u00a0Dataset Overview

| metric | value |
| ------ | ----- |
| Rows | {rows:,} |
| Columns| {cols} |
| Memory | {mem_mb:.2f}\u00a0MB |

## 🗂\u00a0Schema & Dtypes
{schema_md}

## 🛠\u00a0Missing Values
{missing_md}

## 📈\u00a0Descriptive Statistics\u00a0(numeric)
{desc_md}
""".strip()
|