mgbam committed on
Commit
7453b19
·
verified ·
1 Parent(s): ed51f1d

Update tools/csv_parser.py

Browse files
Files changed (1) hide show
  1. tools/csv_parser.py +75 -52
tools/csv_parser.py CHANGED
@@ -1,99 +1,122 @@
1
  # tools/csv_parser.py
2
  # ------------------------------------------------------------
3
- # Reads CSVโ€ฏ/โ€ฏExcel, samples for very large files, and returns a
4
- # Markdownโ€‘formatted โ€œquickโ€‘scanโ€ report: dimensions, schema,
5
- # missingโ€‘value profile, numericย describe(), and memory footprint.
 
 
 
 
 
6
 
7
  from __future__ import annotations
8
 
9
  import os
10
  from typing import Union
11
 
 
12
  import pandas as pd
13
 
14
 
15
- def _safe_read(path_or_buf: Union[str, bytes], sample_rows: int = 1_000_000) -> pd.DataFrame:
16
- """Read CSV or Excel. If the file has >ย sample_rows, read only a sample."""
17
- # Determine extension (bestโ€‘effort)
18
- ext = ".csv"
19
- if isinstance(path_or_buf, str):
20
- ext = os.path.splitext(path_or_buf)[1].lower()
 
21
 
22
  if ext in (".xls", ".xlsx"):
23
- # Excel โ€” read first sheet
24
- df = pd.read_excel(path_or_buf, engine="openpyxl")
25
- else: # CSV family
26
- # First rowโ€‘count check: pandasย 1.5+ uses memory map โ‡’ cheap for header only
27
- nrows_total = sum(1 for _ in open(path_or_buf, "rb")) if isinstance(path_or_buf, str) else None
28
- if nrows_total and nrows_total > sample_rows:
29
- # sample uniformly without loading everything
30
- skip = sorted(
31
- pd.np.random.choice(range(1, nrows_total), nrows_total - sample_rows, replace=False)
32
- )
33
- df = pd.read_csv(path_or_buf, skiprows=skip)
34
- else:
35
- df = pd.read_csv(path_or_buf)
36
 
37
- return df
 
 
 
 
38
 
 
39
 
40
- def parse_csv_tool(file: Union[str, bytes]) -> str:
 
 
 
 
41
  """
42
- Return a **Markdown** report describing the dataset.
43
 
44
  Sections:
45
  โ€ข Dimensions
46
- โ€ข Schema (+ dtypes)
47
- โ€ข Missingโ€‘value counts + %
48
- โ€ข Numeric descriptive statistics
49
  โ€ข Memory usage
50
  """
51
  try:
52
- df = _safe_read(file)
53
  except Exception as exc:
54
  return f"โŒ Failed to load data: {exc}"
55
 
56
- n_rows, n_cols = df.shape
 
57
 
58
- # ---------- schema ----------
59
  schema_md = "\n".join(
60
- f"- **{col}** โ€“ `{dtype}`"
61
- for col, dtype in df.dtypes.items()
62
  )
63
 
64
- # ---------- missing ----------
65
- miss_ct = df.isna().sum()
66
  miss_pct = (miss_ct / len(df) * 100).round(1)
67
- missing_md = "\n".join(
68
- f"- **{c}**: {miss_ct[c]}ย ({miss_pct[c]}โ€ฏ%)"
69
- for c in df.columns if miss_ct[c] > 0
70
- ) or "None"
71
-
72
- # ---------- descriptive stats (numeric only) ----------
73
- if df.select_dtypes("number").shape[1]:
74
- desc_md = df.describe().T.round(2).to_markdown()
75
- else:
76
- desc_md = "_No numeric columns_"
77
 
78
- # ---------- memory ----------
79
- mem_mb = df.memory_usage(deep=True).sum() / 1024**2
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
- # ---------- assemble ----------
82
  return f"""
83
  # ๐Ÿ“Šย Dataset Overview
84
 
85
  | metric | value |
86
  | ------ | ----- |
87
- | Rows | {n_rows:,} |
88
- | Columns| {n_cols} |
89
  | Memory | {mem_mb:.2f}ย MB |
90
 
91
- ## ๐Ÿ—‚ย Schema
92
  {schema_md}
93
 
94
  ## ๐Ÿ› ย Missing Values
95
  {missing_md}
96
 
97
- ## ๐Ÿ“ˆย Descriptiveย Statistics (numeric)
98
  {desc_md}
99
  """.strip()
 
1
  # tools/csv_parser.py
2
  # ------------------------------------------------------------
3
+ # Reads a CSVโ€ฏ/โ€ฏExcel file (sampling ultraโ€‘large CSVs), then
4
+ # returns a Markdown report:
5
+ # โ–ธ dimensions โ–ธ schema & dtypes
6
+ # โ–ธ missingโ€‘value map โ–ธ numeric describe()
7
+ # โ–ธ memory footprint
8
+ # If the optional dependency **tabulate** is unavailable,
9
+ # it falls back to a plainโ€‘text table wrapped in Markdown
10
+ # code fences, so no ImportError ever reaches the UI.
11
 
12
  from __future__ import annotations
13
 
14
  import os
15
  from typing import Union
16
 
17
+ import numpy as np
18
  import pandas as pd
19
 
20
 
21
+ # โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ
22
+ # โ”‚ Helper: efficient reader with sampling for huge CSVs โ”‚
23
+ # โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ
24
+ def _safe_read(path: Union[str, bytes], sample_rows: int = 1_000_000) -> pd.DataFrame:
25
+ """Load CSV / Excel. If CSV has >โ€ฏsample_rows, read a uniform sample."""
26
+ is_str = isinstance(path, str)
27
+ ext = os.path.splitext(path)[1].lower() if is_str else ".csv"
28
 
29
  if ext in (".xls", ".xlsx"):
30
+ return pd.read_excel(path, engine="openpyxl")
31
+
32
+ # --- CSV branch --------------------------------------------------------
33
+ if is_str:
34
+ # fast line count (memoryโ€‘map); falls back to full read for nonโ€‘files
35
+ with open(path, "rb") as fh:
36
+ n_total = sum(1 for _ in fh)
37
+ else:
38
+ n_total = None
 
 
 
 
39
 
40
+ if n_total and n_total > sample_rows:
41
+ # sample without reading entire file
42
+ rng = np.random.default_rng(seed=42)
43
+ skip = sorted(rng.choice(range(1, n_total), n_total - sample_rows, replace=False))
44
+ return pd.read_csv(path, skiprows=skip)
45
 
46
+ return pd.read_csv(path)
47
 
48
+
49
+ # โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ
50
+ # โ”‚ Main public helper โ”‚
51
+ # โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ
52
+ def parse_csv_tool(path: Union[str, bytes]) -> str:
53
  """
54
+ Return a Markdown report that Streamlit can render.
55
 
56
  Sections:
57
  โ€ข Dimensions
58
+ โ€ข Schema & dtypes
59
+ โ€ข Missingโ€‘value counts (+%)
60
+ โ€ข Numeric describe()
61
  โ€ข Memory usage
62
  """
63
  try:
64
+ df = _safe_read(path)
65
  except Exception as exc:
66
  return f"โŒ Failed to load data: {exc}"
67
 
68
+ rows, cols = df.shape
69
+ mem_mb = df.memory_usage(deep=True).sum() / 1024**2
70
 
71
+ # โ”€โ”€ Schema -------------------------------------------------------------
72
  schema_md = "\n".join(
73
+ f"- **{col}** โ€“ `{dtype}`" for col, dtype in df.dtypes.items()
 
74
  )
75
 
76
+ # โ”€โ”€ Missing map --------------------------------------------------------
77
+ miss_ct = df.isna().sum()
78
  miss_pct = (miss_ct / len(df) * 100).round(1)
79
+ missing_md = (
80
+ "\n".join(
81
+ f"- **{c}**: {miss_ct[c]}ย ({miss_pct[c]}โ€ฏ%)"
82
+ for c in df.columns
83
+ if miss_ct[c] > 0
84
+ )
85
+ or "None"
86
+ )
 
 
87
 
88
+ # โ”€โ”€ Numeric describe() -------------------------------------------------
89
+ numeric_df = df.select_dtypes("number")
90
+ if numeric_df.empty:
91
+ desc_md = "_No numeric columns_"
92
+ else:
93
+ try:
94
+ # requires the optional 'tabulate' package
95
+ desc_md = numeric_df.describe().T.round(2).to_markdown()
96
+ except ImportError:
97
+ # graceful fallback without extra dependency
98
+ desc_md = (
99
+ "```text\n"
100
+ + numeric_df.describe().T.round(2).to_string()
101
+ + "\n```"
102
+ )
103
 
104
+ # โ”€โ”€ Assemble markdown --------------------------------------------------
105
  return f"""
106
  # ๐Ÿ“Šย Dataset Overview
107
 
108
  | metric | value |
109
  | ------ | ----- |
110
+ | Rows | {rows:,} |
111
+ | Columns| {cols} |
112
  | Memory | {mem_mb:.2f}ย MB |
113
 
114
+ ## ๐Ÿ—‚ย Schema & Dtypes
115
  {schema_md}
116
 
117
  ## ๐Ÿ› ย Missing Values
118
  {missing_md}
119
 
120
+ ## ๐Ÿ“ˆย Descriptive Statisticsย (numeric)
121
  {desc_md}
122
  """.strip()