TuringsSolutions committed on
Commit
610ef64
·
verified ·
1 Parent(s): ad69efd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -16
app.py CHANGED
@@ -256,10 +256,30 @@ def compute_metrics(df: pd.DataFrame) -> dict:
256
  "monotone_runs": int(runs),
257
  "run_entropy_bits": float(Hruns),
258
  "sortedness_fraction": float(sorted_frac),
 
 
 
 
259
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  else:
261
  H, k = categorical_entropy(s)
262
- col_stats[c] = {"entropy_bits": float(H), "unique_values": int(k)}
 
 
 
263
  report["per_column"] = col_stats
264
 
265
  try:
@@ -310,6 +330,51 @@ def compute_metrics(df: pd.DataFrame) -> dict:
310
 
311
  return report
312
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
  # -------------------------------
314
  # UI rendering helpers
315
  # -------------------------------
@@ -360,8 +425,7 @@ def render_columns(report: dict) -> str:
360
  else:
361
  rows.append(f"<tr><td><b>{c}</b></td><td>{miss:.1%}</td><td>-</td><td>-</td><td>-</td><td>-</td></tr>")
362
  header = "<tr><th>Column</th><th>Missing</th><th>Entropy</th><th>Monotone Runs</th><th>Run-Entropy</th><th>Sortedness</th></tr>"
363
- table = "<table style='width:100%;border-collapse:collapse'>"+header+"".join(rows)+"</table>"
364
- # simple row borders
365
  table = table.replace("<tr>", "<tr style='border-bottom:1px solid #e5e7eb'>")
366
  table = table.replace("<th>", "<th style='text-align:left;padding:8px 6px;color:#374151'>")
367
  table = table.replace("<td>", "<td style='padding:8px 6px;color:#111827'>")
@@ -372,39 +436,43 @@ def render_columns(report: dict) -> str:
372
  # -------------------------------
373
  def analyze(file):
374
  if file is None:
375
- return "{}", "Please upload a CSV.", "", ""
376
  try:
377
  df = pd.read_csv(file.name)
378
  except Exception as e:
379
- return "{}", f"Failed to read CSV: {e}", "", ""
380
 
381
  report = compute_metrics(df)
382
  interp = interpret_report(report)
 
383
 
384
  report_json = json.dumps(report, indent=2)
385
  dashboard_html = render_dashboard(report, interp)
386
  recs_html = render_recs(interp)
387
  cols_html = render_columns(report)
 
388
 
389
- return report_json, dashboard_html, recs_html, cols_html
390
 
391
  with gr.Blocks(title="OrderLens — Data Interpreter") as demo:
392
  gr.Markdown("# OrderLens — Data Interpreter")
393
  gr.Markdown("Upload a CSV and get **readable** structure metrics with plain-language guidance.")
394
  with gr.Row():
395
- inp = gr.File(file_types=[\".csv\"], label=\"CSV file\")
396
- btn = gr.Button(\"Analyze\", variant=\"primary\")
397
- gr.Markdown(\"---\")
398
- gr.Markdown(\"### Dashboard\") # color-coded cards + verdict
399
  dash = gr.HTML()
400
- gr.Markdown(\"### Recommendations\") # actionable tips
401
  recs = gr.HTML()
402
- gr.Markdown(\"### Column Details\") # per-column table
403
  cols = gr.HTML()
404
- gr.Markdown(\"### Raw report (JSON)\") # API-friendly
405
- json_out = gr.Code(label=\"Report\", language=\"json\")
 
 
406
 
407
- btn.click(analyze, inputs=inp, outputs=[json_out, dash, recs, cols])
408
 
409
- if __name__ == \"__main__\":
410
  demo.launch()
 
256
  "monotone_runs": int(runs),
257
  "run_entropy_bits": float(Hruns),
258
  "sortedness_fraction": float(sorted_frac),
259
+ "min": float(np.nanmin(s.values)) if s.dropna().shape[0] else None,
260
+ "max": float(np.nanmax(s.values)) if s.dropna().shape[0] else None,
261
+ "mean": float(np.nanmean(s.values)) if s.dropna().shape[0] else None,
262
+ "std": float(np.nanstd(s.values)) if s.dropna().shape[0] else None,
263
  }
264
+ elif types[c] == "datetime":
265
+ try:
266
+ sd = pd.to_datetime(s, errors="coerce")
267
+ min_dt = sd.min()
268
+ max_dt = sd.max()
269
+ col_stats[c] = {
270
+ "entropy_bits": 0.0,
271
+ "unique_values": int(sd.nunique(dropna=True)),
272
+ "min_datetime": None if pd.isna(min_dt) else min_dt.isoformat(),
273
+ "max_datetime": None if pd.isna(max_dt) else max_dt.isoformat(),
274
+ }
275
+ except Exception:
276
+ col_stats[c] = {"entropy_bits": 0.0, "unique_values": int(s.nunique(dropna=True))}
277
  else:
278
  H, k = categorical_entropy(s)
279
+ # top-5 categories
280
+ vc = s.astype(str).value_counts(dropna=True).head(5)
281
+ top5 = [{"value": str(idx), "count": int(cnt)} for idx, cnt in vc.items()]
282
+ col_stats[c] = {"entropy_bits": float(H), "unique_values": int(k), "top_values": top5}
283
  report["per_column"] = col_stats
284
 
285
  try:
 
330
 
331
  return report
332
 
333
+ # -------------------------------
334
+ # Dataset shape summary for other models
335
+ # -------------------------------
336
+ def dataset_shape_summary(df: pd.DataFrame, report: dict, max_examples: int = 3) -> dict:
337
+ """Compact JSON describing the dataset schema, ranges, and examples for LLM ingestion."""
338
+ cols = []
339
+ for name, t in report["column_types"].items():
340
+ col_info = {"name": name, "type": t}
341
+ per = report["per_column"].get(name, {})
342
+ if t == "numeric":
343
+ col_info.update({
344
+ "min": per.get("min"),
345
+ "max": per.get("max"),
346
+ "mean": per.get("mean"),
347
+ "std": per.get("std"),
348
+ "missing_frac": report["missing_fraction_per_column"].get(name, 0.0)
349
+ })
350
+ elif t == "datetime":
351
+ col_info.update({
352
+ "min": per.get("min_datetime"),
353
+ "max": per.get("max_datetime"),
354
+ "missing_frac": report["missing_fraction_per_column"].get(name, 0.0)
355
+ })
356
+ else: # categorical or other
357
+ col_info.update({
358
+ "unique_values": per.get("unique_values"),
359
+ "top_values": per.get("top_values", []),
360
+ "missing_frac": report["missing_fraction_per_column"].get(name, 0.0)
361
+ })
362
+ cols.append(col_info)
363
+
364
+ # few example rows (stringified to be safe)
365
+ examples = df.head(max_examples).astype(str).to_dict(orient="records")
366
+
367
+ shape = {
368
+ "n_rows": report["shape"]["rows"],
369
+ "n_cols": report["shape"]["cols"],
370
+ "columns": cols,
371
+ "duplicates_fraction": report.get("duplicate_row_fraction", 0.0),
372
+ "gzip_compression_ratio": report.get("gzip_compression_ratio", None),
373
+ "harvestable_energy_score": report.get("harvestable_energy_score", None),
374
+ "examples": examples
375
+ }
376
+ return shape
377
+
378
  # -------------------------------
379
  # UI rendering helpers
380
  # -------------------------------
 
425
  else:
426
  rows.append(f"<tr><td><b>{c}</b></td><td>{miss:.1%}</td><td>-</td><td>-</td><td>-</td><td>-</td></tr>")
427
  header = "<tr><th>Column</th><th>Missing</th><th>Entropy</th><th>Monotone Runs</th><th>Run-Entropy</th><th>Sortedness</th></tr>"
428
+ table = "<table style='width:100%;border-collapse:collapse'>" + header + "".join(rows) + "</table>"
 
429
  table = table.replace("<tr>", "<tr style='border-bottom:1px solid #e5e7eb'>")
430
  table = table.replace("<th>", "<th style='text-align:left;padding:8px 6px;color:#374151'>")
431
  table = table.replace("<td>", "<td style='padding:8px 6px;color:#111827'>")
 
436
  # -------------------------------
437
  def analyze(file):
438
  if file is None:
439
+ return "{}", "Please upload a CSV.", "", "", "{}"
440
  try:
441
  df = pd.read_csv(file.name)
442
  except Exception as e:
443
+ return "{}", f"Failed to read CSV: {e}", "", "", "{}"
444
 
445
  report = compute_metrics(df)
446
  interp = interpret_report(report)
447
+ shape = dataset_shape_summary(df, report, max_examples=3)
448
 
449
  report_json = json.dumps(report, indent=2)
450
  dashboard_html = render_dashboard(report, interp)
451
  recs_html = render_recs(interp)
452
  cols_html = render_columns(report)
453
+ shape_json = json.dumps(shape, indent=2)
454
 
455
+ return report_json, dashboard_html, recs_html, cols_html, shape_json
456
 
457
  with gr.Blocks(title="OrderLens — Data Interpreter") as demo:
458
  gr.Markdown("# OrderLens — Data Interpreter")
459
  gr.Markdown("Upload a CSV and get **readable** structure metrics with plain-language guidance.")
460
  with gr.Row():
461
+ inp = gr.File(file_types=[".csv"], label="CSV file")
462
+ btn = gr.Button("Analyze", variant="primary")
463
+ gr.Markdown("---")
464
+ gr.Markdown("### Dashboard") # color-coded cards + verdict
465
  dash = gr.HTML()
466
+ gr.Markdown("### Recommendations") # actionable tips
467
  recs = gr.HTML()
468
+ gr.Markdown("### Column Details") # per-column table
469
  cols = gr.HTML()
470
+ gr.Markdown("### Dataset Shape Summary (JSON)") # compact schema for other models
471
+ shape_out = gr.Code(label="Shape", language="json")
472
+ gr.Markdown("### Raw report (JSON)") # API-friendly
473
+ json_out = gr.Code(label="Report", language="json")
474
 
475
+ btn.click(analyze, inputs=inp, outputs=[json_out, dash, recs, cols, shape_out])
476
 
477
+ if __name__ == "__main__":
478
  demo.launch()