Update app.py
app.py
@@ -256,10 +256,30 @@ def compute_metrics(df: pd.DataFrame) -> dict:
                 "monotone_runs": int(runs),
                 "run_entropy_bits": float(Hruns),
                 "sortedness_fraction": float(sorted_frac),
+                "min": float(np.nanmin(s.values)) if s.dropna().shape[0] else None,
+                "max": float(np.nanmax(s.values)) if s.dropna().shape[0] else None,
+                "mean": float(np.nanmean(s.values)) if s.dropna().shape[0] else None,
+                "std": float(np.nanstd(s.values)) if s.dropna().shape[0] else None,
             }
+        elif types[c] == "datetime":
+            try:
+                sd = pd.to_datetime(s, errors="coerce")
+                min_dt = sd.min()
+                max_dt = sd.max()
+                col_stats[c] = {
+                    "entropy_bits": 0.0,
+                    "unique_values": int(sd.nunique(dropna=True)),
+                    "min_datetime": None if pd.isna(min_dt) else min_dt.isoformat(),
+                    "max_datetime": None if pd.isna(max_dt) else max_dt.isoformat(),
+                }
+            except Exception:
+                col_stats[c] = {"entropy_bits": 0.0, "unique_values": int(s.nunique(dropna=True))}
         else:
             H, k = categorical_entropy(s)
-            col_stats[c] = {"entropy_bits": float(H), "unique_values": int(k)}
+            # top-5 categories
+            vc = s.astype(str).value_counts(dropna=True).head(5)
+            top5 = [{"value": str(idx), "count": int(cnt)} for idx, cnt in vc.items()]
+            col_stats[c] = {"entropy_bits": float(H), "unique_values": int(k), "top_values": top5}
     report["per_column"] = col_stats
 
     try:
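
Note: the three new branches can be exercised outside the app with a standalone sketch (illustration only, not the app's code path; inside `compute_metrics` these feed `col_stats`):

```python
# Standalone check of the new per-column stats (illustration only).
import numpy as np
import pandas as pd

# numeric branch: nan-aware summary stats, None when the column is all-missing
s = pd.Series([3.0, 1.0, None, 7.0])
print({
    "min": float(np.nanmin(s.values)) if s.dropna().shape[0] else None,
    "max": float(np.nanmax(s.values)) if s.dropna().shape[0] else None,
    "mean": float(np.nanmean(s.values)) if s.dropna().shape[0] else None,
    "std": float(np.nanstd(s.values)) if s.dropna().shape[0] else None,
})  # {'min': 1.0, 'max': 7.0, 'mean': 3.666..., 'std': 2.494...}

# datetime branch: coercion turns unparseable values into NaT, which min()/max() skip
sd = pd.to_datetime(pd.Series(["2024-01-02", "oops", "2024-03-05"]), errors="coerce")
print(None if pd.isna(sd.min()) else sd.min().isoformat())  # 2024-01-02T00:00:00

# categorical branch: top-5 values as JSON-friendly dicts (tie order may vary)
vc = pd.Series(["a", "b", "a", "c", "a"]).astype(str).value_counts(dropna=True).head(5)
print([{"value": str(idx), "count": int(cnt)} for idx, cnt in vc.items()])
# [{'value': 'a', 'count': 3}, {'value': 'b', 'count': 1}, {'value': 'c', 'count': 1}]
```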
@@ -310,6 +330,51 @@ def compute_metrics(df: pd.DataFrame) -> dict:
 
     return report
 
+# -------------------------------
+# Dataset shape summary for other models
+# -------------------------------
+def dataset_shape_summary(df: pd.DataFrame, report: dict, max_examples: int = 3) -> dict:
+    """Compact JSON describing the dataset schema, ranges, and examples for LLM ingestion."""
+    cols = []
+    for name, t in report["column_types"].items():
+        col_info = {"name": name, "type": t}
+        per = report["per_column"].get(name, {})
+        if t == "numeric":
+            col_info.update({
+                "min": per.get("min"),
+                "max": per.get("max"),
+                "mean": per.get("mean"),
+                "std": per.get("std"),
+                "missing_frac": report["missing_fraction_per_column"].get(name, 0.0)
+            })
+        elif t == "datetime":
+            col_info.update({
+                "min": per.get("min_datetime"),
+                "max": per.get("max_datetime"),
+                "missing_frac": report["missing_fraction_per_column"].get(name, 0.0)
+            })
+        else:  # categorical or other
+            col_info.update({
+                "unique_values": per.get("unique_values"),
+                "top_values": per.get("top_values", []),
+                "missing_frac": report["missing_fraction_per_column"].get(name, 0.0)
+            })
+        cols.append(col_info)
+
+    # few example rows (stringified to be safe)
+    examples = df.head(max_examples).astype(str).to_dict(orient="records")
+
+    shape = {
+        "n_rows": report["shape"]["rows"],
+        "n_cols": report["shape"]["cols"],
+        "columns": cols,
+        "duplicates_fraction": report.get("duplicate_row_fraction", 0.0),
+        "gzip_compression_ratio": report.get("gzip_compression_ratio", None),
+        "harvestable_energy_score": report.get("harvestable_energy_score", None),
+        "examples": examples
+    }
+    return shape
+
 # -------------------------------
 # UI rendering helpers
 # -------------------------------
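
Note: `dataset_shape_summary` only reads a handful of report keys (`shape`, `column_types`, `per_column`, `missing_fraction_per_column`, plus three optional fields fetched with `.get(...)` defaults), so it can be smoke-tested with a stub report; the values below are made up for illustration:

```python
# Smoke test with a hypothetical stub report (real values come from compute_metrics);
# assumes dataset_shape_summary from the hunk above is in scope.
import json
import pandas as pd

df = pd.DataFrame({"price": [1.0, 2.5], "label": ["a", "b"]})
report = {
    "shape": {"rows": 2, "cols": 2},
    "column_types": {"price": "numeric", "label": "categorical"},
    "per_column": {
        "price": {"min": 1.0, "max": 2.5, "mean": 1.75, "std": 0.75},
        "label": {"unique_values": 2, "top_values": [{"value": "a", "count": 1}]},
    },
    "missing_fraction_per_column": {"price": 0.0, "label": 0.0},
}
print(json.dumps(dataset_shape_summary(df, report), indent=2))
```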
@@ -360,8 +425,7 @@ def render_columns(report: dict) -> str:
         else:
             rows.append(f"<tr><td><b>{c}</b></td><td>{miss:.1%}</td><td>-</td><td>-</td><td>-</td><td>-</td></tr>")
     header = "<tr><th>Column</th><th>Missing</th><th>Entropy</th><th>Monotone Runs</th><th>Run-Entropy</th><th>Sortedness</th></tr>"
-    table = "<table style='width:100%;border-collapse:collapse'>"+header+"".join(rows)+"</table>"
-    # simple row borders
+    table = "<table style='width:100%;border-collapse:collapse'>" + header + "".join(rows) + "</table>"
     table = table.replace("<tr>", "<tr style='border-bottom:1px solid #e5e7eb'>")
     table = table.replace("<th>", "<th style='text-align:left;padding:8px 6px;color:#374151'>")
     table = table.replace("<td>", "<td style='padding:8px 6px;color:#111827'>")
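
Note: the change above only normalizes spacing; the replace-based styling in the context lines works because every tag is emitted bare (`<tr>`, `<th>`, `<td>`) and then rewritten globally after the table string is assembled, e.g.:

```python
# Tag-rewriting trick: bare tags are swapped for styled ones post-assembly.
table = "<table><tr><th>Col</th></tr><tr><td>1</td></tr></table>"
table = table.replace("<tr>", "<tr style='border-bottom:1px solid #e5e7eb'>")
print(table)
# <table><tr style='border-bottom:1px solid #e5e7eb'><th>Col</th></tr>...
```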
@@ -372,39 +436,43 @@ def render_columns(report: dict) -> str:
 # -------------------------------
 def analyze(file):
     if file is None:
-        return "{}", "Please upload a CSV.", "", ""
+        return "{}", "Please upload a CSV.", "", "", "{}"
     try:
         df = pd.read_csv(file.name)
     except Exception as e:
-        return "{}", f"Failed to read CSV: {e}", "", ""
+        return "{}", f"Failed to read CSV: {e}", "", "", "{}"
 
     report = compute_metrics(df)
     interp = interpret_report(report)
+    shape = dataset_shape_summary(df, report, max_examples=3)
 
     report_json = json.dumps(report, indent=2)
     dashboard_html = render_dashboard(report, interp)
     recs_html = render_recs(interp)
     cols_html = render_columns(report)
+    shape_json = json.dumps(shape, indent=2)
 
-    return report_json, dashboard_html, recs_html, cols_html
+    return report_json, dashboard_html, recs_html, cols_html, shape_json
 
 with gr.Blocks(title="OrderLens β Data Interpreter") as demo:
     gr.Markdown("# OrderLens β Data Interpreter")
     gr.Markdown("Upload a CSV and get **readable** structure metrics with plain-language guidance.")
     with gr.Row():
-        inp = gr.File(file_types=[
-        btn = gr.Button(
-    gr.Markdown(
-    gr.Markdown(
+        inp = gr.File(file_types=[".csv"], label="CSV file")
+        btn = gr.Button("Analyze", variant="primary")
+    gr.Markdown("---")
+    gr.Markdown("### Dashboard")  # color-coded cards + verdict
     dash = gr.HTML()
-    gr.Markdown(
+    gr.Markdown("### Recommendations")  # actionable tips
     recs = gr.HTML()
-    gr.Markdown(
+    gr.Markdown("### Column Details")  # per-column table
     cols = gr.HTML()
-    gr.Markdown(
-
+    gr.Markdown("### Dataset Shape Summary (JSON)")  # compact schema for other models
+    shape_out = gr.Code(label="Shape", language="json")
+    gr.Markdown("### Raw report (JSON)")  # API-friendly
+    json_out = gr.Code(label="Report", language="json")
 
-    btn.click(analyze, inputs=inp, outputs=[json_out, dash, recs, cols])
+    btn.click(analyze, inputs=inp, outputs=[json_out, dash, recs, cols, shape_out])
 
-if __name__ ==
+if __name__ == "__main__":
     demo.launch()
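
Note: `btn.click` maps the values returned by `analyze` to `outputs` positionally, which is why `shape_out` is appended at the end of both the return tuple and the outputs list. A minimal sketch of the same pattern (component names here are stand-ins, not the app's):

```python
# Minimal sketch of Gradio's positional output mapping (hypothetical components).
import gradio as gr

def f(text):
    # two return values -> two output components, matched by position
    return text.upper(), len(text)

with gr.Blocks() as demo:
    inp = gr.Textbox(label="text")
    btn = gr.Button("Run")
    upper_out = gr.Textbox(label="upper")
    len_out = gr.Number(label="length")
    btn.click(f, inputs=inp, outputs=[upper_out, len_out])

if __name__ == "__main__":
    demo.launch()
```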