Update app.py
app.py
@@ -256,10 +256,30 @@ def compute_metrics(df: pd.DataFrame) -> dict:
                 "monotone_runs": int(runs),
                 "run_entropy_bits": float(Hruns),
                 "sortedness_fraction": float(sorted_frac),
+                "min": float(np.nanmin(s.values)) if s.dropna().shape[0] else None,
+                "max": float(np.nanmax(s.values)) if s.dropna().shape[0] else None,
+                "mean": float(np.nanmean(s.values)) if s.dropna().shape[0] else None,
+                "std": float(np.nanstd(s.values)) if s.dropna().shape[0] else None,
             }
+        elif types[c] == "datetime":
+            try:
+                sd = pd.to_datetime(s, errors="coerce")
+                min_dt = sd.min()
+                max_dt = sd.max()
+                col_stats[c] = {
+                    "entropy_bits": 0.0,
+                    "unique_values": int(sd.nunique(dropna=True)),
+                    "min_datetime": None if pd.isna(min_dt) else min_dt.isoformat(),
+                    "max_datetime": None if pd.isna(max_dt) else max_dt.isoformat(),
+                }
+            except Exception:
+                col_stats[c] = {"entropy_bits": 0.0, "unique_values": int(s.nunique(dropna=True))}
         else:
             H, k = categorical_entropy(s)
-            col_stats[c] = {"entropy_bits": float(H), "unique_values": int(k)}
+            # top-5 categories
+            vc = s.astype(str).value_counts(dropna=True).head(5)
+            top5 = [{"value": str(idx), "count": int(cnt)} for idx, cnt in vc.items()]
+            col_stats[c] = {"entropy_bits": float(H), "unique_values": int(k), "top_values": top5}
     report["per_column"] = col_stats
 
     try:
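
Note: the three new branches can be exercised outside the app with a standalone sketch (illustration only, not the app's code path; inside `compute_metrics` these feed `col_stats`):

```python
# Standalone check of the new per-column stats (illustration only).
import numpy as np
import pandas as pd

# numeric branch: nan-aware summary stats, None when the column is all-missing
s = pd.Series([3.0, 1.0, None, 7.0])
print({
    "min": float(np.nanmin(s.values)) if s.dropna().shape[0] else None,
    "max": float(np.nanmax(s.values)) if s.dropna().shape[0] else None,
    "mean": float(np.nanmean(s.values)) if s.dropna().shape[0] else None,
    "std": float(np.nanstd(s.values)) if s.dropna().shape[0] else None,
})  # {'min': 1.0, 'max': 7.0, 'mean': 3.666..., 'std': 2.494...}

# datetime branch: coercion turns unparseable values into NaT, which min()/max() skip
sd = pd.to_datetime(pd.Series(["2024-01-02", "oops", "2024-03-05"]), errors="coerce")
print(None if pd.isna(sd.min()) else sd.min().isoformat())  # 2024-01-02T00:00:00

# categorical branch: top-5 values as JSON-friendly dicts (tie order may vary)
vc = pd.Series(["a", "b", "a", "c", "a"]).astype(str).value_counts(dropna=True).head(5)
print([{"value": str(idx), "count": int(cnt)} for idx, cnt in vc.items()])
# [{'value': 'a', 'count': 3}, {'value': 'b', 'count': 1}, {'value': 'c', 'count': 1}]
```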
@@ -310,6 +330,51 @@ def compute_metrics(df: pd.DataFrame) -> dict:
 
     return report
 
+# -------------------------------
+# Dataset shape summary for other models
+# -------------------------------
+def dataset_shape_summary(df: pd.DataFrame, report: dict, max_examples: int = 3) -> dict:
+    """Compact JSON describing the dataset schema, ranges, and examples for LLM ingestion."""
+    cols = []
+    for name, t in report["column_types"].items():
+        col_info = {"name": name, "type": t}
+        per = report["per_column"].get(name, {})
+        if t == "numeric":
+            col_info.update({
+                "min": per.get("min"),
+                "max": per.get("max"),
+                "mean": per.get("mean"),
+                "std": per.get("std"),
+                "missing_frac": report["missing_fraction_per_column"].get(name, 0.0)
+            })
+        elif t == "datetime":
+            col_info.update({
+                "min": per.get("min_datetime"),
+                "max": per.get("max_datetime"),
+                "missing_frac": report["missing_fraction_per_column"].get(name, 0.0)
+            })
+        else:  # categorical or other
+            col_info.update({
+                "unique_values": per.get("unique_values"),
+                "top_values": per.get("top_values", []),
+                "missing_frac": report["missing_fraction_per_column"].get(name, 0.0)
+            })
+        cols.append(col_info)
+
+    # few example rows (stringified to be safe)
+    examples = df.head(max_examples).astype(str).to_dict(orient="records")
+
+    shape = {
+        "n_rows": report["shape"]["rows"],
+        "n_cols": report["shape"]["cols"],
+        "columns": cols,
+        "duplicates_fraction": report.get("duplicate_row_fraction", 0.0),
+        "gzip_compression_ratio": report.get("gzip_compression_ratio", None),
+        "harvestable_energy_score": report.get("harvestable_energy_score", None),
+        "examples": examples
+    }
+    return shape
+
 # -------------------------------
 # UI rendering helpers
 # -------------------------------
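
Note: `dataset_shape_summary` only reads a handful of report keys (`shape`, `column_types`, `per_column`, `missing_fraction_per_column`, plus three optional fields fetched with `.get(...)` defaults), so it can be smoke-tested with a stub report; the values below are made up for illustration:

```python
# Smoke test with a hypothetical stub report (real values come from compute_metrics);
# assumes dataset_shape_summary from the hunk above is in scope.
import json
import pandas as pd

df = pd.DataFrame({"price": [1.0, 2.5], "label": ["a", "b"]})
report = {
    "shape": {"rows": 2, "cols": 2},
    "column_types": {"price": "numeric", "label": "categorical"},
    "per_column": {
        "price": {"min": 1.0, "max": 2.5, "mean": 1.75, "std": 0.75},
        "label": {"unique_values": 2, "top_values": [{"value": "a", "count": 1}]},
    },
    "missing_fraction_per_column": {"price": 0.0, "label": 0.0},
}
print(json.dumps(dataset_shape_summary(df, report), indent=2))
```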
@@ -360,8 +425,7 @@ def render_columns(report: dict) -> str:
         else:
             rows.append(f"<tr><td><b>{c}</b></td><td>{miss:.1%}</td><td>-</td><td>-</td><td>-</td><td>-</td></tr>")
     header = "<tr><th>Column</th><th>Missing</th><th>Entropy</th><th>Monotone Runs</th><th>Run-Entropy</th><th>Sortedness</th></tr>"
-    table = "<table style='width:100%;border-collapse:collapse'>"+header+"".join(rows)+"</table>"
-    # simple row borders
+    table = "<table style='width:100%;border-collapse:collapse'>" + header + "".join(rows) + "</table>"
     table = table.replace("<tr>", "<tr style='border-bottom:1px solid #e5e7eb'>")
     table = table.replace("<th>", "<th style='text-align:left;padding:8px 6px;color:#374151'>")
     table = table.replace("<td>", "<td style='padding:8px 6px;color:#111827'>")
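
Note: the change above only normalizes spacing; the replace-based styling in the context lines works because every tag is emitted bare (`<tr>`, `<th>`, `<td>`) and then rewritten globally after the table string is assembled, e.g.:

```python
# Tag-rewriting trick: bare tags are swapped for styled ones post-assembly.
table = "<table><tr><th>Col</th></tr><tr><td>1</td></tr></table>"
table = table.replace("<tr>", "<tr style='border-bottom:1px solid #e5e7eb'>")
print(table)
# <table><tr style='border-bottom:1px solid #e5e7eb'><th>Col</th></tr>...
```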
@@ -372,39 +436,43 @@ def render_columns(report: dict) -> str:
 # -------------------------------
 def analyze(file):
     if file is None:
-        return "{}", "Please upload a CSV.", "", ""
+        return "{}", "Please upload a CSV.", "", "", "{}"
     try:
         df = pd.read_csv(file.name)
     except Exception as e:
-        return "{}", f"Failed to read CSV: {e}", "", ""
+        return "{}", f"Failed to read CSV: {e}", "", "", "{}"
 
     report = compute_metrics(df)
     interp = interpret_report(report)
+    shape = dataset_shape_summary(df, report, max_examples=3)
 
     report_json = json.dumps(report, indent=2)
     dashboard_html = render_dashboard(report, interp)
     recs_html = render_recs(interp)
     cols_html = render_columns(report)
+    shape_json = json.dumps(shape, indent=2)
 
-    return report_json, dashboard_html, recs_html, cols_html
+    return report_json, dashboard_html, recs_html, cols_html, shape_json
 
 with gr.Blocks(title="OrderLens β Data Interpreter") as demo:
     gr.Markdown("# OrderLens β Data Interpreter")
     gr.Markdown("Upload a CSV and get **readable** structure metrics with plain-language guidance.")
     with gr.Row():
-        inp = gr.File(file_types=[
-        btn = gr.Button(
-    gr.Markdown(
-    gr.Markdown(
+        inp = gr.File(file_types=[".csv"], label="CSV file")
+        btn = gr.Button("Analyze", variant="primary")
+    gr.Markdown("---")
+    gr.Markdown("### Dashboard")  # color-coded cards + verdict
     dash = gr.HTML()
-    gr.Markdown(
+    gr.Markdown("### Recommendations")  # actionable tips
     recs = gr.HTML()
-    gr.Markdown(
+    gr.Markdown("### Column Details")  # per-column table
     cols = gr.HTML()
-    gr.Markdown(
-
+    gr.Markdown("### Dataset Shape Summary (JSON)")  # compact schema for other models
+    shape_out = gr.Code(label="Shape", language="json")
+    gr.Markdown("### Raw report (JSON)")  # API-friendly
+    json_out = gr.Code(label="Report", language="json")
 
-    btn.click(analyze, inputs=inp, outputs=[json_out, dash, recs, cols])
+    btn.click(analyze, inputs=inp, outputs=[json_out, dash, recs, cols, shape_out])
 
-if __name__ ==
+if __name__ == "__main__":
     demo.launch()
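
Note: `btn.click` maps the values returned by `analyze` to `outputs` positionally, which is why `shape_out` is appended at the end of both the return tuple and the outputs list. A minimal sketch of the same pattern (component names here are stand-ins, not the app's):

```python
# Minimal sketch of Gradio's positional output mapping (hypothetical components).
import gradio as gr

def f(text):
    # two return values -> two output components, matched by position
    return text.upper(), len(text)

with gr.Blocks() as demo:
    inp = gr.Textbox(label="text")
    btn = gr.Button("Run")
    upper_out = gr.Textbox(label="upper")
    len_out = gr.Number(label="length")
    btn.click(f, inputs=inp, outputs=[upper_out, len_out])

if __name__ == "__main__":
    demo.launch()
```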