Spaces:
Running
Running
| """analytics.py β Load sample results and build Plotly figures for the Analytics tab.""" | |
| import json | |
| import os | |
| from collections import Counter, defaultdict | |
| import plotly.graph_objects as go | |
| from plotly.subplots import make_subplots | |
| # ββ Data loading ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
# Directory containing this module; sample result files are resolved relative to it.
_DIR = os.path.dirname(__file__)

# Conference display name -> JSONL results filename (one JSON record per paper).
DATASETS = {
    "ICLR 2025": "iclr2025_v2_results.jsonl",
    "ICML 2025": "icml2025_v3_results.jsonl",
    "NeurIPS 2025": "neurips2025_v3_results.jsonl",
}

# Color per review label (red / amber / green traffic-light scheme).
LABEL_COLORS = {
    "System 1": "#ef4444",
    "Mixed": "#f59e0b",
    "System 2": "#22c55e",
}

# Color per conference, used for conference-keyed traces.
CONF_COLORS = {
    "ICLR 2025": "#6366f1",
    "ICML 2025": "#f59e0b",
    "NeurIPS 2025": "#22c55e",
}
def _load_results(fname: str) -> list:
    """Load one JSONL results file from the module directory.

    Returns a list of parsed records. A missing file yields an empty list,
    and blank or malformed lines are skipped (loading is deliberately
    best-effort for sample data).
    """
    path = os.path.join(_DIR, fname)
    if not os.path.exists(path):
        return []
    out = []
    # Context manager + explicit encoding: the original leaked the file
    # handle and relied on the platform default encoding.
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                out.append(json.loads(line))
            except json.JSONDecodeError:
                # Narrowed from bare `except Exception`: only parse errors
                # are expected here, and they are intentionally skipped.
                continue
    return out
def load_all() -> dict:
    """Returns {conf: {"papers": [...], "reviews": [...], "metas": [...]}}"""
    result = {}
    for conf, fname in DATASETS.items():
        papers = _load_results(fname)
        reviews = []
        metas = []
        # Single pass over the papers collects both the individual reviews
        # and the metareviews, tagging each with decision + conference.
        for paper in papers:
            decision = paper.get("decision", "")
            for rating in paper.get("review_ratings", []):
                if rating.get("label"):
                    reviews.append({**rating, "_decision": decision, "_conf": conf})
        for paper in papers:
            meta = paper.get("metareview_rating")
            if meta and meta.get("label"):
                metas.append({**meta, "_decision": paper.get("decision", ""), "_conf": conf})
        result[conf] = {"papers": papers, "reviews": reviews, "metas": metas}
    return result
| # ββ Figure builders βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def fig_label_distribution(data: dict) -> go.Figure:
    """Grouped bar chart: per-conference percentage share of each label."""
    label_order = ["System 1", "Mixed", "System 2"]
    conf_names = list(data.keys())
    fig = go.Figure()
    for label in label_order:
        percentages = []
        for conf in conf_names:
            revs = data[conf]["reviews"]
            if revs:
                hits = len([r for r in revs if r["label"] == label])
                percentages.append(round(hits / len(revs) * 100, 1))
            else:
                # No reviews loaded for this conference: show a zero bar.
                percentages.append(0)
        fig.add_trace(go.Bar(
            name=label,
            x=conf_names,
            y=percentages,
            marker_color=LABEL_COLORS.get(label, "#888"),
            text=[f"{v}%" for v in percentages],
            textposition="outside",
        ))
    fig.update_layout(
        title="Review Label Distribution by Conference",
        barmode="group",
        yaxis=dict(title="% of reviews", range=[0, 75]),
        legend=dict(orientation="h", y=-0.2),
        height=420,
        margin=dict(t=50, b=80),
    )
    return fig
def fig_rqs_by_decision(data: dict) -> go.Figure:
    """Grouped bar: mean reasoning-quality score (RQS) per decision tier.

    One bar group per tier (Oral / Spotlight / Poster), one colored bar per
    conference; bar labels show the mean and the sample size n.
    """
    # Case-insensitive lookup generalizes the original's enumerated
    # capitalization duplicates ("Accept (Oral)" / "Accept (oral)" ...)
    # and also covers unseen variants such as "Accept (ORAL)".
    decision_map = {
        "accept (oral)": "Oral",
        "accept (spotlight)": "Spotlight",
        "accept (spotlight poster)": "Spotlight",
        "accept (poster)": "Poster",
    }
    tiers = ["Oral", "Spotlight", "Poster"]
    confs = list(data.keys())
    fig = go.Figure()
    for conf in confs:
        by_tier = defaultdict(list)
        for r in data[conf]["reviews"]:
            tier = decision_map.get(r["_decision"].strip().lower())
            rqs = r.get("overall_reasoning_quality_score")
            # Truthiness filter intentionally drops missing/empty ratings.
            if tier and rqs:
                by_tier[tier].append(float(rqs))
        y_vals = [round(sum(by_tier[t]) / len(by_tier[t]), 2) if by_tier[t] else None
                  for t in tiers]
        counts = [len(by_tier[t]) for t in tiers]
        fig.add_trace(go.Bar(
            name=conf,
            x=tiers,
            y=y_vals,
            marker_color=CONF_COLORS[conf],
            # Bug fix: `is not None` instead of truthiness, so a legitimate
            # 0.0 mean would still get its label rendered.
            text=[f"{v:.2f}<br>(n={c})" if v is not None else ""
                  for v, c in zip(y_vals, counts)],
            textposition="outside",
        ))
    fig.update_layout(
        title="Mean Reasoning Quality Score by Decision Tier",
        barmode="group",
        yaxis=dict(title="RQS (1β5)", range=[0, 4]),
        legend=dict(orientation="h", y=-0.2),
        height=420,
        margin=dict(t=50, b=80),
    )
    return fig
def fig_s1_s2_scatter(data: dict) -> go.Figure:
    """Scatter of System 1 vs System 2 score for all reviews.

    One trace per (conference, label) pair so legend entries can be
    toggled independently; a dashed diagonal marks S1 == S2.
    """
    fig = go.Figure()
    for conf, bundle in data.items():
        for label in ("System 1", "Mixed", "System 2", "Non-evaluative"):
            points = [
                r for r in bundle["reviews"]
                if r.get("label") == label
                and r.get("system1_score")
                and r.get("system2_score")
            ]
            if not points:
                continue
            fig.add_trace(go.Scatter(
                x=[p["system1_score"] for p in points],
                y=[p["system2_score"] for p in points],
                mode="markers",
                name=f"{conf} β {label}",
                marker=dict(color=LABEL_COLORS.get(label, "#888"), size=5, opacity=0.6),
                legendgroup=label,
                showlegend=True,
            ))
    # Dashed y = x reference line.
    fig.add_shape(type="line", x0=1, y0=1, x1=5, y1=5,
                  line=dict(color="gray", dash="dash", width=1))
    fig.update_layout(
        title="System 1 vs System 2 Score (all reviews)",
        xaxis=dict(title="System 1 Score", range=[0.8, 5.2]),
        yaxis=dict(title="System 2 Score", range=[0.8, 5.2]),
        height=480,
        margin=dict(t=50, b=40),
    )
    return fig
def fig_bias_heatmap(data: dict) -> go.Figure:
    """Heatmap of bias-diagnostic frequency (% of reviews) per conference."""
    bias_order = [
        "Checklist Inflation",
        "Representativeness Heuristic",
        "Question Substitution",
        "Conclusion-First Justification",
        "Overconfidence",
        "Narrative Fallacy",
        "Authority Substitution",
        "Confirmation Bias",
    ]
    conf_names = list(data.keys())
    z_rows = []
    cell_labels = []
    for conf in conf_names:
        revs = data[conf]["reviews"]
        total = len(revs) or 1  # guard against division by zero for empty datasets
        pct_row = []
        label_row = []
        for bias in bias_order:
            hits = sum(1 for r in revs if bias in r.get("bias_diagnostics", []))
            pct = round(hits / total * 100, 1)
            pct_row.append(pct)
            label_row.append(f"{pct}%<br>({hits})")
        z_rows.append(pct_row)
        cell_labels.append(label_row)
    fig = go.Figure(go.Heatmap(
        z=z_rows,
        x=bias_order,
        y=conf_names,
        text=cell_labels,
        texttemplate="%{text}",
        colorscale="YlOrRd",
        showscale=True,
        colorbar=dict(title="% reviews"),
    ))
    fig.update_layout(
        title="Bias Diagnostics Frequency (% of reviews per conference)",
        xaxis=dict(tickangle=-30),
        height=320,
        margin=dict(t=50, b=120),
    )
    return fig
def fig_rqs_distribution(data: dict) -> go.Figure:
    """Violin plot (with box + mean line) of RQS per conference."""
    fig = go.Figure()
    for conf, bundle in data.items():
        scores = []
        for r in bundle["reviews"]:
            # Truthy check skips reviews with a missing/empty score.
            if r.get("overall_reasoning_quality_score"):
                scores.append(float(r["overall_reasoning_quality_score"]))
        fig.add_trace(go.Violin(
            y=scores,
            name=conf,
            box_visible=True,
            meanline_visible=True,
            fillcolor=CONF_COLORS[conf],
            opacity=0.7,
            line_color="white",
        ))
    fig.update_layout(
        title="RQS Distribution by Conference",
        yaxis=dict(title="Overall Reasoning Quality Score (1β5)"),
        height=400,
        margin=dict(t=50, b=40),
    )
    return fig
| # ββ Summary text ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
def build_summary(data: dict) -> str:
    """Return a markdown summary with one stats line per conference.

    Conferences whose review list is empty are omitted entirely.
    """
    pieces = []
    for conf, bundle in data.items():
        revs = bundle["reviews"]
        if not revs:
            continue
        total = len(revs)
        label_counts = Counter(r["label"] for r in revs)
        scores = [float(r["overall_reasoning_quality_score"])
                  for r in revs if r.get("overall_reasoning_quality_score")]
        mean_score = sum(scores) / len(scores) if scores else 0
        mixed_pct = label_counts.get("Mixed", 0) / total * 100
        s1_pct = label_counts.get("System 1", 0) / total * 100
        s2_pct = label_counts.get("System 2", 0) / total * 100
        pieces.append(f"**{conf}** β {total} reviews Β· RQS mean {mean_score:.2f} Β· "
                      f"Mixed {mixed_pct:.0f}% Β· "
                      f"S1 {s1_pct:.0f}% Β· "
                      f"S2 {s2_pct:.0f}%")
    return "\n\n".join(pieces)
# Static markdown rendered beneath the charts in the Analytics tab.
# NOTE(review): runtime string kept byte-identical, including what appear to
# be mojibake characters (e.g. "Γ", "β") — fix the encoding at the source
# rather than editing this literal.
FINDINGS = """
### Key Findings
*100 papers Γ 3 conferences, ~1,150 reviews, rated by claude-sonnet-4-6. Papers sampled by stratified random sampling proportional to acceptance tier (Oral / Spotlight / Poster) within each venue.*
1. **ICML and NeurIPS reviewers show more System 2 tendency (~23β26%) than ICLR (16%).** ICML's structured fields (*Claims and Evidence*, *Theoretical Claims*, *Experimental Designs*) appear to scaffold more explicit, decomposed reasoning.
2. **Despite different formats and communities, the overall analytical depth of peer review is remarkably uniform** (RQS 2.80β2.94 / 5), suggesting a field-wide ceiling rather than venue-specific culture.
3. **Decision tier does not predict review quality.** Oral-paper reviews are not systematically stronger than Poster reviews (differences < 0.2 RQS points). Reviewers do not write more analytically for papers they rate highly.
---
> *We are not against AI review. We are against flawed reasoning behind review.*
"""