# Kahneman4Review / analytics.py
# Commit 918804b by nuocuhz — "Analytics: remove Non-evaluative, trim findings to 3 points, add slogan"
"""analytics.py β€” Load sample results and build Plotly figures for the Analytics tab."""
import json
import os
from collections import Counter, defaultdict
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# ── Data loading ──────────────────────────────────────────────────────────────
# Directory containing this module; the sample JSONL result files live alongside it.
_DIR = os.path.dirname(__file__)
# Conference display name -> JSONL results file (one JSON record per paper).
DATASETS = {
    "ICLR 2025": "iclr2025_v2_results.jsonl",
    "ICML 2025": "icml2025_v3_results.jsonl",
    "NeurIPS 2025": "neurips2025_v3_results.jsonl",
}
# Marker/bar color per review label (red = System 1, amber = Mixed, green = System 2).
LABEL_COLORS = {
    "System 1": "#ef4444",
    "Mixed": "#f59e0b",
    "System 2": "#22c55e",
}
# Accent color per conference, used by per-conference traces.
CONF_COLORS = {
    "ICLR 2025": "#6366f1",
    "ICML 2025": "#f59e0b",
    "NeurIPS 2025": "#22c55e",
}
def _load_results(fname: str) -> list:
    """Load one JSONL results file from the module directory.

    Best-effort loader: a missing file yields an empty list and malformed
    lines are skipped so one bad record cannot discard the rest.

    Args:
        fname: File name relative to this module's directory.

    Returns:
        List of parsed JSON records (dicts), possibly empty.
    """
    path = os.path.join(_DIR, fname)
    if not os.path.exists(path):
        return []
    out = []
    # Context manager guarantees the handle is closed (the original leaked it);
    # explicit encoding avoids platform-dependent defaults.
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                out.append(json.loads(line))
            except json.JSONDecodeError:
                # Narrowed from bare `except Exception`: only skip parse failures.
                continue
    return out
def load_all() -> dict:
    """Returns {conf: {"papers": [...], "reviews": [...], "metas": [...]}}"""
    result = {}
    for conf, fname in DATASETS.items():
        papers = _load_results(fname)
        reviews, metas = [], []
        # Single pass over the papers: flatten labelled review ratings and
        # meta-review ratings, tagging each with its decision and conference.
        for paper in papers:
            decision = paper.get("decision", "")
            for rating in paper.get("review_ratings", []):
                if rating.get("label"):
                    reviews.append({**rating, "_decision": decision, "_conf": conf})
            meta = paper.get("metareview_rating")
            if meta and meta.get("label"):
                metas.append({**meta, "_decision": decision, "_conf": conf})
        result[conf] = {"papers": papers, "reviews": reviews, "metas": metas}
    return result
# ── Figure builders ───────────────────────────────────────────────────────────
def fig_label_distribution(data: dict) -> go.Figure:
    """Grouped bar: label distribution per conference."""
    confs = list(data.keys())
    fig = go.Figure()
    for label in ("System 1", "Mixed", "System 2"):
        percentages = []
        for conf in confs:
            reviews = data[conf]["reviews"]
            if reviews:
                matched = sum(1 for r in reviews if r["label"] == label)
                percentages.append(round(matched / len(reviews) * 100, 1))
            else:
                # No data for this conference: show a zero-height bar.
                percentages.append(0)
        fig.add_trace(go.Bar(
            name=label,
            x=confs,
            y=percentages,
            marker_color=LABEL_COLORS.get(label, "#888"),
            text=[f"{p}%" for p in percentages],
            textposition="outside",
        ))
    fig.update_layout(
        title="Review Label Distribution by Conference",
        barmode="group",
        yaxis=dict(title="% of reviews", range=[0, 75]),
        legend=dict(orientation="h", y=-0.2),
        height=420,
        margin=dict(t=50, b=80),
    )
    return fig
def fig_rqs_by_decision(data: dict) -> go.Figure:
    """Grouped bar: mean RQS per decision tier per conference.

    Only accepted papers are shown; venue-specific decision strings are
    normalized to three tiers (Oral / Spotlight / Poster).
    """
    decision_map = {
        "Accept (Oral)": "Oral",
        "Accept (oral)": "Oral",
        "Accept (Spotlight)": "Spotlight",
        "Accept (spotlight)": "Spotlight",
        "Accept (spotlight poster)": "Spotlight",
        "Accept (Poster)": "Poster",
        "Accept (poster)": "Poster",
    }
    tiers = ["Oral", "Spotlight", "Poster"]
    confs = list(data.keys())
    fig = go.Figure()
    for conf in confs:
        by_tier = defaultdict(list)
        for r in data[conf]["reviews"]:
            tier = decision_map.get(r["_decision"])
            rqs = r.get("overall_reasoning_quality_score")
            # Fix: test against None, not truthiness, so a legitimate 0 score
            # is not silently dropped from the mean.
            if tier and rqs is not None:
                by_tier[tier].append(float(rqs))
        y_vals = [round(sum(by_tier[t]) / len(by_tier[t]), 2) if by_tier[t] else None
                  for t in tiers]
        counts = [len(by_tier[t]) for t in tiers]
        fig.add_trace(go.Bar(
            name=conf,
            x=tiers,
            y=y_vals,
            marker_color=CONF_COLORS[conf],
            # Fix: None marks a missing tier; a 0.00 mean should still be labelled.
            text=[f"{v:.2f}<br>(n={c})" if v is not None else ""
                  for v, c in zip(y_vals, counts)],
            textposition="outside",
        ))
    fig.update_layout(
        title="Mean Reasoning Quality Score by Decision Tier",
        barmode="group",
        yaxis=dict(title="RQS (1–5)", range=[0, 4]),
        legend=dict(orientation="h", y=-0.2),
        height=420,
        margin=dict(t=50, b=80),
    )
    return fig
def fig_s1_s2_scatter(data: dict) -> go.Figure:
    """Scatter: S1 score vs S2 score, colored by label, one trace per conf."""
    fig = go.Figure()
    # NOTE(review): "Non-evaluative" is still iterated here although the commit
    # message says it was removed elsewhere — confirm whether it should stay.
    labels = ("System 1", "Mixed", "System 2", "Non-evaluative")
    for conf in data:
        reviews = data[conf]["reviews"]
        for label in labels:
            pts = [
                r for r in reviews
                if r.get("label") == label
                and r.get("system1_score")
                and r.get("system2_score")
            ]
            if not pts:
                continue
            fig.add_trace(go.Scatter(
                x=[p["system1_score"] for p in pts],
                y=[p["system2_score"] for p in pts],
                mode="markers",
                name=f"{conf} — {label}",
                marker=dict(color=LABEL_COLORS.get(label, "#888"), size=5, opacity=0.6),
                legendgroup=label,
                showlegend=True,
            ))
    # Diagonal y = x reference line.
    fig.add_shape(type="line", x0=1, y0=1, x1=5, y1=5,
                  line=dict(color="gray", dash="dash", width=1))
    fig.update_layout(
        title="System 1 vs System 2 Score (all reviews)",
        xaxis=dict(title="System 1 Score", range=[0.8, 5.2]),
        yaxis=dict(title="System 2 Score", range=[0.8, 5.2]),
        height=480,
        margin=dict(t=50, b=40),
    )
    return fig
def fig_bias_heatmap(data: dict) -> go.Figure:
    """Heatmap: bias frequency (% of reviews) per conference."""
    bias_order = [
        "Checklist Inflation",
        "Representativeness Heuristic",
        "Question Substitution",
        "Conclusion-First Justification",
        "Overconfidence",
        "Narrative Fallacy",
        "Authority Substitution",
        "Confirmation Bias",
    ]
    confs = list(data.keys())
    z, text = [], []
    for conf in confs:
        reviews = data[conf]["reviews"]
        denom = len(reviews) or 1  # guard against division by zero for empty confs
        pct_row, label_row = [], []
        for bias in bias_order:
            hits = sum(1 for r in reviews if bias in r.get("bias_diagnostics", []))
            pct = round(hits / denom * 100, 1)
            pct_row.append(pct)
            label_row.append(f"{pct}%<br>({hits})")
        z.append(pct_row)
        text.append(label_row)
    fig = go.Figure(go.Heatmap(
        z=z,
        x=bias_order,
        y=confs,
        text=text,
        texttemplate="%{text}",
        colorscale="YlOrRd",
        showscale=True,
        colorbar=dict(title="% reviews"),
    ))
    fig.update_layout(
        title="Bias Diagnostics Frequency (% of reviews per conference)",
        xaxis=dict(tickangle=-30),
        height=320,
        margin=dict(t=50, b=120),
    )
    return fig
def fig_rqs_distribution(data: dict) -> go.Figure:
    """Violin: RQS distribution per conference."""
    fig = go.Figure()
    for conf in data:
        scores = [
            float(r["overall_reasoning_quality_score"])
            for r in data[conf]["reviews"]
            if r.get("overall_reasoning_quality_score")
        ]
        fig.add_trace(go.Violin(
            y=scores,
            name=conf,
            box_visible=True,
            meanline_visible=True,
            fillcolor=CONF_COLORS[conf],
            opacity=0.7,
            line_color="white",
        ))
    fig.update_layout(
        title="RQS Distribution by Conference",
        yaxis=dict(title="Overall Reasoning Quality Score (1–5)"),
        height=400,
        margin=dict(t=50, b=40),
    )
    return fig
# ── Summary text ──────────────────────────────────────────────────────────────
def build_summary(data: dict) -> str:
    """One markdown line per conference: review count, mean RQS, label mix."""
    parts = []
    for conf, bundle in data.items():
        reviews = bundle["reviews"]
        if not reviews:
            continue  # nothing to summarize for this conference
        n = len(reviews)
        label_counts = Counter(r["label"] for r in reviews)
        scores = [float(r["overall_reasoning_quality_score"])
                  for r in reviews if r.get("overall_reasoning_quality_score")]
        mean_rqs = sum(scores) / len(scores) if scores else 0

        def pct(lbl):
            return label_counts.get(lbl, 0) / n * 100

        parts.append(
            f"**{conf}** — {n} reviews · RQS mean {mean_rqs:.2f} · "
            f"Mixed {pct('Mixed'):.0f}% · "
            f"S1 {pct('System 1'):.0f}% · "
            f"S2 {pct('System 2'):.0f}%"
        )
    return "\n\n".join(parts)
# Markdown summary rendered beneath the Analytics figures by the UI layer.
# Fix: repair mis-encoded characters from a bad round-trip ("Γ—" -> "×", "β€”" -> "—").
FINDINGS = """
### Key Findings
*100 papers × 3 conferences, ~1,150 reviews, rated by claude-sonnet-4-6. Papers sampled by stratified random sampling proportional to acceptance tier (Oral / Spotlight / Poster) within each venue.*
1. **ICML and NeurIPS reviewers show more System 2 tendency (~23–26%) than ICLR (16%).** ICML's structured fields (*Claims and Evidence*, *Theoretical Claims*, *Experimental Designs*) appear to scaffold more explicit, decomposed reasoning.
2. **Despite different formats and communities, the overall analytical depth of peer review is remarkably uniform** (RQS 2.80–2.94 / 5), suggesting a field-wide ceiling rather than venue-specific culture.
3. **Decision tier does not predict review quality.** Oral-paper reviews are not systematically stronger than Poster reviews (differences < 0.2 RQS points). Reviewers do not write more analytically for papers they rate highly.
---
> *We are not against AI review. We are against flawed reasoning behind review.*
"""