# Kahneman4Review / analytics.py
# Commit 918804b by nuocuhz — "Analytics: remove Non-evaluative, trim findings to 3 points, add slogan"
"""analytics.py β€” Load sample results and build Plotly figures for the Analytics tab."""
import json
import os
from collections import Counter, defaultdict
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# ── Data loading ──────────────────────────────────────────────────────────────
# Directory containing this module; the sample JSONL result files live alongside it.
_DIR = os.path.dirname(__file__)
# Conference display name -> JSONL results file (one JSON record per paper).
DATASETS = {
    "ICLR 2025": "iclr2025_v2_results.jsonl",
    "ICML 2025": "icml2025_v3_results.jsonl",
    "NeurIPS 2025": "neurips2025_v3_results.jsonl",
}
# Marker/bar color per review label (red = System 1, amber = Mixed, green = System 2).
LABEL_COLORS = {
    "System 1": "#ef4444",
    "Mixed": "#f59e0b",
    "System 2": "#22c55e",
}
# Accent color per conference, used by per-conference traces.
CONF_COLORS = {
    "ICLR 2025": "#6366f1",
    "ICML 2025": "#f59e0b",
    "NeurIPS 2025": "#22c55e",
}
def _load_results(fname: str) -> list:
    """Load one JSONL results file from the module directory.

    Best-effort loader: a missing file yields an empty list and malformed
    lines are skipped so one bad record cannot discard the rest.

    Args:
        fname: File name relative to this module's directory.

    Returns:
        List of parsed JSON records (dicts), possibly empty.
    """
    path = os.path.join(_DIR, fname)
    if not os.path.exists(path):
        return []
    out = []
    # Context manager guarantees the handle is closed (the original leaked it);
    # explicit encoding avoids platform-dependent defaults.
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                out.append(json.loads(line))
            except json.JSONDecodeError:
                # Narrowed from bare `except Exception`: only skip parse failures.
                continue
    return out
def load_all() -> dict:
    """Returns {conf: {"papers": [...], "reviews": [...], "metas": [...]}}"""
    result = {}
    for conf, fname in DATASETS.items():
        papers = _load_results(fname)
        reviews, metas = [], []
        # Single pass over the papers: flatten labelled review ratings and
        # meta-review ratings, tagging each with its decision and conference.
        for paper in papers:
            decision = paper.get("decision", "")
            for rating in paper.get("review_ratings", []):
                if rating.get("label"):
                    reviews.append({**rating, "_decision": decision, "_conf": conf})
            meta = paper.get("metareview_rating")
            if meta and meta.get("label"):
                metas.append({**meta, "_decision": decision, "_conf": conf})
        result[conf] = {"papers": papers, "reviews": reviews, "metas": metas}
    return result
# ── Figure builders ───────────────────────────────────────────────────────────
def fig_label_distribution(data: dict) -> go.Figure:
    """Grouped bar: label distribution per conference."""
    confs = list(data.keys())
    fig = go.Figure()
    for label in ("System 1", "Mixed", "System 2"):
        percentages = []
        for conf in confs:
            reviews = data[conf]["reviews"]
            if reviews:
                matched = sum(1 for r in reviews if r["label"] == label)
                percentages.append(round(matched / len(reviews) * 100, 1))
            else:
                # No data for this conference: show a zero-height bar.
                percentages.append(0)
        fig.add_trace(go.Bar(
            name=label,
            x=confs,
            y=percentages,
            marker_color=LABEL_COLORS.get(label, "#888"),
            text=[f"{p}%" for p in percentages],
            textposition="outside",
        ))
    fig.update_layout(
        title="Review Label Distribution by Conference",
        barmode="group",
        yaxis=dict(title="% of reviews", range=[0, 75]),
        legend=dict(orientation="h", y=-0.2),
        height=420,
        margin=dict(t=50, b=80),
    )
    return fig
def fig_rqs_by_decision(data: dict) -> go.Figure:
    """Grouped bar: mean RQS per decision tier per conference.

    Only accepted papers are shown; venue-specific decision strings are
    normalized to three tiers (Oral / Spotlight / Poster).
    """
    decision_map = {
        "Accept (Oral)": "Oral",
        "Accept (oral)": "Oral",
        "Accept (Spotlight)": "Spotlight",
        "Accept (spotlight)": "Spotlight",
        "Accept (spotlight poster)": "Spotlight",
        "Accept (Poster)": "Poster",
        "Accept (poster)": "Poster",
    }
    tiers = ["Oral", "Spotlight", "Poster"]
    confs = list(data.keys())
    fig = go.Figure()
    for conf in confs:
        by_tier = defaultdict(list)
        for r in data[conf]["reviews"]:
            tier = decision_map.get(r["_decision"])
            rqs = r.get("overall_reasoning_quality_score")
            # Fix: test against None, not truthiness, so a legitimate 0 score
            # is not silently dropped from the mean.
            if tier and rqs is not None:
                by_tier[tier].append(float(rqs))
        y_vals = [round(sum(by_tier[t]) / len(by_tier[t]), 2) if by_tier[t] else None
                  for t in tiers]
        counts = [len(by_tier[t]) for t in tiers]
        fig.add_trace(go.Bar(
            name=conf,
            x=tiers,
            y=y_vals,
            marker_color=CONF_COLORS[conf],
            # Fix: None marks a missing tier; a 0.00 mean should still be labelled.
            text=[f"{v:.2f}<br>(n={c})" if v is not None else ""
                  for v, c in zip(y_vals, counts)],
            textposition="outside",
        ))
    fig.update_layout(
        title="Mean Reasoning Quality Score by Decision Tier",
        barmode="group",
        yaxis=dict(title="RQS (1–5)", range=[0, 4]),
        legend=dict(orientation="h", y=-0.2),
        height=420,
        margin=dict(t=50, b=80),
    )
    return fig
def fig_s1_s2_scatter(data: dict) -> go.Figure:
    """Scatter: S1 score vs S2 score, colored by label, one trace per conf."""
    fig = go.Figure()
    # NOTE(review): "Non-evaluative" is still iterated here although the commit
    # message says it was removed elsewhere — confirm whether it should stay.
    labels = ("System 1", "Mixed", "System 2", "Non-evaluative")
    for conf in data:
        reviews = data[conf]["reviews"]
        for label in labels:
            pts = [
                r for r in reviews
                if r.get("label") == label
                and r.get("system1_score")
                and r.get("system2_score")
            ]
            if not pts:
                continue
            fig.add_trace(go.Scatter(
                x=[p["system1_score"] for p in pts],
                y=[p["system2_score"] for p in pts],
                mode="markers",
                name=f"{conf} — {label}",
                marker=dict(color=LABEL_COLORS.get(label, "#888"), size=5, opacity=0.6),
                legendgroup=label,
                showlegend=True,
            ))
    # Diagonal y = x reference line.
    fig.add_shape(type="line", x0=1, y0=1, x1=5, y1=5,
                  line=dict(color="gray", dash="dash", width=1))
    fig.update_layout(
        title="System 1 vs System 2 Score (all reviews)",
        xaxis=dict(title="System 1 Score", range=[0.8, 5.2]),
        yaxis=dict(title="System 2 Score", range=[0.8, 5.2]),
        height=480,
        margin=dict(t=50, b=40),
    )
    return fig
def fig_bias_heatmap(data: dict) -> go.Figure:
    """Heatmap: bias frequency (% of reviews) per conference."""
    bias_order = [
        "Checklist Inflation",
        "Representativeness Heuristic",
        "Question Substitution",
        "Conclusion-First Justification",
        "Overconfidence",
        "Narrative Fallacy",
        "Authority Substitution",
        "Confirmation Bias",
    ]
    confs = list(data.keys())
    z, text = [], []
    for conf in confs:
        reviews = data[conf]["reviews"]
        denom = len(reviews) or 1  # guard against division by zero for empty confs
        pct_row, label_row = [], []
        for bias in bias_order:
            hits = sum(1 for r in reviews if bias in r.get("bias_diagnostics", []))
            pct = round(hits / denom * 100, 1)
            pct_row.append(pct)
            label_row.append(f"{pct}%<br>({hits})")
        z.append(pct_row)
        text.append(label_row)
    fig = go.Figure(go.Heatmap(
        z=z,
        x=bias_order,
        y=confs,
        text=text,
        texttemplate="%{text}",
        colorscale="YlOrRd",
        showscale=True,
        colorbar=dict(title="% reviews"),
    ))
    fig.update_layout(
        title="Bias Diagnostics Frequency (% of reviews per conference)",
        xaxis=dict(tickangle=-30),
        height=320,
        margin=dict(t=50, b=120),
    )
    return fig
def fig_rqs_distribution(data: dict) -> go.Figure:
    """Violin: RQS distribution per conference."""
    fig = go.Figure()
    for conf in data:
        scores = [
            float(r["overall_reasoning_quality_score"])
            for r in data[conf]["reviews"]
            if r.get("overall_reasoning_quality_score")
        ]
        fig.add_trace(go.Violin(
            y=scores,
            name=conf,
            box_visible=True,
            meanline_visible=True,
            fillcolor=CONF_COLORS[conf],
            opacity=0.7,
            line_color="white",
        ))
    fig.update_layout(
        title="RQS Distribution by Conference",
        yaxis=dict(title="Overall Reasoning Quality Score (1–5)"),
        height=400,
        margin=dict(t=50, b=40),
    )
    return fig
# ── Summary text ──────────────────────────────────────────────────────────────
def build_summary(data: dict) -> str:
    """One markdown line per conference: review count, mean RQS, label mix."""
    parts = []
    for conf, bundle in data.items():
        reviews = bundle["reviews"]
        if not reviews:
            continue  # nothing to summarize for this conference
        n = len(reviews)
        label_counts = Counter(r["label"] for r in reviews)
        scores = [float(r["overall_reasoning_quality_score"])
                  for r in reviews if r.get("overall_reasoning_quality_score")]
        mean_rqs = sum(scores) / len(scores) if scores else 0

        def pct(lbl):
            return label_counts.get(lbl, 0) / n * 100

        parts.append(
            f"**{conf}** — {n} reviews · RQS mean {mean_rqs:.2f} · "
            f"Mixed {pct('Mixed'):.0f}% · "
            f"S1 {pct('System 1'):.0f}% · "
            f"S2 {pct('System 2'):.0f}%"
        )
    return "\n\n".join(parts)
# Markdown summary rendered beneath the Analytics figures by the UI layer.
# Fix: repair mis-encoded characters from a bad round-trip ("Γ—" -> "×", "β€”" -> "—").
FINDINGS = """
### Key Findings
*100 papers × 3 conferences, ~1,150 reviews, rated by claude-sonnet-4-6. Papers sampled by stratified random sampling proportional to acceptance tier (Oral / Spotlight / Poster) within each venue.*
1. **ICML and NeurIPS reviewers show more System 2 tendency (~23–26%) than ICLR (16%).** ICML's structured fields (*Claims and Evidence*, *Theoretical Claims*, *Experimental Designs*) appear to scaffold more explicit, decomposed reasoning.
2. **Despite different formats and communities, the overall analytical depth of peer review is remarkably uniform** (RQS 2.80–2.94 / 5), suggesting a field-wide ceiling rather than venue-specific culture.
3. **Decision tier does not predict review quality.** Oral-paper reviews are not systematically stronger than Poster reviews (differences < 0.2 RQS points). Reviewers do not write more analytically for papers they rate highly.
---
> *We are not against AI review. We are against flawed reasoning behind review.*
"""