V2: switch to raw Intercepted DESC sort + Hermes-only filter (visible-column order)
Browse files
app.py
CHANGED
|
@@ -33,7 +33,7 @@ INTRO = """# π ClawBench — Web Agent Benchmark
|
|
| 33 |
[**π Paper**](https://arxiv.org/abs/2604.08523) · [**💻 GitHub**](https://github.com/reacher-z/ClawBench) · [**π Dataset**](https://huggingface.co/datasets/TIGER-Lab/ClawBench) · [**π Traces V1**](https://huggingface.co/datasets/NAIL-Group/ClawBenchV1Trace) · [**π Traces V2**](https://huggingface.co/datasets/TIGER-Lab/ClawBenchV2Trace) · [**π Site**](https://claw-bench.com)
|
| 34 |
"""
|
| 35 |
|
| 36 |
-
TABLE_INTRO = """**Intercepted** (sort key) = agent's final HTTP request matched the task's URL/method schema — Stage 1, deterministic, no judge. **Reward** = additionally requires the LLM judge (default `deepseek/deepseek-v4-pro`) to confirm the payload fulfilled the instruction — Stage 2. Rows are ranked by Intercepted
|
| 37 |
|
| 38 |
ABOUT = """## About ClawBench
|
| 39 |
|
|
@@ -109,23 +109,15 @@ def load_results() -> pd.DataFrame:
|
|
| 109 |
df = pd.read_csv(io.BytesIO(raw))
|
| 110 |
if "reward_rate" not in df.columns:
|
| 111 |
df["reward_rate"] = pd.NA
|
| 112 |
-
# Rank by
|
| 113 |
-
#
|
| 114 |
-
#
|
| 115 |
-
# so partial batches don't outrank complete ones with lower rates.
|
| 116 |
-
df["_corpus_size"] = df["dataset"].map(CORPUS_SIZE).fillna(df["total"])
|
| 117 |
-
# `pass_rate` in our CSV is the Stage-1 intercept rate (%) over attempted.
|
| 118 |
-
# Convert it to a fraction over the full corpus.
|
| 119 |
-
df["_intercepted_count"] = (df["pass_rate"].astype(float) / 100.0 * df["total"]).round().astype(int)
|
| 120 |
-
df["_corpus_intercepted"] = df["_intercepted_count"] / df["_corpus_size"]
|
| 121 |
-
df["_corpus_reward"] = df["passed"] / df["_corpus_size"]
|
| 122 |
df = df.sort_values(
|
| 123 |
-
["dataset", "
|
| 124 |
ascending=[True, False, False],
|
| 125 |
na_position="last",
|
| 126 |
).reset_index(drop=True)
|
| 127 |
df.insert(0, "rank", df.groupby("dataset").cumcount() + 1)
|
| 128 |
-
df = df.drop(columns=["_corpus_size", "_corpus_reward", "_intercepted_count", "_corpus_intercepted"])
|
| 129 |
df["pass_rate"] = df["pass_rate"].map(_format_pct)
|
| 130 |
df["reward_rate"] = df["reward_rate"].map(_format_pct)
|
| 131 |
df["wall_hours"] = df["wall_hours"].map(_format_wall)
|
|
|
|
| 33 |
[**π Paper**](https://arxiv.org/abs/2604.08523) · [**💻 GitHub**](https://github.com/reacher-z/ClawBench) · [**π Dataset**](https://huggingface.co/datasets/TIGER-Lab/ClawBench) · [**π Traces V1**](https://huggingface.co/datasets/NAIL-Group/ClawBenchV1Trace) · [**π Traces V2**](https://huggingface.co/datasets/TIGER-Lab/ClawBenchV2Trace) · [**π Site**](https://claw-bench.com)
|
| 34 |
"""
|
| 35 |
|
| 36 |
+
TABLE_INTRO = """**Intercepted** (sort key) = agent's final HTTP request matched the task's URL/method schema — Stage 1, deterministic, no judge. **Reward** = additionally requires the LLM judge (default `deepseek/deepseek-v4-pro`) to confirm the payload fulfilled the instruction — Stage 2. Rows are ranked by Intercepted DESC, then Reward DESC as tiebreak. V2 is **Hermes-only**; alternative harnesses are evaluated separately. *Partial* = batch attempted fewer than the full corpus (mid-run abort / queue cap); rates are over attempted, not over corpus."""
|
| 37 |
|
| 38 |
ABOUT = """## About ClawBench
|
| 39 |
|
|
|
|
| 109 |
df = pd.read_csv(io.BytesIO(raw))
|
| 110 |
if "reward_rate" not in df.columns:
|
| 111 |
df["reward_rate"] = pd.NA
|
| 112 |
+
# Rank by raw Intercepted (Stage 1 rate over attempted tasks) descending, then
|
| 113 |
+
# Reward as tiebreak. Visible-column order: what you see in the Intercepted
|
| 114 |
+
# column is what sorts. Partial batches keep their attempted-rate.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
df = df.sort_values(
|
| 116 |
+
["dataset", "pass_rate", "reward_rate"],
|
| 117 |
ascending=[True, False, False],
|
| 118 |
na_position="last",
|
| 119 |
).reset_index(drop=True)
|
| 120 |
df.insert(0, "rank", df.groupby("dataset").cumcount() + 1)
|
|
|
|
| 121 |
df["pass_rate"] = df["pass_rate"].map(_format_pct)
|
| 122 |
df["reward_rate"] = df["reward_rate"].map(_format_pct)
|
| 123 |
df["wall_hours"] = df["wall_hours"].map(_format_wall)
|