AgPerry committed on
Commit
a036d16
·
verified ·
1 Parent(s): 50a75ee

V2: switch to raw Intercepted DESC sort + Hermes-only filter (visible-column order)

Browse files
Files changed (1) hide show
  1. app.py +5 -13
app.py CHANGED
@@ -33,7 +33,7 @@ INTRO = """# 🏆 ClawBench — Web Agent Benchmark
33
  [**📖 Paper**](https://arxiv.org/abs/2604.08523) · [**💻 GitHub**](https://github.com/reacher-z/ClawBench) · [**🗂 Dataset**](https://huggingface.co/datasets/TIGER-Lab/ClawBench) · [**🎞 Traces V1**](https://huggingface.co/datasets/NAIL-Group/ClawBenchV1Trace) · [**🎞 Traces V2**](https://huggingface.co/datasets/TIGER-Lab/ClawBenchV2Trace) · [**🌐 Site**](https://claw-bench.com)
34
  """
35
 
36
- TABLE_INTRO = """**Intercepted** (sort key) = agent's final HTTP request matched the task's URL/method schema — Stage 1, deterministic, no judge. **Reward** = additionally requires the LLM judge (default `deepseek/deepseek-v4-pro`) to confirm the payload fulfilled the instruction — Stage 2. Rows are ranked by Intercepted (corpus-normalized: `intercepted / 130` for V2 so partials don't outrank complete batches) with Reward as tiebreak. `—` = no Stage-2 data yet."""
37
 
38
  ABOUT = """## About ClawBench
39
 
@@ -109,23 +109,15 @@ def load_results() -> pd.DataFrame:
109
  df = pd.read_csv(io.BytesIO(raw))
110
  if "reward_rate" not in df.columns:
111
  df["reward_rate"] = pd.NA
112
- # Rank by corpus interception rate (intercepted_count / full_corpus_size) as
113
- # the headline metric β€” Stage 1 is deterministic (URL/method match) and
114
- # universally comparable. Tiebreak by corpus reward (passed / corpus_size)
115
- # so partial batches don't outrank complete ones with lower rates.
116
- df["_corpus_size"] = df["dataset"].map(CORPUS_SIZE).fillna(df["total"])
117
- # `pass_rate` in our CSV is the Stage-1 intercept rate (%) over attempted.
118
- # Convert it to a fraction over the full corpus.
119
- df["_intercepted_count"] = (df["pass_rate"].astype(float) / 100.0 * df["total"]).round().astype(int)
120
- df["_corpus_intercepted"] = df["_intercepted_count"] / df["_corpus_size"]
121
- df["_corpus_reward"] = df["passed"] / df["_corpus_size"]
122
  df = df.sort_values(
123
- ["dataset", "_corpus_intercepted", "_corpus_reward"],
124
  ascending=[True, False, False],
125
  na_position="last",
126
  ).reset_index(drop=True)
127
  df.insert(0, "rank", df.groupby("dataset").cumcount() + 1)
128
- df = df.drop(columns=["_corpus_size", "_corpus_reward", "_intercepted_count", "_corpus_intercepted"])
129
  df["pass_rate"] = df["pass_rate"].map(_format_pct)
130
  df["reward_rate"] = df["reward_rate"].map(_format_pct)
131
  df["wall_hours"] = df["wall_hours"].map(_format_wall)
 
33
  [**📖 Paper**](https://arxiv.org/abs/2604.08523) · [**💻 GitHub**](https://github.com/reacher-z/ClawBench) · [**🗂 Dataset**](https://huggingface.co/datasets/TIGER-Lab/ClawBench) · [**🎞 Traces V1**](https://huggingface.co/datasets/NAIL-Group/ClawBenchV1Trace) · [**🎞 Traces V2**](https://huggingface.co/datasets/TIGER-Lab/ClawBenchV2Trace) · [**🌐 Site**](https://claw-bench.com)
34
  """
35
 
36
+ TABLE_INTRO = """**Intercepted** (sort key) = agent's final HTTP request matched the task's URL/method schema — Stage 1, deterministic, no judge. **Reward** = additionally requires the LLM judge (default `deepseek/deepseek-v4-pro`) to confirm the payload fulfilled the instruction — Stage 2. Rows are ranked by Intercepted DESC, then Reward DESC as tiebreak. V2 is **Hermes-only**; alternative harnesses are evaluated separately. *Partial* = batch attempted fewer than the full corpus (mid-run abort / queue cap); rates are over attempted, not over corpus."""
37
 
38
  ABOUT = """## About ClawBench
39
 
 
109
  df = pd.read_csv(io.BytesIO(raw))
110
  if "reward_rate" not in df.columns:
111
  df["reward_rate"] = pd.NA
112
+ # Rank by raw Intercepted (Stage 1 rate over attempted tasks) descending, then
113
+ # Reward as tiebreak. Visible-column order: what you see in the Intercepted
114
+ # column is what sorts. Partial batches keep their attempted-rate.
 
 
 
 
 
 
 
115
  df = df.sort_values(
116
+ ["dataset", "pass_rate", "reward_rate"],
117
  ascending=[True, False, False],
118
  na_position="last",
119
  ).reset_index(drop=True)
120
  df.insert(0, "rank", df.groupby("dataset").cumcount() + 1)
 
121
  df["pass_rate"] = df["pass_rate"].map(_format_pct)
122
  df["reward_rate"] = df["reward_rate"].map(_format_pct)
123
  df["wall_hours"] = df["wall_hours"].map(_format_wall)