# hf-papers / scripts / score_hf_hub_community_coverage.py
# Synced by evalstate (HF Staff): "sync: promote hf_hub_community prompt v3
# + add prompt/coverage harness" (commit bba4fab, verified).
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import re
import subprocess
from pathlib import Path
from typing import Any
# Repository root: this script lives in <root>/scripts/, so parents[1] is <root>.
ROOT = Path(__file__).resolve().parents[1]
# Default fast-agent tool-card directory and agent name under test.
DEFAULT_CARDS_DIR = ROOT / '.fast-agent' / 'tool-cards'
DEFAULT_AGENT = 'hf_hub_community'
# Input prompt pack and default report output locations.
PROMPTS_FILE = ROOT / 'scripts' / 'hf_hub_community_coverage_prompts.json'
REPORT_MD = ROOT / 'docs' / 'hf_hub_community_coverage_report.md'
REPORT_JSON = ROOT / 'docs' / 'hf_hub_community_coverage_report.json'
# Matches ANSI CSI escape sequences (ESC [ ... final-byte) in CLI output.
ANSI_RE = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]")
def strip_ansi(text: str) -> str:
    """Return *text* with every ANSI CSI escape sequence removed."""
    cleaned, _n_removed = ANSI_RE.subn('', text)
    return cleaned
def load_cases(path: Path) -> list[dict[str, Any]]:
    """Load and normalize coverage cases from a JSON file.

    The file must contain a JSON list; non-dict entries are skipped.
    Each returned case has an int ``id``, a str ``prompt``, a list of
    endpoint regex strings ``expected_endpoint_any``, and an upper-cased
    method list ``expected_method_any`` (defaulting to ``['GET']``).

    Raises:
        ValueError: if the file's top-level value is not a list.
        KeyError / ValueError: if a case row lacks 'id'/'prompt' or has
            a non-integer 'id'.
    """
    rows = json.loads(path.read_text(encoding='utf-8'))
    if not isinstance(rows, list):
        raise ValueError('coverage prompts file must be a JSON list')
    out: list[dict[str, Any]] = []
    for row in rows:
        if not isinstance(row, dict):
            continue
        out.append({
            'id': int(row['id']),
            'prompt': str(row['prompt']),
            # Coerce each pattern to str so the re.compile() done later in
            # run_case never receives a non-string (e.g. a bare number
            # typed into the JSON), which would raise a confusing TypeError.
            'expected_endpoint_any': [str(p) for p in row.get('expected_endpoint_any', [])],
            'expected_method_any': [str(x).upper() for x in row.get('expected_method_any', ['GET'])],
        })
    return out
def _session_extract(result_path: Path) -> dict[str, Any]:
    """Parse a fast-agent ``--results`` session JSON file into scoring signals.

    Walks the session's ``messages`` list and collects:
    observed tool-call endpoints/methods/names, a tool-call count, an
    error flag, a merged text transcript, and summed token usage.

    NOTE(review): the message schema (``channels``, ``tool_calls``,
    ``tool_results`` layouts) is assumed from the fast-agent session
    format — not visible in this file; confirm against fast-agent docs.
    """
    data = json.loads(result_path.read_text(encoding='utf-8'))
    # Tolerate a malformed top-level payload by treating it as no messages.
    messages = data.get('messages', []) if isinstance(data, dict) else []
    endpoints: list[str] = []       # 'endpoint' argument of each tool call
    methods: list[str] = []         # HTTP method per tool call (default 'GET')
    tool_names: list[str] = []      # tool names invoked
    tool_calls_count = 0
    merged_parts: list[str] = []    # transcript pieces, joined at the end
    tool_error = False
    usage_input_tokens = 0
    usage_output_tokens = 0
    usage_total_tokens = 0
    for msg in messages:
        if not isinstance(msg, dict):
            continue
        if msg.get('role') == 'assistant':
            # Plain assistant text parts.
            for item in msg.get('content', []) or []:
                if isinstance(item, dict) and item.get('type') == 'text' and item.get('text'):
                    merged_parts.append(str(item['text']))
            channels = msg.get('channels') or {}
            # Reasoning channel text also counts toward the transcript.
            for item in channels.get('reasoning', []) or []:
                if isinstance(item, dict) and item.get('text'):
                    merged_parts.append(str(item['text']))
            # Usage channel entries carry JSON-encoded per-turn token counts.
            for item in channels.get('fast-agent-usage', []) or []:
                if not isinstance(item, dict):
                    continue
                txt = item.get('text')
                if not isinstance(txt, str):
                    continue
                try:
                    payload = json.loads(txt)
                except Exception:
                    # Unparseable usage entries are silently skipped (best-effort).
                    continue
                turn = payload.get('turn', {}) if isinstance(payload, dict) else {}
                if isinstance(turn, dict):
                    usage_input_tokens += int(turn.get('input_tokens') or 0)
                    usage_output_tokens += int(turn.get('output_tokens') or 0)
                    usage_total_tokens += int(turn.get('total_tokens') or 0)
        # Tool calls: a dict keyed by call id; each value holds 'params'
        # with the tool 'name' and its 'arguments'.
        # NOTE(review): placed at message level (any role) — confirm whether
        # the session format only ever attaches tool_calls to assistant turns.
        tool_calls = msg.get('tool_calls') or {}
        if isinstance(tool_calls, dict):
            tool_calls_count += len(tool_calls)
            for tc in tool_calls.values():
                params = (tc or {}).get('params', {}) if isinstance(tc, dict) else {}
                name = params.get('name') if isinstance(params, dict) else None
                args = params.get('arguments', {}) if isinstance(params, dict) else {}
                if isinstance(name, str):
                    tool_names.append(name)
                    merged_parts.append(f'tool call - {name}')
                if isinstance(args, dict):
                    ep = args.get('endpoint')
                    if isinstance(ep, str):
                        endpoints.append(ep)
                    # A method entry is recorded for every dict-args call,
                    # defaulting to 'GET' when absent/falsy.
                    method = args.get('method')
                    methods.append(str(method).upper() if method else 'GET')
                    merged_parts.append(json.dumps(args, ensure_ascii=False))
        if msg.get('role') == 'user':
            # User messages carry tool results; detect errors via the
            # isError flag or the CLI's error text marker.
            tool_results = msg.get('tool_results') or {}
            if isinstance(tool_results, dict):
                for tr in tool_results.values():
                    if bool((tr or {}).get('isError')):
                        tool_error = True
                    for item in (tr or {}).get('content', []) or []:
                        if isinstance(item, dict) and item.get('type') == 'text' and item.get('text'):
                            text = str(item['text'])
                            merged_parts.append(text)
                            if 'Error executing tool' in text:
                                tool_error = True
    return {
        'endpoints': endpoints,
        'methods': methods,
        'tool_names': tool_names,
        'tool_calls_count': tool_calls_count,
        'tool_error': tool_error,
        'merged': '\n'.join(merged_parts).strip(),
        'usage': {
            'input_tokens': usage_input_tokens,
            'output_tokens': usage_output_tokens,
            'total_tokens': usage_total_tokens,
        },
    }
def run_case(
    case: dict[str, Any],
    timeout_sec: int,
    model: str,
    agent_cards: Path,
    agent: str,
    result_path: Path,
) -> dict[str, Any]:
    """Run one coverage prompt through the fast-agent CLI and score it.

    Invokes ``fast-agent go`` with the case's prompt, parses the written
    ``--results`` session file, and evaluates endpoint/method/tool usage
    against the case's expectations. Returns a row dict with observed
    signals, expectations, per-criterion booleans, and a /10 score.

    Raises:
        RuntimeError: if the CLI did not write the results file.
        subprocess.TimeoutExpired: if the CLI exceeds ``timeout_sec``.
    """
    user_prompt = case['prompt']
    result_path.parent.mkdir(parents=True, exist_ok=True)
    argv = [
        'fast-agent', 'go',
        '--no-env',
        '--model', model,
        '--agent-cards', str(agent_cards),
        '--agent', agent,
        '--results', str(result_path),
        '-m', user_prompt,
    ]
    completed = subprocess.run(argv, capture_output=True, text=True, timeout=timeout_sec)
    stdout_clean = strip_ansi(completed.stdout or '')
    stderr_clean = strip_ansi(completed.stderr or '')
    if not result_path.exists():
        raise RuntimeError(f'Expected --results file not written: {result_path}')
    session = _session_extract(result_path)

    endpoint_patterns = [re.compile(p) for p in case.get('expected_endpoint_any', [])]
    allowed_methods = [m.upper() for m in case.get('expected_method_any', ['GET'])]

    # Endpoint matches when any expected pattern hits any observed endpoint;
    # with no expectations the criterion is simply unmet.
    endpoint_ok = False
    for pattern in endpoint_patterns:
        if any(pattern.search(ep) for ep in session['endpoints']):
            endpoint_ok = True
            break

    # With no observed methods, treat the implicit default GET as observed.
    if session['methods']:
        method_ok = any(m in allowed_methods for m in session['methods'])
    else:
        method_ok = 'GET' in allowed_methods

    tool_ok = 'hf_api_request' in session['tool_names']
    combined_output = stdout_clean + '\n' + stderr_clean
    success = completed.returncode == 0 and 'Traceback' not in combined_output
    clarity = len(session['merged']) > 20
    # Weighted total out of 10: endpoint 4, method 2, tool 2, success 1, clarity 1.
    score = 4 * int(endpoint_ok) + 2 * int(method_ok) + 2 * int(tool_ok) + int(success) + int(clarity)

    return {
        'id': case['id'],
        'prompt': user_prompt,
        'returncode': completed.returncode,
        'result_file': str(result_path),
        'observed': {
            'endpoints': session['endpoints'],
            'methods': session['methods'],
            'tool_names': session['tool_names'],
            'tool_calls_count': session['tool_calls_count'],
            'tool_error': session['tool_error'],
            'usage': session['usage'],
        },
        'expected': {
            'endpoint_any': case.get('expected_endpoint_any', []),
            'method_any': allowed_methods,
        },
        'eval': {
            'endpoint_ok': endpoint_ok,
            'method_ok': method_ok,
            'tool_ok': tool_ok,
            'success': success,
            'clarity': clarity,
            'score_total': score,
        },
        'merged': session['merged'],
    }
def summarize(rows: list[dict[str, Any]]) -> dict[str, Any]:
    """Roll per-case result rows up into aggregate coverage metrics.

    Returns ``{'n_cases': 0}`` for empty input; otherwise rounded match
    rates, error rate, and averages for score, tool calls, and tokens.
    """
    count = len(rows)
    if count == 0:
        return {'n_cases': 0}

    def share(flag) -> float:
        # Fraction of rows for which the predicate is truthy.
        return sum(1 for r in rows if flag(r)) / count

    def mean(value) -> float:
        # Arithmetic mean of a numeric field across rows.
        return sum(value(r) for r in rows) / count

    return {
        'n_cases': count,
        'endpoint_match_rate': round(share(lambda r: r['eval']['endpoint_ok']), 4),
        'method_match_rate': round(share(lambda r: r['eval']['method_ok']), 4),
        'tool_use_rate': round(share(lambda r: r['eval']['tool_ok']), 4),
        'success_rate': round(share(lambda r: r['eval']['success']), 4),
        'tool_error_rate': round(share(lambda r: r['observed']['tool_error']), 4),
        'avg_score_total': round(mean(lambda r: r['eval']['score_total']), 3),
        'avg_tool_calls': round(mean(lambda r: r['observed']['tool_calls_count']), 3),
        'avg_total_tokens': round(mean(lambda r: int(r['observed']['usage'].get('total_tokens') or 0)), 1),
    }
def render_markdown(rows: list[dict[str, Any]], summary: dict[str, Any], model: str, agent: str) -> str:
    """Render the coverage run as a Markdown report.

    Emits a summary bullet list followed by a per-case table; pipe
    characters in prompts are replaced with '/' so rows stay valid
    Markdown table cells. Returns the document with a trailing newline.
    """
    lines: list[str] = []
    lines.append('# HF Hub Community Coverage Report')
    lines.append('')
    lines.append(f'- Model: `{model}`')
    lines.append(f'- Agent: `{agent}`')
    lines.append('')
    lines.append('## Summary')
    lines.append('')
    lines.append(f"- Cases: **{summary.get('n_cases', 0)}**")
    # Remaining summary bullets share one label/key rendering pattern.
    for label, key in (
        ('Endpoint match rate', 'endpoint_match_rate'),
        ('Method match rate', 'method_match_rate'),
        ('Tool use rate', 'tool_use_rate'),
        ('Success rate', 'success_rate'),
        ('Tool error rate', 'tool_error_rate'),
        ('Avg score (/10)', 'avg_score_total'),
        ('Avg tool calls', 'avg_tool_calls'),
        ('Avg total tokens', 'avg_total_tokens'),
    ):
        lines.append(f'- {label}: **{summary.get(key)}**')
    lines.append('')
    lines.append('| # | Score | Endpoint OK | Method OK | Calls | Tokens | Prompt |')
    lines.append('|---|------:|------------:|----------:|------:|-------:|--------|')
    for row in rows:
        ev = row['eval']
        obs = row['observed']
        token_total = int(obs['usage'].get('total_tokens') or 0)
        # Truncate long prompts and neutralize '|' so the table stays intact.
        prompt_cell = row['prompt'][:72].replace('|', '/')
        lines.append(
            f"| {row['id']} | {ev['score_total']} | {int(ev['endpoint_ok'])} | {int(ev['method_ok'])} "
            f"| {obs['tool_calls_count']} | {token_total} | {prompt_cell} |"
        )
    return '\n'.join(lines) + '\n'
def main() -> None:
    """CLI entry point: run every coverage case and write JSON + Markdown reports.

    Runs each prompt from the cases file through ``run_case`` (one
    fast-agent subprocess per case), prints a per-case score line, then
    writes the aggregate summary and rows to ``--json-out`` / ``--md-out``.
    """
    ap = argparse.ArgumentParser(description='Run endpoint-coverage pack for hf_hub_community')
    ap.add_argument('--model', default='gpt-oss')
    ap.add_argument('--agent', default=DEFAULT_AGENT)
    ap.add_argument('--agent-cards', type=Path, default=DEFAULT_CARDS_DIR)
    ap.add_argument('--cases', type=Path, default=PROMPTS_FILE)
    # Per-case subprocess timeout, in seconds.
    ap.add_argument('--timeout', type=int, default=240)
    ap.add_argument('--raw-results-dir', type=Path, default=ROOT / 'docs' / 'hf_hub_community_coverage_results')
    ap.add_argument('--json-out', type=Path, default=REPORT_JSON)
    ap.add_argument('--md-out', type=Path, default=REPORT_MD)
    args = ap.parse_args()
    cases = load_cases(args.cases)
    rows: list[dict[str, Any]] = []
    for case in cases:
        # One raw session file per case; '/' in model names is sanitized
        # so the whole string stays a single filename component.
        result_file = args.raw_results_dir / f"coverage_{args.agent}_{args.model.replace('/', '_')}_case_{case['id']:02d}.json"
        row = run_case(
            case=case,
            timeout_sec=args.timeout,
            model=args.model,
            agent_cards=args.agent_cards,
            agent=args.agent,
            result_path=result_file,
        )
        rows.append(row)
        print(f"[{case['id']}] score={row['eval']['score_total']}/10 endpoint_ok={row['eval']['endpoint_ok']} method_ok={row['eval']['method_ok']}")
    summary = summarize(rows)
    payload = {
        'summary': summary,
        'rows': rows,
    }
    args.json_out.parent.mkdir(parents=True, exist_ok=True)
    args.md_out.parent.mkdir(parents=True, exist_ok=True)
    args.json_out.write_text(json.dumps(payload, indent=2), encoding='utf-8')
    args.md_out.write_text(render_markdown(rows, summary, model=args.model, agent=args.agent), encoding='utf-8')
    print(f"\nWrote:\n- {args.json_out}\n- {args.md_out}")


if __name__ == '__main__':
    main()