| | |
| | from __future__ import annotations |
| |
|
| | import argparse |
| | import json |
| | import re |
| | import subprocess |
| | from pathlib import Path |
| | from typing import Any |
| |
|
# Repository root: this script lives in <root>/scripts/, so go up one level.
ROOT = Path(__file__).resolve().parents[1]
# Default location of the fast-agent tool cards used for the coverage run.
DEFAULT_CARDS_DIR = ROOT / '.fast-agent' / 'tool-cards'
# Agent card name exercised by default.
DEFAULT_AGENT = 'hf_hub_community'
# Input prompt pack and default output report locations.
PROMPTS_FILE = ROOT / 'scripts' / 'hf_hub_community_coverage_prompts.json'
REPORT_MD = ROOT / 'docs' / 'hf_hub_community_coverage_report.md'
REPORT_JSON = ROOT / 'docs' / 'hf_hub_community_coverage_report.json'
| |
|
# Matches ANSI CSI escape sequences (e.g. terminal colors) so CLI output
# can be reduced to plain text before inspection.
ANSI_RE = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]")


def strip_ansi(text: str) -> str:
    """Return *text* with every ANSI CSI escape sequence removed."""
    cleaned = ANSI_RE.sub('', text)
    return cleaned
| |
|
| |
|
def load_cases(path: Path) -> list[dict[str, Any]]:
    """Load and normalize the coverage prompt cases from *path*.

    The file must contain a JSON list; non-dict rows are skipped. For each
    case, ``id`` is coerced to int, ``prompt`` to str,
    ``expected_endpoint_any`` entries to str (they are later compiled as
    regexes by ``run_case``), and ``expected_method_any`` entries are
    upper-cased, defaulting to ``['GET']``.

    Raises:
        ValueError: if the top-level JSON value is not a list.
        KeyError: if a dict row is missing ``id`` or ``prompt``.
    """
    rows = json.loads(path.read_text(encoding='utf-8'))
    if not isinstance(rows, list):
        raise ValueError('coverage prompts file must be a JSON list')
    out: list[dict[str, Any]] = []
    for row in rows:
        if not isinstance(row, dict):
            continue
        out.append({
            'id': int(row['id']),
            'prompt': str(row['prompt']),
            # Coerce each pattern to str so re.compile() downstream never
            # receives a non-string (original code passed items through as-is).
            'expected_endpoint_any': [str(x) for x in row.get('expected_endpoint_any', [])],
            'expected_method_any': [str(x).upper() for x in row.get('expected_method_any', ['GET'])],
        })
    return out
| |
|
| |
|
def _session_extract(result_path: Path) -> dict[str, Any]:
    """Parse a fast-agent ``--results`` session JSON file into coverage signals.

    Walks the message list and collects: endpoints/methods/names of tool
    calls, a count of tool calls, whether any tool result reported an error,
    a merged text transcript (assistant text, reasoning, tool-call args,
    tool-result text), and summed per-turn token usage.
    """
    data = json.loads(result_path.read_text(encoding='utf-8'))
    # Defensive: tolerate a non-dict top level by treating it as no messages.
    messages = data.get('messages', []) if isinstance(data, dict) else []

    endpoints: list[str] = []
    methods: list[str] = []
    tool_names: list[str] = []
    tool_calls_count = 0
    merged_parts: list[str] = []
    tool_error = False

    usage_input_tokens = 0
    usage_output_tokens = 0
    usage_total_tokens = 0

    for msg in messages:
        if not isinstance(msg, dict):
            continue

        if msg.get('role') == 'assistant':
            # Visible assistant text items join the merged transcript.
            for item in msg.get('content', []) or []:
                if isinstance(item, dict) and item.get('type') == 'text' and item.get('text'):
                    merged_parts.append(str(item['text']))

            # Reasoning-channel text also counts toward the transcript.
            channels = msg.get('channels') or {}
            for item in channels.get('reasoning', []) or []:
                if isinstance(item, dict) and item.get('text'):
                    merged_parts.append(str(item['text']))

            # Usage entries carry a JSON string payload; sum the per-turn
            # token counts. Malformed payloads are skipped (best-effort).
            for item in channels.get('fast-agent-usage', []) or []:
                if not isinstance(item, dict):
                    continue
                txt = item.get('text')
                if not isinstance(txt, str):
                    continue
                try:
                    payload = json.loads(txt)
                except Exception:
                    continue
                turn = payload.get('turn', {}) if isinstance(payload, dict) else {}
                if isinstance(turn, dict):
                    usage_input_tokens += int(turn.get('input_tokens') or 0)
                    usage_output_tokens += int(turn.get('output_tokens') or 0)
                    usage_total_tokens += int(turn.get('total_tokens') or 0)

            # Record each tool call's name and, when arguments are a dict,
            # its endpoint/method (method defaults to GET when absent/falsy).
            tool_calls = msg.get('tool_calls') or {}
            if isinstance(tool_calls, dict):
                tool_calls_count += len(tool_calls)
                for tc in tool_calls.values():
                    params = (tc or {}).get('params', {}) if isinstance(tc, dict) else {}
                    name = params.get('name') if isinstance(params, dict) else None
                    args = params.get('arguments', {}) if isinstance(params, dict) else {}
                    if isinstance(name, str):
                        tool_names.append(name)
                        merged_parts.append(f'tool call - {name}')
                    if isinstance(args, dict):
                        ep = args.get('endpoint')
                        if isinstance(ep, str):
                            endpoints.append(ep)
                        method = args.get('method')
                        methods.append(str(method).upper() if method else 'GET')
                        merged_parts.append(json.dumps(args, ensure_ascii=False))

        if msg.get('role') == 'user':
            # Tool results come back in user-role messages; flag errors via
            # the isError bit or the error marker text in the result body.
            tool_results = msg.get('tool_results') or {}
            if isinstance(tool_results, dict):
                for tr in tool_results.values():
                    if bool((tr or {}).get('isError')):
                        tool_error = True
                    for item in (tr or {}).get('content', []) or []:
                        if isinstance(item, dict) and item.get('type') == 'text' and item.get('text'):
                            text = str(item['text'])
                            merged_parts.append(text)
                            if 'Error executing tool' in text:
                                tool_error = True

    return {
        'endpoints': endpoints,
        'methods': methods,
        'tool_names': tool_names,
        'tool_calls_count': tool_calls_count,
        'tool_error': tool_error,
        'merged': '\n'.join(merged_parts).strip(),
        'usage': {
            'input_tokens': usage_input_tokens,
            'output_tokens': usage_output_tokens,
            'total_tokens': usage_total_tokens,
        },
    }
| |
|
| |
|
def run_case(
    case: dict[str, Any],
    timeout_sec: int,
    model: str,
    agent_cards: Path,
    agent: str,
    result_path: Path,
) -> dict[str, Any]:
    """Execute one coverage prompt via ``fast-agent go`` and score the session.

    Score out of 10: endpoint match x4, method match x2, tool use x2,
    clean process exit x1, non-trivial merged output x1.

    Raises:
        RuntimeError: if the ``--results`` file was not written.
        subprocess.TimeoutExpired: if the run exceeds *timeout_sec*.
    """
    prompt = case['prompt']
    result_path.parent.mkdir(parents=True, exist_ok=True)
    cmd = [
        'fast-agent', 'go',
        '--no-env',
        '--model', model,
        '--agent-cards', str(agent_cards),
        '--agent', agent,
        '--results', str(result_path),
        '-m', prompt,
    ]

    completed = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout_sec)
    stdout_text = strip_ansi(completed.stdout or '')
    stderr_text = strip_ansi(completed.stderr or '')

    if not result_path.exists():
        raise RuntimeError(f'Expected --results file not written: {result_path}')

    parsed = _session_extract(result_path)

    endpoint_patterns = [re.compile(p) for p in case.get('expected_endpoint_any', [])]
    expected_methods = [m.upper() for m in case.get('expected_method_any', ['GET'])]

    # With no expectation configured the endpoint check cannot pass.
    if endpoint_patterns:
        endpoint_ok = any(pat.search(ep) for pat in endpoint_patterns for ep in parsed['endpoints'])
    else:
        endpoint_ok = False
    # No observed methods counts as an implicit GET.
    if parsed['methods']:
        method_ok = any(m in expected_methods for m in parsed['methods'])
    else:
        method_ok = 'GET' in expected_methods
    tool_ok = 'hf_api_request' in parsed['tool_names']
    success = completed.returncode == 0 and 'Traceback' not in (stdout_text + '\n' + stderr_text)
    clarity = len(parsed['merged']) > 20

    score = 4 * int(endpoint_ok) + 2 * int(method_ok) + 2 * int(tool_ok) + int(success) + int(clarity)

    return {
        'id': case['id'],
        'prompt': prompt,
        'returncode': completed.returncode,
        'result_file': str(result_path),
        'observed': {
            'endpoints': parsed['endpoints'],
            'methods': parsed['methods'],
            'tool_names': parsed['tool_names'],
            'tool_calls_count': parsed['tool_calls_count'],
            'tool_error': parsed['tool_error'],
            'usage': parsed['usage'],
        },
        'expected': {
            'endpoint_any': case.get('expected_endpoint_any', []),
            'method_any': expected_methods,
        },
        'eval': {
            'endpoint_ok': endpoint_ok,
            'method_ok': method_ok,
            'tool_ok': tool_ok,
            'success': success,
            'clarity': clarity,
            'score_total': score,
        },
        'merged': parsed['merged'],
    }
| |
|
| |
|
def summarize(rows: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate per-case result rows into overall coverage metrics.

    Returns ``{'n_cases': 0}`` for an empty input; otherwise rates rounded
    to 4 places and averages rounded to 3 (tokens: 1) places.
    """
    count = len(rows)
    if not rows:
        return {'n_cases': 0}

    def _rate(flag) -> float:
        # Fraction of rows for which *flag* is truthy, rounded to 4 places.
        return round(sum(1 for row in rows if flag(row)) / count, 4)

    score_sum = 0
    call_sum = 0
    token_sum = 0
    for row in rows:
        score_sum += row['eval']['score_total']
        call_sum += row['observed']['tool_calls_count']
        token_sum += int(row['observed']['usage'].get('total_tokens') or 0)

    return {
        'n_cases': count,
        'endpoint_match_rate': _rate(lambda r: r['eval']['endpoint_ok']),
        'method_match_rate': _rate(lambda r: r['eval']['method_ok']),
        'tool_use_rate': _rate(lambda r: r['eval']['tool_ok']),
        'success_rate': _rate(lambda r: r['eval']['success']),
        'tool_error_rate': _rate(lambda r: r['observed']['tool_error']),
        'avg_score_total': round(score_sum / count, 3),
        'avg_tool_calls': round(call_sum / count, 3),
        'avg_total_tokens': round(token_sum / count, 1),
    }
| |
|
| |
|
def render_markdown(rows: list[dict[str, Any]], summary: dict[str, Any], model: str, agent: str) -> str:
    """Render the run summary plus a per-case results table as Markdown."""
    lines = [
        '# HF Hub Community Coverage Report',
        '',
        f'- Model: `{model}`',
        f'- Agent: `{agent}`',
        '',
        '## Summary',
        '',
    ]

    summary_bullets = [
        ('Cases', summary.get('n_cases', 0)),
        ('Endpoint match rate', summary.get('endpoint_match_rate')),
        ('Method match rate', summary.get('method_match_rate')),
        ('Tool use rate', summary.get('tool_use_rate')),
        ('Success rate', summary.get('success_rate')),
        ('Tool error rate', summary.get('tool_error_rate')),
        ('Avg score (/10)', summary.get('avg_score_total')),
        ('Avg tool calls', summary.get('avg_tool_calls')),
        ('Avg total tokens', summary.get('avg_total_tokens')),
    ]
    lines.extend(f'- {label}: **{value}**' for label, value in summary_bullets)

    lines.append('')
    lines.append('| # | Score | Endpoint OK | Method OK | Calls | Tokens | Prompt |')
    lines.append('|---|------:|------------:|----------:|------:|-------:|--------|')

    for row in rows:
        cells = [
            str(row['id']),
            str(row['eval']['score_total']),
            str(int(row['eval']['endpoint_ok'])),
            str(int(row['eval']['method_ok'])),
            str(row['observed']['tool_calls_count']),
            str(int(row['observed']['usage'].get('total_tokens') or 0)),
            # Truncate long prompts and escape pipes so the table stays valid.
            row['prompt'][:72].replace('|', '/'),
        ]
        lines.append('| ' + ' | '.join(cells) + ' |')

    return '\n'.join(lines) + '\n'
| |
|
| |
|
def main() -> None:
    """CLI entry point: run every coverage case, then write JSON + Markdown reports."""
    parser = argparse.ArgumentParser(description='Run endpoint-coverage pack for hf_hub_community')
    parser.add_argument('--model', default='gpt-oss')
    parser.add_argument('--agent', default=DEFAULT_AGENT)
    parser.add_argument('--agent-cards', type=Path, default=DEFAULT_CARDS_DIR)
    parser.add_argument('--cases', type=Path, default=PROMPTS_FILE)
    parser.add_argument('--timeout', type=int, default=240)
    parser.add_argument('--raw-results-dir', type=Path, default=ROOT / 'docs' / 'hf_hub_community_coverage_results')
    parser.add_argument('--json-out', type=Path, default=REPORT_JSON)
    parser.add_argument('--md-out', type=Path, default=REPORT_MD)
    args = parser.parse_args()

    rows: list[dict[str, Any]] = []
    for case in load_cases(args.cases):
        # One raw session file per case, tagged with agent/model.
        model_tag = args.model.replace('/', '_')
        result_file = args.raw_results_dir / f"coverage_{args.agent}_{model_tag}_case_{case['id']:02d}.json"
        row = run_case(
            case=case,
            timeout_sec=args.timeout,
            model=args.model,
            agent_cards=args.agent_cards,
            agent=args.agent,
            result_path=result_file,
        )
        rows.append(row)
        # Per-case progress line for interactive runs.
        print(f"[{case['id']}] score={row['eval']['score_total']}/10 endpoint_ok={row['eval']['endpoint_ok']} method_ok={row['eval']['method_ok']}")

    summary = summarize(rows)
    payload = {
        'summary': summary,
        'rows': rows,
    }

    for target in (args.json_out, args.md_out):
        target.parent.mkdir(parents=True, exist_ok=True)
    args.json_out.write_text(json.dumps(payload, indent=2), encoding='utf-8')
    args.md_out.write_text(render_markdown(rows, summary, model=args.model, agent=args.agent), encoding='utf-8')

    print(f"\nWrote:\n- {args.json_out}\n- {args.md_out}")
| |
|
| |
|
# Script entry point: run the full coverage pack when executed directly.
if __name__ == '__main__':
    main()
| |
|