| |
| """ |
Generate the FunctionGemma evaluation benchmark.

Creates 100 samples to assess function-calling accuracy across:
- SEARCH_TOKEN calls
- EXECUTE_SWAP calls
- Incomplete requests (the model should ask for the missing details)
- Irrelevant requests (the model should not call any function)

The hand-written cases are padded with repeated extra cases to reach 100 samples.
| """ |
|
|
| import json |
| import random |
| import argparse |
| from pathlib import Path |
| from typing import Dict, List, Any, Optional |
|
|
# Absolute, symlink-resolved location of this script.
_SCRIPT_PATH = Path(__file__).resolve()

# Two directory levels above this script.
PROJECT_ROOT = _SCRIPT_PATH.parent.parent

# Default file the generated benchmark is written to.
DEFAULT_BENCHMARK_PATH = PROJECT_ROOT.joinpath("data", "benchmark_dataset.json")
|
|
| |
# (symbol, contract address) pairs; every entry is registered on the "solana" chain.
_TOKEN_ADDRESSES = (
    ("SOL", "So11111111111111111111111111111111111111112"),
    ("USDC", "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v"),
    ("JUP", "JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN"),
    ("RAY", "4k3Dyjzvzp8eMZWUXbBCjEvwSkkk59S5iCNLY3QrkX6R"),
    ("BONK", "DezXAZ8z7PnrnRJjz3wXBoRgixCa6xjnB7YaB1pPB263"),
    ("WIF", "EKpQGSJtjMFqKZ9KQanSqYXRcF8fBopzLHYxdM65zcjm"),
    ("ETH", "7vfCXTUXx5WJV5JADk17DUJ4ksgau7utNKj4b963voxs"),
    ("BTC", "9n4nbM75f5Ui33ZbPYXn59EwSgE8CGsHtAeTH5YFeJ9E"),
    ("POPCAT", "7GCihgDB8fe6KNjn2MYtkzZcRjQy3t9GHdC8uHYmW2hr"),
    ("TRUMP", "6p6xgHyF7AeE6TZkSmFsko444wqoP15icUSqi2jfGiPN"),
)

# Symbol -> {"ca": contract address, "chain": chain name}, in the order listed above.
TOKENS = {
    symbol: {"ca": address, "chain": "solana"}
    for symbol, address in _TOKEN_ADDRESSES
}
|
|
# Chain identifiers offered in the SEARCH_TOKEN `chain` enum below.
CHAINS = ["solana", "ethereum", "bsc", "base"]


# Function-calling tool schemas attached to every benchmark sample.
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "SEARCH_TOKEN",
            "description": "search token onchain",
            "parameters": {
                "type": "object",
                "properties": {
                    "symbol": {"type": ["string", "null"], "description": "Symbol of the token"},
                    "address": {"type": ["string", "null"], "description": "Contract address of the token"},
                    # Derived from CHAINS (copied, so the schema owns its own list)
                    # instead of repeating the literal and risking drift.
                    "chain": {"type": "string", "enum": list(CHAINS), "description": "supported chains"},
                    "keyword": {"type": ["string", "null"], "description": "keyword to search for the token"},
                },
                "required": [],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "EXECUTE_SWAP",
            "description": "Swap tokens on the Solana blockchain. When the user specifies 'buy <token>', the default input token is SOL. When the user specifies 'sell <token>', the default output token is SOL.",
            "parameters": {
                "type": "object",
                "properties": {
                    "inputTokenSymbol": {"type": ["string", "null"], "description": "Symbol of the token to sell."},
                    "inputTokenCA": {"type": ["string", "null"], "description": "Contract address of the token to sell."},
                    "outputTokenCA": {"type": ["string", "null"], "description": "Contract address of the token to buy."},
                    "inputTokenAmount": {"type": ["string", "null"], "description": "Exact amount of the input token to swap."},
                    "inputTokenPercentage": {"type": ["number", "null"], "description": "Percentage of the input token balance to swap."},
                    "outputTokenAmount": {"type": ["string", "null"], "description": "Expected amount of the output token to receive."},
                },
                "required": ["inputTokenCA", "outputTokenCA", "inputTokenAmount", "inputTokenPercentage"],
            },
        },
    },
]
|
|
|
|
def create_benchmark_item(
    user_input: str,
    expected_function: Optional[str],
    expected_args: Optional[Dict] = None,
    category: str = "function_call",
    description: str = ""
) -> Dict:
    """Assemble a single benchmark record.

    The record holds a two-message conversation (a fixed developer prompt
    followed by ``user_input``), the module-level ``TOOLS`` schemas, and the
    expected function name and arguments. ``id`` is left as ``None``.
    """
    conversation = [
        {"role": "developer", "content": "You are a model that can do function calling with the following functions"},
        {"role": "user", "content": user_input},
    ]
    model_input = {"messages": conversation, "tools": TOOLS}
    expectation = {"function_name": expected_function, "arguments": expected_args}

    return dict(
        id=None,
        category=category,
        description=description,
        input=model_input,
        expected=expectation,
    )
|
|
|
|
def generate_search_token_benchmarks() -> List[Dict]:
    """Build the SEARCH_TOKEN samples.

    Covers, in order: symbol lookups in English, symbol lookups in Chinese,
    address lookups for the first five entries of ``TOKENS``, and keyword
    lookups.
    """
    results = []

    # English symbol lookups: (prompt, symbol, chain, address, keyword).
    english_symbol_cases = [
        ("Search for BONK token", "BONK", "solana", None, None),
        ("Find WIF on solana", "WIF", "solana", None, None),
        ("Look up JUP token", "JUP", "solana", None, None),
        ("Search ETH on ethereum", "ETH", "ethereum", None, None),
        ("Find USDC token on base", "USDC", "base", None, None),
    ]
    for prompt, symbol, chain, address, keyword in english_symbol_cases:
        arguments = {"symbol": symbol, "chain": chain}
        # Optional fields are only added when a case supplies a value.
        for field, value in (("address", address), ("keyword", keyword)):
            if value:
                arguments[field] = value
        results.append(create_benchmark_item(
            prompt, "SEARCH_TOKEN", arguments,
            "search_by_symbol", f"Search {symbol} by symbol",
        ))

    # Chinese symbol lookups: (prompt, symbol, chain).
    chinese_symbol_cases = [
        ("帮我搜索 BONK 代币", "BONK", "solana"),
        ("查一下 WIF 这个币", "WIF", "solana"),
        ("找一下 JUP 代币信息", "JUP", "solana"),
        ("搜索 RAY 代币", "RAY", "solana"),
        ("查询 POPCAT 代币", "POPCAT", "solana"),
    ]
    results.extend(
        create_benchmark_item(
            prompt, "SEARCH_TOKEN", {"symbol": symbol, "chain": chain},
            "search_by_symbol_cn", f"Search {symbol} by symbol (Chinese)",
        )
        for prompt, symbol, chain in chinese_symbol_cases
    )

    # Address lookups for the first five registered tokens.
    for name in list(TOKENS)[:5]:
        entry = TOKENS[name]
        contract = entry["ca"]
        results.append(create_benchmark_item(
            f"Search token at address {contract}", "SEARCH_TOKEN",
            {"address": contract, "chain": entry["chain"]},
            "search_by_address", f"Search {name} by address",
        ))

    # Keyword lookups: (prompt, keyword, chain).
    keyword_cases = [
        ("Search for dog themed tokens", "dog", "solana"),
        ("Find meme coins", "meme", "solana"),
        ("Look for cat tokens on base", "cat", "base"),
    ]
    results.extend(
        create_benchmark_item(
            prompt, "SEARCH_TOKEN", {"keyword": term, "chain": chain},
            "search_by_keyword", f"Search by keyword: {term}",
        )
        for prompt, term, chain in keyword_cases
    )

    return results
|
|
|
|
def _whole_percent(fraction: float) -> int:
    """Convert a 0-1 fraction to a whole percent, rounding instead of truncating float error."""
    return round(fraction * 100)


def _swap_item(
    query: str,
    input_token: str,
    output_token: str,
    amount: Optional[str],
    percentage: Optional[float],
    category: str,
    description: str,
) -> Dict:
    """Build one EXECUTE_SWAP sample, resolving both token symbols to contract addresses."""
    return create_benchmark_item(
        query, "EXECUTE_SWAP",
        {
            "inputTokenCA": TOKENS[input_token]["ca"],
            "outputTokenCA": TOKENS[output_token]["ca"],
            "inputTokenAmount": amount,
            "inputTokenPercentage": percentage,
        },
        category, description,
    )


def generate_execute_swap_benchmarks() -> List[Dict]:
    """Generate EXECUTE_SWAP cases.

    Every case tuple is ``(query, input_token, output_token, amount,
    percentage)``. A case carries either a string amount or a fractional
    percentage (0-1); the other field is ``None``. Groups are emitted in
    this order: buy by amount, buy by percentage, sell by amount, sell by
    percentage, Chinese prompts, token-to-token swaps.

    Returns:
        The list of benchmark samples, one per case.

    Raises:
        KeyError: if a case names a symbol that is not in ``TOKENS``.
    """
    benchmarks = []

    buy_cases = [
        ("Buy 1 SOL worth of BONK", "SOL", "BONK", "1", None),
        ("Purchase 5 SOL of WIF", "SOL", "WIF", "5", None),
        ("Buy 10 USDC worth of JUP", "USDC", "JUP", "10", None),
        ("I want to buy 2 SOL of RAY", "SOL", "RAY", "2", None),
        ("Get me 0.5 SOL of POPCAT", "SOL", "POPCAT", "0.5", None),
    ]
    for query, input_token, output_token, amount, percentage in buy_cases:
        benchmarks.append(_swap_item(
            query, input_token, output_token, amount, percentage,
            "buy_with_amount", f"Buy {output_token} with {amount} {input_token}",
        ))

    buy_pct_cases = [
        ("Buy BONK with 50% of my SOL", "SOL", "BONK", None, 0.5),
        ("Use 30% of my USDC to buy WIF", "USDC", "WIF", None, 0.3),
        ("Spend 100% of my SOL on JUP", "SOL", "JUP", None, 1.0),
        ("Put 25% of my ETH into RAY", "ETH", "RAY", None, 0.25),
        ("Use half of my BTC to get BONK", "BTC", "BONK", None, 0.5),
    ]
    for query, input_token, output_token, amount, percentage in buy_pct_cases:
        benchmarks.append(_swap_item(
            query, input_token, output_token, amount, percentage,
            "buy_with_percentage",
            f"Buy {output_token} with {_whole_percent(percentage)}% {input_token}",
        ))

    sell_cases = [
        ("Sell 1000 BONK", "BONK", "SOL", "1000", None),
        ("Sell 500 WIF for SOL", "WIF", "SOL", "500", None),
        ("Convert 100 JUP to SOL", "JUP", "SOL", "100", None),
        ("Dump 2000 RAY", "RAY", "SOL", "2000", None),
        ("Sell 50 USDC", "USDC", "SOL", "50", None),
    ]
    for query, input_token, output_token, amount, percentage in sell_cases:
        benchmarks.append(_swap_item(
            query, input_token, output_token, amount, percentage,
            "sell_with_amount", f"Sell {amount} {input_token}",
        ))

    sell_pct_cases = [
        ("Sell 50% of my BONK", "BONK", "SOL", None, 0.5),
        ("Dump all my WIF", "WIF", "SOL", None, 1.0),
        ("Sell 30% of my JUP holdings", "JUP", "SOL", None, 0.3),
        ("Get rid of 75% of my RAY", "RAY", "SOL", None, 0.75),
        ("Sell a quarter of my USDC", "USDC", "SOL", None, 0.25),
    ]
    for query, input_token, output_token, amount, percentage in sell_pct_cases:
        benchmarks.append(_swap_item(
            query, input_token, output_token, amount, percentage,
            "sell_with_percentage",
            f"Sell {_whole_percent(percentage)}% {input_token}",
        ))

    cn_swap_cases = [
        ("用 1 个 SOL 买 BONK", "SOL", "BONK", "1", None),
        ("把 50% 的 USDC 换成 WIF", "USDC", "WIF", None, 0.5),
        ("卖掉 1000 个 BONK", "BONK", "SOL", "1000", None),
        ("把所有 JUP 都卖了", "JUP", "SOL", None, 1.0),
        ("用 2 SOL 购买 RAY", "SOL", "RAY", "2", None),
        ("出售 30% 的 WIF", "WIF", "SOL", None, 0.3),
        ("买入 5 SOL 的 POPCAT", "SOL", "POPCAT", "5", None),
        ("清仓 ETH", "ETH", "SOL", None, 1.0),
    ]
    for query, input_token, output_token, amount, percentage in cn_swap_cases:
        benchmarks.append(_swap_item(
            query, input_token, output_token, amount, percentage,
            "swap_chinese", "Swap request in Chinese",
        ))

    swap_cases = [
        ("Swap 100 USDC for BONK", "USDC", "BONK", "100", None),
        ("Exchange 50 JUP for WIF", "JUP", "WIF", "50", None),
        ("Convert all my ETH to USDC", "ETH", "USDC", None, 1.0),
    ]
    for query, input_token, output_token, amount, percentage in swap_cases:
        benchmarks.append(_swap_item(
            query, input_token, output_token, amount, percentage,
            "token_to_token", f"Swap {input_token} to {output_token}",
        ))

    return benchmarks
|
|
|
|
def generate_incomplete_benchmarks() -> List[Dict]:
    """Build samples for underspecified requests, where no function call is expected."""
    # (prompt, category, description); expected function and arguments are both None.
    underspecified = (
        ("I want to buy some tokens", "incomplete_no_token", "Missing token name"),
        ("Sell my holdings", "incomplete_no_token", "Missing which token to sell"),
        ("Search for a token", "incomplete_no_info", "Missing token info"),
        ("Buy something", "incomplete_vague", "Too vague"),
        ("我想买币", "incomplete_cn", "Missing token (Chinese)"),
        ("帮我卖掉", "incomplete_cn", "Missing token and amount (Chinese)"),
        ("Swap tokens", "incomplete_swap", "Missing swap details"),
        ("I want to trade", "incomplete_trade", "Missing trade details"),
    )
    return [
        create_benchmark_item(prompt, None, None, label, note)
        for prompt, label, note in underspecified
    ]
|
|
|
|
def generate_irrelevant_benchmarks() -> List[Dict]:
    """Build samples for off-topic requests, where no function call is expected."""
    # (prompt, category, description); expected function and arguments are both None.
    off_topic = (
        ("What's the weather today?", "irrelevant_weather", "Weather query"),
        ("Tell me a joke", "irrelevant_joke", "Joke request"),
        ("What time is it?", "irrelevant_time", "Time query"),
        ("Who is the president?", "irrelevant_general", "General knowledge"),
        ("今天天气怎么样?", "irrelevant_cn", "Weather (Chinese)"),
        ("给我讲个笑话", "irrelevant_cn", "Joke (Chinese)"),
        ("Hello, how are you?", "irrelevant_greeting", "Greeting"),
        ("What is Bitcoin?", "irrelevant_info", "Info request (no action)"),
    )
    return [
        create_benchmark_item(prompt, None, None, label, note)
        for prompt, label, note in off_topic
    ]
|
|
|
|
def generate_benchmark_dataset(output_path: str = str(DEFAULT_BENCHMARK_PATH)) -> List[Dict]:
    """Generate the full benchmark dataset, save it as JSON, and return it.

    Collects the SEARCH_TOKEN, EXECUTE_SWAP, incomplete and irrelevant
    samples, pads the list to 100 entries, shuffles it with a fixed seed,
    numbers the samples from 1, prints summary statistics and writes the
    result to ``output_path``.

    Args:
        output_path: Destination JSON file. Missing parent directories are
            created.

    Returns:
        The shuffled list of benchmark samples, with ``id`` filled in.
    """
    target_size = 100
    banner = "=" * 60

    print(banner)
    print("Generating FunctionGemma benchmark dataset")
    print(banner)

    all_benchmarks = []
    sections = (
        ("SEARCH_TOKEN cases", generate_search_token_benchmarks),
        ("EXECUTE_SWAP cases", generate_execute_swap_benchmarks),
        ("Incomplete request cases", generate_incomplete_benchmarks),
        ("Irrelevant request cases", generate_irrelevant_benchmarks),
    )
    for label, build in sections:
        generated = build()
        print(f"{label}: {len(generated)}")
        all_benchmarks.extend(generated)

    # Pad up to the target size by cycling through the extra cases in order.
    # NOTE(review): the hand-written cases above total fewer than 100, so the
    # remainder consists of repeated copies of these two prompts. Consider
    # replacing the padding with distinct cases.
    extra_cases = [
        ("Buy 3 SOL of TRUMP", "SOL", "TRUMP", "3", None, "EXECUTE_SWAP"),
        ("Search for TRUMP token", "TRUMP", "solana", None, None, "SEARCH_TOKEN"),
    ]
    shortfall = max(0, target_size - len(all_benchmarks))
    for index in range(shortfall):
        query, first, second, amount, percentage, function_name = extra_cases[index % len(extra_cases)]
        if function_name == "EXECUTE_SWAP":
            # For swaps the two fields are the input and output token symbols.
            arguments = {
                "inputTokenCA": TOKENS[first]["ca"],
                "outputTokenCA": TOKENS[second]["ca"],
                "inputTokenAmount": amount,
                "inputTokenPercentage": percentage,
            }
        else:
            # For searches the two fields are the symbol and the chain.
            arguments = {"symbol": first, "chain": second}
        all_benchmarks.append(create_benchmark_item(
            query, function_name, arguments, "extra", "Extra test case"
        ))

    all_benchmarks = all_benchmarks[:target_size]

    # A private generator seeded with 42 yields the same order as seeding the
    # module-level RNG, without resetting global random state for callers.
    random.Random(42).shuffle(all_benchmarks)

    # IDs follow the shuffled order, starting at 1.
    for position, item in enumerate(all_benchmarks, start=1):
        item["id"] = position

    print(f"\nTotal: {len(all_benchmarks)} cases")

    categories = {}
    for item in all_benchmarks:
        cat = item["category"]
        categories[cat] = categories.get(cat, 0) + 1

    print("\nCategory distribution:")
    for cat, count in sorted(categories.items()):
        print(f"  - {cat}: {count}")

    # Samples that expect no function call are counted under the "None" key.
    func_counts = {"SEARCH_TOKEN": 0, "EXECUTE_SWAP": 0, "None": 0}
    for item in all_benchmarks:
        func = item["expected"]["function_name"]
        if func:
            func_counts[func] = func_counts.get(func, 0) + 1
        else:
            func_counts["None"] += 1

    print("\nFunction distribution:")
    for func, count in func_counts.items():
        print(f"  - {func}: {count}")

    # Create the destination directory so direct callers do not have to.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(all_benchmarks, f, ensure_ascii=False, indent=2)

    print(f"\nBenchmark saved to: {output_path}")

    print("\n" + banner)
    print("Examples:")
    print(banner)

    for i, item in enumerate(all_benchmarks[:3]):
        print(f"\n--- Example {i+1} ---")
        print(f"ID: {item['id']}")
        print(f"Category: {item['category']}")
        print(f"Input: {item['input']['messages'][1]['content']}")
        print(f"Expected function: {item['expected']['function_name']}")
        if item['expected']['arguments']:
            print(f"Expected args: {json.dumps(item['expected']['arguments'], ensure_ascii=False)}")

    return all_benchmarks
|
|
|
|
def main():
    """Command-line entry point.

    Parses ``--output``, makes sure its parent directory exists, and
    generates the benchmark dataset at that path.
    """
    cli = argparse.ArgumentParser(description="Generate FunctionGemma benchmark dataset")
    cli.add_argument(
        "--output",
        type=str,
        default=str(DEFAULT_BENCHMARK_PATH),
        help="Output file path",
    )
    options = cli.parse_args()

    destination = Path(options.output)
    destination.parent.mkdir(parents=True, exist_ok=True)

    generate_benchmark_dataset(str(destination))


if __name__ == "__main__":
    main()
|
|