|
""" |
|
generate_training_data.py - Generate training data for function calling

This script creates a few dozen diverse preference pairs (a JSON function call
as the chosen response, a prose reply as the rejected one) covering several
schema types and patterns to teach robust zero-shot function calling.
|
""" |
|
|
|
import json |
|
import random |
|
from typing import List, Dict |
|
|
|
def create_training_pair(schema: Dict, question: str, good_response: str, bad_response: str) -> Dict:
    """Assemble one preference-pair record.

    The prompt consists of a fixed system turn, the schema pretty-printed as
    JSON inside ``<schema>`` tags, the user's question, and an opened
    assistant turn left for the model to complete.

    Args:
        schema: Function schema to embed in the prompt (must be JSON-serializable).
        question: The user request shown in the user turn.
        good_response: The preferred completion, stored under ``"chosen"``.
        bad_response: The dispreferred completion, stored under ``"rejected"``.

    Returns:
        A dict with the keys ``"prompt"``, ``"chosen"`` and ``"rejected"``.
    """
    schema_json = json.dumps(schema, indent=2)

    # One entry per prompt line; the trailing empty entry yields the final
    # newline after the opened assistant turn.
    prompt_lines = [
        "<|im_start|>system",
        "You are a helpful assistant that calls functions by responding with valid JSON when given a schema. Always respond with JSON function calls only, never prose.<|im_end|>",
        "",
        "<schema>",
        schema_json,
        "</schema>",
        "",
        "<|im_start|>user",
        f"{question}<|im_end|>",
        "<|im_start|>assistant",
        "",
    ]

    record = {"prompt": "\n".join(prompt_lines)}
    record["chosen"] = good_response
    record["rejected"] = bad_response
    return record
|
|
|
def _function_call(name: str, arguments: Dict) -> str:
    """Serialize a function call as the JSON text used for chosen responses.

    Going through ``json.dumps`` (rather than interpolating values into a JSON
    template) guarantees that quotes, backslashes and newlines inside argument
    values are escaped, so the result always parses as JSON.
    """
    return json.dumps({"name": name, "arguments": arguments})


def generate_diverse_schemas_and_pairs() -> List[Dict]:
    """Generate the full set of preference pairs across all schema families.

    Every pair combines a function schema and a user question with a chosen
    response (a JSON function call) and a rejected response (a prose reply
    that describes the action instead of calling the function).

    Several schema lists contain entries that no example uses yet; only the
    schema indexed in each loop below contributes pairs.

    Returns:
        A list of records built by ``create_training_pair``, in the order the
        example groups are defined below.
    """
    pairs = []

    # ------------------------------------------------------------------
    # Financial operations
    # ------------------------------------------------------------------
    financial_schemas = [
        {
            "name": "get_stock_price",
            "description": "Get current stock price for a ticker",
            "parameters": {
                "type": "object",
                "properties": {"ticker": {"type": "string"}},
                "required": ["ticker"],
            },
        },
        {
            "name": "transfer_money",
            "description": "Transfer money between accounts",
            "parameters": {
                "type": "object",
                "properties": {
                    "from_account": {"type": "string"},
                    "to_account": {"type": "string"},
                    "amount": {"type": "number"},
                    "currency": {"type": "string"},
                },
                "required": ["from_account", "to_account", "amount"],
            },
        },
        {
            "name": "calculate_compound_interest",
            "description": "Calculate compound interest on investment",
            "parameters": {
                "type": "object",
                "properties": {
                    "principal": {"type": "number"},
                    "rate": {"type": "number"},
                    "time": {"type": "number"},
                    "frequency": {"type": "integer"},
                },
                "required": ["principal", "rate", "time"],
            },
        },
    ]

    financial_questions = [
        ("What's Tesla stock trading at?", "TSLA"),
        ("Check the price of Bitcoin", "BTC-USD"),
        ("What's Apple's current price?", "AAPL"),
        ("How much is Microsoft worth?", "MSFT"),
        ("Get Netflix stock price", "NFLX"),
    ]

    for q, ticker in financial_questions:
        pairs.append(create_training_pair(
            financial_schemas[0], q,
            _function_call("get_stock_price", {"ticker": ticker}),
            f"I'll check the current stock price for {ticker}. Let me get that information for you.",
        ))

    # NOTE(review): the euro example omits the optional "currency" argument —
    # confirm the expected representation (e.g. "EUR") before adding it.
    transfer_examples = [
        ("Send $500 from my checking to savings", "checking", "savings", 500),
        ("Transfer 1000 euros from account A to account B", "A", "B", 1000),
        ("Move $250 from wallet to investment account", "wallet", "investment", 250),
    ]

    for q, from_acc, to_acc, amount in transfer_examples:
        pairs.append(create_training_pair(
            financial_schemas[1], q,
            _function_call(
                "transfer_money",
                {"from_account": from_acc, "to_account": to_acc, "amount": amount},
            ),
            f"I'll help you transfer ${amount} from {from_acc} to {to_acc}. Let me process that transaction.",
        ))

    # ------------------------------------------------------------------
    # Communication
    # ------------------------------------------------------------------
    comm_schemas = [
        {
            "name": "send_email",
            "description": "Send an email message",
            "parameters": {
                "type": "object",
                "properties": {
                    "to": {"type": "string"},
                    "subject": {"type": "string"},
                    "body": {"type": "string"},
                    "cc": {"type": "array", "items": {"type": "string"}},
                },
                "required": ["to", "subject", "body"],
            },
        },
        {
            "name": "send_sms",
            "description": "Send SMS text message",
            "parameters": {
                "type": "object",
                "properties": {
                    "phone": {"type": "string"},
                    "message": {"type": "string"},
                },
                "required": ["phone", "message"],
            },
        },
        {
            "name": "schedule_meeting",
            "description": "Schedule a meeting with participants",
            "parameters": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "participants": {"type": "array", "items": {"type": "string"}},
                    "datetime": {"type": "string"},
                    "duration": {"type": "integer"},
                },
                "required": ["title", "participants", "datetime"],
            },
        },
    ]

    email_examples = [
        ("Email John about the project deadline", "john@company.com", "Project Deadline", "Hi John, wanted to discuss the upcoming project deadline."),
        ("Send Sarah the meeting notes", "sarah@team.com", "Meeting Notes", "Hi Sarah, here are the notes from today's meeting."),
        ("Message the team about tomorrow's standup", "team@company.com", "Standup Tomorrow", "Reminder: standup meeting tomorrow at 9am."),
    ]

    for q, to, subject, body in email_examples:
        pairs.append(create_training_pair(
            comm_schemas[0], q,
            _function_call("send_email", {"to": to, "subject": subject, "body": body}),
            f"I'll send an email to {to} with the subject '{subject}'. Let me compose that message for you.",
        ))

    sms_examples = [
        ("Text mom that I'll be late", "+1234567890", "Running late, will be there in 20 minutes"),
        ("Send SMS to 555-0123 saying meeting is cancelled", "555-0123", "Meeting cancelled"),
        ("Message Bob at +1987654321 about dinner plans", "+1987654321", "Are we still on for dinner tonight?"),
    ]

    for q, phone, message in sms_examples:
        pairs.append(create_training_pair(
            comm_schemas[1], q,
            _function_call("send_sms", {"phone": phone, "message": message}),
            f"I'll send a text message to {phone}. Let me send that SMS for you.",
        ))

    # ------------------------------------------------------------------
    # Data analytics
    # ------------------------------------------------------------------
    data_schemas = [
        {
            "name": "query_database",
            "description": "Execute SQL query on database",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {"type": "string"},
                    "database": {"type": "string"},
                    "limit": {"type": "integer"},
                },
                "required": ["query"],
            },
        },
        {
            "name": "generate_report",
            "description": "Generate analytics report",
            "parameters": {
                "type": "object",
                "properties": {
                    "report_type": {"type": "string"},
                    "date_range": {"type": "string"},
                    "metrics": {"type": "array", "items": {"type": "string"}},
                },
                "required": ["report_type", "date_range"],
            },
        },
    ]

    query_examples = [
        ("Find all users who signed up last week", "SELECT * FROM users WHERE created_at >= DATE_SUB(NOW(), INTERVAL 1 WEEK)"),
        ("Get top 10 selling products", "SELECT product_name, SUM(quantity) as total_sales FROM orders GROUP BY product_name ORDER BY total_sales DESC LIMIT 10"),
        ("Show revenue by month this year", "SELECT MONTH(order_date) as month, SUM(total) as revenue FROM orders WHERE YEAR(order_date) = YEAR(NOW()) GROUP BY MONTH(order_date)"),
    ]

    for q, query in query_examples:
        pairs.append(create_training_pair(
            data_schemas[0], q,
            _function_call("query_database", {"query": query}),
            f"I'll run a database query to {q.lower()}. Let me execute that SQL for you.",
        ))

    # ------------------------------------------------------------------
    # File operations
    # ------------------------------------------------------------------
    file_schemas = [
        {
            "name": "create_file",
            "description": "Create a new file with content",
            "parameters": {
                "type": "object",
                "properties": {
                    "filename": {"type": "string"},
                    "content": {"type": "string"},
                    "encoding": {"type": "string"},
                },
                "required": ["filename", "content"],
            },
        },
        {
            "name": "backup_files",
            "description": "Backup files to specified location",
            "parameters": {
                "type": "object",
                "properties": {
                    "source_path": {"type": "string"},
                    "backup_path": {"type": "string"},
                    "compression": {"type": "boolean"},
                },
                "required": ["source_path", "backup_path"],
            },
        },
    ]

    # These contents contain newlines and double quotes, which is why the
    # chosen response must be JSON-escaped rather than pasted into a template.
    file_examples = [
        ("Create a file called report.txt with the quarterly results", "report.txt", "Q3 2024 Quarterly Results\n\nRevenue: $2.5M\nGrowth: 15%"),
        ("Make a new file notes.md with meeting summary", "notes.md", "# Meeting Summary\n\n- Discussed project timeline\n- Reviewed budget\n- Next steps assigned"),
        ("Create config.json with default settings", "config.json", '{"debug": false, "port": 8080, "host": "localhost"}'),
    ]

    for q, filename, content in file_examples:
        pairs.append(create_training_pair(
            file_schemas[0], q,
            _function_call("create_file", {"filename": filename, "content": content}),
            f"I'll create the file {filename} with your content. Let me write that file for you.",
        ))

    # ------------------------------------------------------------------
    # Weather / location
    # ------------------------------------------------------------------
    location_schemas = [
        {
            "name": "get_weather",
            "description": "Get weather information for location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string"},
                    "units": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                    "forecast_days": {"type": "integer"},
                },
                "required": ["location"],
            },
        },
        {
            "name": "find_restaurants",
            "description": "Find restaurants near location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string"},
                    "cuisine": {"type": "string"},
                    "rating_min": {"type": "number"},
                },
                "required": ["location"],
            },
        },
    ]

    # The third field is the optional "units" argument; it is set only when
    # the question explicitly asks for a unit.
    weather_examples = [
        ("What's the weather in San Francisco?", "San Francisco", None),
        ("Check weather for Tokyo in celsius", "Tokyo", "celsius"),
        ("How's the weather in London today?", "London", None),
    ]

    for q, location, units in weather_examples:
        weather_args = {"location": location}
        if units is not None:
            weather_args["units"] = units
        pairs.append(create_training_pair(
            location_schemas[0], q,
            _function_call("get_weather", weather_args),
            f"I'll check the current weather conditions in {location} for you.",
        ))

    # ------------------------------------------------------------------
    # Calculations
    # ------------------------------------------------------------------
    calc_schemas = [
        {
            "name": "calculate_tip",
            "description": "Calculate tip amount for bill",
            "parameters": {
                "type": "object",
                "properties": {
                    "bill_amount": {"type": "number"},
                    "tip_percentage": {"type": "number"},
                    "split_ways": {"type": "integer"},
                },
                "required": ["bill_amount", "tip_percentage"],
            },
        },
        {
            "name": "convert_currency",
            "description": "Convert between currencies",
            "parameters": {
                "type": "object",
                "properties": {
                    "amount": {"type": "number"},
                    "from_currency": {"type": "string"},
                    "to_currency": {"type": "string"},
                },
                "required": ["amount", "from_currency", "to_currency"],
            },
        },
        {
            "name": "calculate_distance",
            "description": "Calculate distance between two points",
            "parameters": {
                "type": "object",
                "properties": {
                    "from_location": {"type": "string"},
                    "to_location": {"type": "string"},
                    "unit": {"type": "string", "enum": ["miles", "kilometers"]},
                },
                "required": ["from_location", "to_location"],
            },
        },
    ]

    tip_examples = [
        ("What's 20% tip on $85?", 85, 20),
        ("Calculate 15% tip for a $42 bill", 42, 15),
        ("How much tip for $156 at 18%?", 156, 18),
    ]

    for q, amount, tip in tip_examples:
        pairs.append(create_training_pair(
            calc_schemas[0], q,
            _function_call("calculate_tip", {"bill_amount": amount, "tip_percentage": tip}),
            f"I'll calculate the {tip}% tip on ${amount} for you. Let me do that math.",
        ))

    # ------------------------------------------------------------------
    # Scheduling
    # ------------------------------------------------------------------
    schedule_schemas = [
        {
            "name": "create_reminder",
            "description": "Create a reminder for specific time",
            "parameters": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "datetime": {"type": "string"},
                    "priority": {"type": "string", "enum": ["low", "medium", "high"]},
                },
                "required": ["title", "datetime"],
            },
        },
        {
            "name": "book_appointment",
            "description": "Book appointment with service provider",
            "parameters": {
                "type": "object",
                "properties": {
                    "service": {"type": "string"},
                    "provider": {"type": "string"},
                    "datetime": {"type": "string"},
                    "duration": {"type": "integer"},
                },
                "required": ["service", "datetime"],
            },
        },
    ]

    reminder_examples = [
        ("Remind me to call mom tomorrow at 6pm", "Call mom", "tomorrow 6pm"),
        ("Set reminder for dentist appointment Friday 2pm", "Dentist appointment", "Friday 2pm"),
        ("Remind me about the meeting on Monday 9am", "Team meeting", "Monday 9am"),
    ]

    # "when" rather than "datetime" so the loop variable cannot be mistaken
    # for the standard-library module of the same name.
    for q, title, when in reminder_examples:
        pairs.append(create_training_pair(
            schedule_schemas[0], q,
            _function_call("create_reminder", {"title": title, "datetime": when}),
            f"I'll set up a reminder for {title} at {when}.",
        ))

    return pairs
|
|
|
def _schema_name_from_prompt(prompt: str) -> str:
    """Return the ``name`` of the JSON schema embedded between ``<schema>`` tags."""
    schema_text = prompt.split("<schema>")[1].split("</schema>")[0]
    return json.loads(schema_text)["name"]


def main():
    """Generate the training pairs, save them as JSONL and print a summary.

    Writes one JSON object per line to ``tool_pairs_large.jsonl`` in the
    current working directory. The coverage summary is counted from the
    generated pairs (per function schema) so it always matches the data that
    was actually written.
    """
    print("Generating comprehensive training data...")

    pairs = generate_diverse_schemas_and_pairs()

    print(f"Generated {len(pairs)} training pairs")

    # Count pairs per function schema; dict insertion order keeps the
    # summary in generation order.
    coverage: Dict[str, int] = {}
    for pair in pairs:
        schema_name = _schema_name_from_prompt(pair["prompt"])
        coverage[schema_name] = coverage.get(schema_name, 0) + 1

    print("Coverage:")
    for schema_name, count in coverage.items():
        print(f"   - {schema_name}: {count} pairs")

    with open("tool_pairs_large.jsonl", "w", encoding="utf-8") as f:
        f.writelines(json.dumps(pair) + "\n" for pair in pairs)

    print("Saved to tool_pairs_large.jsonl")
    print("This should significantly improve training quality!")

    # Nothing to preview if the generator produced no pairs.
    if not pairs:
        return

    print("\nSample pair:")
    sample = pairs[0]
    question = sample["prompt"].split("<|im_start|>user")[1].split("<|im_end|>")[0].strip()
    print(f"Schema: {_schema_name_from_prompt(sample['prompt'])}")
    print(f"Question: {question}")
    print(f"Response: {sample['chosen']}")
|
|
|
# Run the generator only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()