# mini-agent/scripts/preprocess_glaive_dataset.py
# qninhdt
# cc
# 9a9a2f5
import re
import json
# Load the raw GLAIVE function-calling dataset. The encoding is given
# explicitly so decoding does not depend on the platform's default locale.
with open(
    "./datasets/glaive/glaive-function-calling-v2.json", "r", encoding="utf-8"
) as f:
    data = json.load(f)
# Pairs every tool name in the system prompt with the first description that
# follows it. NOTE(review): descriptions containing escaped quotes would be cut
# short by this pattern; confirm against the dataset if that matters.
_TOOL_PATTERN = re.compile(r'"name": "(.*?)",.*?"description": "(.*?)"', re.S)
# Captures the function name at the start of every assistant function call.
_FUNCTION_CALL_PATTERN = re.compile(r'<functioncall> {"name": "(.*?)"')
_USER_MARKER = "USER: "
_ASSISTANT_MARKER = "ASSISTANT:"


def _preceding_user_prompt(chat_text, call_start):
    """Return the last USER turn that starts before *call_start*, or None."""
    marker_at = chat_text.rfind(_USER_MARKER, 0, call_start)
    if marker_at == -1:
        return None
    prompt_start = marker_at + len(_USER_MARKER)
    # The user turn ends where the next assistant turn begins; fall back to
    # the call position itself if no assistant marker sits in between.
    prompt_end = chat_text.find(_ASSISTANT_MARKER, prompt_start, call_start)
    if prompt_end == -1:
        prompt_end = call_start
    return chat_text[prompt_start:prompt_end].strip()


def process_sample(sample):
    """Turn one raw sample into a list of instruction records.

    Args:
        sample: Mapping with a ``"system"`` string (tool definitions) and a
            ``"chat"`` string (the USER/ASSISTANT transcript). Missing or
            ``None`` values are treated as empty strings.

    Returns:
        ``None`` when the sample declares no tools or contains no function
        call. Otherwise a list with one dict per function call that has a
        non-empty preceding user prompt, each holding ``"instruction"`` (that
        prompt), ``"tools"`` (every declared tool as name/description) and
        ``"used_tools"`` (a one-element list with the called function name).
    """
    system_text = sample.get("system") or ""
    chat_text = sample.get("chat") or ""

    # Extract tools
    tools = [
        {"name": name, "description": description}
        for name, description in _TOOL_PATTERN.findall(system_text)
    ]
    if not tools:
        return None  # Skip samples without tools

    # Extract function calls together with their position in the transcript
    call_matches = list(_FUNCTION_CALL_PATTERN.finditer(chat_text))
    if not call_matches:
        return None  # Skip samples without function calls

    instructions = []
    for call_match in call_matches:
        # Locate the prompt by position rather than by re-searching from the
        # top of the chat, so each call is paired with the user turn that
        # actually triggered it and the call name never enters a regex.
        prompt = _preceding_user_prompt(chat_text, call_match.start())
        if not prompt:
            continue
        instructions.append(
            {
                "instruction": prompt,
                "tools": tools,
                "used_tools": [call_match.group(1)],  # Single tool per instruction
            }
        )
    return instructions
from tqdm import tqdm
# Run every sample through process_sample and pool the extracted records.
results = []
for sample in tqdm(data, desc="Processing GLAIVE samples"):
    # A falsy result (None or an empty list) contributes nothing.
    results += process_sample(sample) or []

# Persist the pooled records as indented JSON.
with open("./datasets/glaive/output.json", "w") as out_file:
    json.dump(results, out_file, indent=2)