# mini-agent/scripts/preprocess_toolace_dataset.py
# qninhdt
# cc
# 9a9a2f5
import json
import re
# Load the raw ToolACE dump into `data`; the processing loop further down
# iterates over it. The encoding is pinned to UTF-8 so the read does not
# depend on the platform's default locale encoding.
with open("./datasets/toolace/data.json", encoding="utf-8") as f:
    data = json.load(f)
# Pulls (name, description) pairs out of the tool list embedded in the system
# prompt. NOTE: the non-greedy groups stop at the first double quote, so a
# description containing an escaped quote is truncated at that point.
_TOOL_SPEC_RE = re.compile(r'\{"name": "(.*?)", "description": "(.*?)"')

# An assistant turn is treated as a tool call only when it starts with "[" and
# a closing "]" appears before the first newline.
_CALL_LIST_RE = re.compile(r"\[.*\]")


def _normalize_tool_name(name):
    """Return *name* with spaces and slashes replaced by underscores."""
    return name.replace(" ", "_").replace("/", "_")


def _extract_called_tools(function_call):
    """Return the names of the top-level calls in a ``[f(a=1), g(b=2)]`` string.

    The text is scanned once while tracking parenthesis depth and quoted
    strings, so every call in a parallel call list is reported (not only the
    one directly after the opening bracket), and brackets, commas or
    parentheses inside argument values are not mistaken for call boundaries.

    Args:
        function_call: Assistant message that starts with ``[``.

    Returns:
        The call names in order of appearance, stripped of surrounding
        whitespace. Empty when the list contains no ``name(`` at top level.
    """
    names = []
    name_chars = []
    paren_depth = 0
    quote = None  # the quote character of the string literal we are inside
    escaped = False

    for ch in function_call[1:]:  # skip the opening "["
        if quote is not None:
            # Inside a quoted argument value: ignore everything until the
            # matching, unescaped quote.
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == quote:
                quote = None
            continue

        if paren_depth == 0:
            if ch == "(":
                name = "".join(name_chars).strip()
                if name:
                    names.append(name)
                name_chars = []
                paren_depth = 1
            elif ch == ",":
                # Separator between calls (or between plain list items).
                name_chars = []
            elif ch == "]":
                # End of the call list; anything after it is not a call.
                break
            else:
                name_chars.append(ch)
        elif ch in "\"'":
            # Quotes only open strings inside an argument list, so tool names
            # containing an apostrophe are left intact.
            quote = ch
        elif ch == "(":
            paren_depth += 1
        elif ch == ")":
            paren_depth -= 1

    return names


def process_sample(sample):
    """Turn one ToolACE sample into (instruction, tools, used_tools) records.

    Args:
        sample: Mapping with a ``"system"`` string that embeds the available
            tool definitions, and a ``"conversations"`` list whose items are
            mappings with ``"from"`` and ``"value"`` keys.

    Returns:
        A list with one dict per user turn that is immediately followed by an
        assistant tool-call turn. Each dict has:

        * ``"instruction"``: the user message,
        * ``"tools"``: every tool found in the system prompt, as
          ``{"name", "description"}`` dicts with normalized names (the same
          list object is shared by all records of the sample),
        * ``"used_tools"``: normalized names of all tools called in the
          assistant reply, in call order.

        Turns whose reply is not a call list, or contains no call, are skipped.
    """
    functions = [
        {"name": _normalize_tool_name(name), "description": desc}
        for name, desc in _TOOL_SPEC_RE.findall(sample["system"])
    ]

    conversations = sample["conversations"]
    results = []
    # Walk consecutive (turn, following turn) pairs; a trailing user turn has
    # no reply and therefore never forms a pair.
    for entry, reply in zip(conversations, conversations[1:]):
        if entry["from"] != "user" or reply["from"] != "assistant":
            continue

        function_call = reply["value"]
        if not _CALL_LIST_RE.match(function_call):
            continue  # plain-text answer, not a tool call

        used_tools = [
            _normalize_tool_name(tool)
            for tool in _extract_called_tools(function_call)
        ]
        if not used_tools:
            continue

        results.append(
            {
                "instruction": entry["value"],
                "tools": functions,
                "used_tools": used_tools,
            }
        )
    return results
from tqdm import tqdm
# Flatten the per-sample record lists into one list, with a progress bar over
# the samples.
results = [
    record
    for sample in tqdm(data, desc="Processing ToolACE samples")
    for record in process_sample(sample)
]

# Write every collected record to disk as indented JSON.
with open("./datasets/toolace/output.json", "w") as output_file:
    json.dump(results, output_file, indent=2)