qninhdt
/

mini-agent

mini-agent / scripts /preprocess_toolace_dataset.py

qninhdt

9a9a2f5 10 months ago

1.92 kB

	import json
	import re

	# Load the raw ToolACE dump. JSON text is UTF-8 by specification, so the
	# encoding is pinned explicitly instead of relying on the platform locale
	# (which is not UTF-8 on every system and can mis-decode or raise).
	# `data` is iterated sample-by-sample further down in this script.
	with open("./datasets/toolace/data.json", encoding="utf-8") as f:
	    data = json.load(f)


	def process_sample(sample):
	    """Extract (instruction, tools, used_tools) records from one sample.

	    Args:
	        sample: Mapping with a ``"system"`` string that embeds the tool
	            schemas as JSON-like text, and a ``"conversations"`` list of
	            turns, each a mapping with ``"from"`` and ``"value"`` keys.

	    Returns:
	        A list of dicts, one per user turn that is immediately followed by
	        an assistant turn whose value looks like a bracketed tool call.
	        Each dict has:
	          * ``"instruction"``: the user turn's text,
	          * ``"tools"``: every ``{"name", "description"}`` pair found in
	            the system message (the same list object is shared by all
	            records of the sample),
	          * ``"used_tools"``: tool names parsed from the assistant call.
	        Spaces and ``/`` in tool names are replaced by ``_`` in both
	        ``"tools"`` and ``"used_tools"`` so the two can be compared.
	        The list is empty when no turn pair qualifies.

	    Raises:
	        KeyError: if ``sample`` or one of its turns lacks an expected key.
	    """
	    system_message = sample["system"]

	    # Lazy `.*?` groups capture the full name / description up to the next
	    # closing quote; a bare `.?` would only ever match zero or one character
	    # and silently drop every realistically named tool.
	    # NOTE: a description containing an escaped quote (\") is cut short at
	    # that quote, since the pattern does not interpret JSON escapes.
	    functions_match = re.findall(
	        r'\{"name": "(.*?)", "description": "(.*?)"', system_message
	    )
	    functions = [
	        {"name": name.replace(" ", "_").replace("/", "_"), "description": desc}
	        for name, desc in functions_match
	    ]

	    conversations = sample["conversations"]
	    results = []

	    # Walk adjacent (turn, next turn) pairs; the last turn has no successor
	    # and is therefore never considered as an instruction.
	    for entry, reply in zip(conversations, conversations[1:]):
	        if entry["from"] != "user" or reply["from"] != "assistant":
	            continue

	        function_call = reply["value"]
	        # Only replies that start with "[" and contain a later "]" on the
	        # same line are treated as tool calls; plain-text answers are skipped.
	        if not re.match(r"\[.*\]", function_call):
	            continue

	        # Capture the text between a "[" and the first following "(" as the
	        # tool name (names may contain spaces). The pattern is anchored on
	        # "[", so only a call that directly follows an opening bracket is
	        # picked up.
	        used_tools = [
	            tool.replace(" ", "_").replace("/", "_")
	            for tool in re.findall(r"\[([^\(]+)\(", function_call)
	        ]
	        if not used_tools:
	            continue

	        results.append(
	            {
	                "instruction": entry["value"],
	                "tools": functions,
	                "used_tools": used_tools,
	            }
	        )

	    return results


	from tqdm import tqdm

	# Flatten the per-sample record lists into one list, showing a progress bar
	# while iterating over the loaded dataset.
	results = []
	for sample in tqdm(data, desc="Processing ToolACE samples"):
	    records = process_sample(sample)
	    if not records:
	        # Nothing usable came out of this sample.
	        continue
	    results += records

	# Persist every collected record as pretty-printed JSON.
	with open("./datasets/toolace/output.json", "w") as out_file:
	    json.dump(results, out_file, indent=2)