qninhdt
/

mini-agent

mini-agent / scripts /preprocess_glaive_dataset.py

qninhdt

9a9a2f5 10 months ago

1.81 kB

	import re
	import json


	with open("./datasets/glaive/glaive-function-calling-v2.json", "r") as f:
	data = json.load(f)


	def process_sample(sample):
	system_text = sample.get("system", "")
	chat_text = sample.get("chat", "")

	# Extract tools
	tools_match = re.findall(
	r'"name": "(.?)",.?"description": "(.*?)"', system_text, re.S
	)
	if not tools_match:
	return None # Skip samples without tools
	tools = [{"name": name, "description": desc} for name, desc in tools_match]

	# Extract function calls and corresponding instructions
	function_calls = re.findall(r'<functioncall> {"name": "(.*?)"', chat_text)
	if not function_calls:
	return None # Skip samples without function calls

	# Extract ASSISTANT responses and filter for valid instructions
	assistant_responses = re.findall(
	r"ASSISTANT: (.*?)<\\|endoftext\\|>", chat_text, re.S
	)
	instructions = []
	for call, response in zip(function_calls, assistant_responses):
	# Find the user prompt just before this ASSISTANT response
	user_prompt_match = re.search(
	rf'USER: (.?)\n.?<functioncall> {{.?"{call}".?}}', chat_text, re.S
	)
	if user_prompt_match:
	instructions.append(
	{
	"instruction": user_prompt_match.group(1).strip(),
	"tools": tools,
	"used_tools": [call], # Single tool for this instruction
	}
	)

	return instructions


	from tqdm import tqdm

	results = []
	for sample in tqdm(data, desc="Processing GLAIVE samples"):
	processed = process_sample(sample)
	if processed:
	results.extend(processed)

	with open("./datasets/glaive/output.json", "w") as f:
	json.dump(results, f, indent=2)