# mini-agent/scripts/mix_datasets.py
import json
from tqdm import tqdm
DATASETS = ["apigen", "glaive", "toolace"]
data = []
for dataset_name in DATASETS:
    with open(f"./datasets/{dataset_name}/output.json") as f:
        subdata = json.load(f)
    # tag every sample with the dataset it came from
    subdata = [{**item, "source": dataset_name} for item in subdata]
    data.extend(subdata)
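
# Each sample is assumed to look roughly like the following
# (inferred from the fields accessed below, not from a documented schema):
# {
#   "instruction": str,
#   "tools": [{"name": str, "description": str, ...}, ...],  # tools offered to the model
#   "used_tools": [str, ...],                                 # names of the tools actually called
#   "source": str,                                            # added above
# }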
# drop samples whose used tools are not all present in the sample's own tool list
new_data = []
for sample in data:
    tools = [tool["name"] for tool in sample["tools"]]
    if any(used_tool not in tools for used_tool in sample["used_tools"]):
        continue
    new_data.append(sample)
data = new_data
print("Number of samples:", len(data))

def generate_dataset(data):
    """Build a global tool registry and map each sample's used tools to tool ids."""
    tools = {}
    dataset = []
    for sample in tqdm(data, desc="Processing samples"):
        # register each tool the first time it appears, assigning an incremental id
        for tool in sample["tools"]:
            if tool["name"] in tools:
                continue
            tools[tool["name"]] = {
                "name": tool["name"],
                "description": tool["description"],
                "id": len(tools),
                "source": sample["source"],
            }
        # replace the used tool names with their registered ids
        used_tools = []
        for tool_name in sample["used_tools"]:
            used_tools.append(tools[tool_name]["id"])
        new_sample = {
            "instruction": sample["instruction"],
            "tools": used_tools,
            "source": sample["source"],
        }
        dataset.append(new_sample)
    return {"tools": list(tools.values()), "samples": dataset}
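
# The returned structure looks roughly like this (illustrative values only):
# {
#   "tools": [{"name": "...", "description": "...", "id": 0, "source": "apigen"}, ...],
#   "samples": [{"instruction": "...", "tools": [0, 3], "source": "apigen"}, ...],
# }
# Note that "tools" inside each sample holds the ids of the used tools only,
# not the full list of tools offered in the original sample.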
from collections import defaultdict

def count(data):
    """Count samples grouped by how many tools each sample uses."""
    used_tools_count = defaultdict(int)
    for item in data:
        used_tools_count[len(item["used_tools"])] += 1
    return used_tools_count

print("Used tools count in dataset:")
for k, v in count(data).items():
print(f"{k} used tools: {v} samples")
from random import shuffle, seed
# split the dataset based on used-tools count:
# samples with one or two used tools get an 80/20 train/test split,
# samples with three or more used tools all go into the test set
one_two = [item for item in data if len(item["used_tools"]) in [1, 2]]
other = [item for item in data if len(item["used_tools"]) > 2]
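# re-seed before each shuffle so the split is reproducible across runs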
seed(42)
shuffle(one_two)
seed(42)
shuffle(other)
train_samples = one_two[: int(len(one_two) * 0.8)]
test_samples = one_two[int(len(one_two) * 0.8) :] + other
print("Train samples count:", len(train_samples))
print("Test samples count:", len(test_samples))
# count(train_samples), count(test_samples)
print("Train samples count based on used tools count:")
for k, v in count(train_samples).items():
print(f"{k} used tools: {v} samples")
print("Test samples count based on used tools count:")
for k, v in count(test_samples).items():
print(f"{k} used tools: {v} samples")
train_dataset = generate_dataset(train_samples)
test_dataset = generate_dataset(test_samples)
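# reshuffle within each split so the 3+-tool samples are not all grouped at the end of the test set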
seed(42)
shuffle(train_dataset["samples"])
seed(42)
shuffle(test_dataset["samples"])
print("Number of tools in train dataset:", len(train_dataset["tools"]))
print("Number of samples in train dataset:", len(train_dataset["samples"]))
print("Number of tools in test dataset:", len(test_dataset["tools"]))
print("Number of samples in test dataset:", len(test_dataset["samples"]))
import os
os.makedirs("./datasets/mixed", exist_ok=True)
with open("./datasets/mixed/train.json", "w") as f:
    json.dump(train_dataset, f, indent=2)
with open("./datasets/mixed/test.json", "w") as f:
    json.dump(test_dataset, f, indent=2)
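
# Optional sanity check (a sketch, not part of the original script): every tool id
# referenced by a sample should exist in that split's tool registry.
# with open("./datasets/mixed/train.json") as f:
#     train = json.load(f)
# assert all(
#     tool_id < len(train["tools"]) for s in train["samples"] for tool_id in s["tools"]
# )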