# mini-agent/scripts/mix_datasets.py
import json
from tqdm import tqdm
DATASETS = ["apigen", "glaive", "toolace"]
data = []
for dataset_name in DATASETS:
    with open(f"./datasets/{dataset_name}/output.json") as f:
        subdata = json.load(f)
    # tag every sample with the dataset it came from
    subdata = [{**item, "source": dataset_name} for item in subdata]
    data.extend(subdata)
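
# Each sample is assumed to look roughly like the following
# (inferred from the fields accessed below, not from a documented schema):
# {
#   "instruction": str,
#   "tools": [{"name": str, "description": str, ...}, ...],  # tools offered to the model
#   "used_tools": [str, ...],                                 # names of the tools actually called
#   "source": str,                                            # added above
# }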
# drop samples whose used tools are not all present in the sample's own tool list
new_data = []
for sample in data:
    tools = [tool["name"] for tool in sample["tools"]]
    if any(used_tool not in tools for used_tool in sample["used_tools"]):
        continue
    new_data.append(sample)
data = new_data
print("Number of samples:", len(data))

def generate_dataset(data):
    """Build a global tool registry and map each sample's used tools to tool ids."""
    tools = {}
    dataset = []
    for sample in tqdm(data, desc="Processing samples"):
        # register each tool the first time it appears, assigning an incremental id
        for tool in sample["tools"]:
            if tool["name"] in tools:
                continue
            tools[tool["name"]] = {
                "name": tool["name"],
                "description": tool["description"],
                "id": len(tools),
                "source": sample["source"],
            }
        # replace the used tool names with their registered ids
        used_tools = []
        for tool_name in sample["used_tools"]:
            used_tools.append(tools[tool_name]["id"])
        new_sample = {
            "instruction": sample["instruction"],
            "tools": used_tools,
            "source": sample["source"],
        }
        dataset.append(new_sample)
    return {"tools": list(tools.values()), "samples": dataset}
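
# The returned structure looks roughly like this (illustrative values only):
# {
#   "tools": [{"name": "...", "description": "...", "id": 0, "source": "apigen"}, ...],
#   "samples": [{"instruction": "...", "tools": [0, 3], "source": "apigen"}, ...],
# }
# Note that "tools" inside each sample holds the ids of the used tools only,
# not the full list of tools offered in the original sample.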
from collections import defaultdict

def count(data):
    """Count samples grouped by how many tools each sample uses."""
    used_tools_count = defaultdict(int)
    for item in data:
        used_tools_count[len(item["used_tools"])] += 1
    return used_tools_count

print("Used tools count in dataset:")
for k, v in count(data).items():
print(f"{k} used tools: {v} samples")
from random import shuffle, seed
# split the dataset based on used-tools count:
# samples with one or two used tools get an 80/20 train/test split,
# samples with three or more used tools all go into the test set
one_two = [item for item in data if len(item["used_tools"]) in [1, 2]]
other = [item for item in data if len(item["used_tools"]) > 2]
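# re-seed before each shuffle so the split is reproducible across runs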
seed(42)
shuffle(one_two)
seed(42)
shuffle(other)
train_samples = one_two[: int(len(one_two) * 0.8)]
test_samples = one_two[int(len(one_two) * 0.8) :] + other
print("Train samples count:", len(train_samples))
print("Test samples count:", len(test_samples))
# count(train_samples), count(test_samples)
print("Train samples count based on used tools count:")
for k, v in count(train_samples).items():
print(f"{k} used tools: {v} samples")
print("Test samples count based on used tools count:")
for k, v in count(test_samples).items():
print(f"{k} used tools: {v} samples")
train_dataset = generate_dataset(train_samples)
test_dataset = generate_dataset(test_samples)
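# reshuffle within each split so the 3+-tool samples are not all grouped at the end of the test set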
seed(42)
shuffle(train_dataset["samples"])
seed(42)
shuffle(test_dataset["samples"])
print("Number of tools in train dataset:", len(train_dataset["tools"]))
print("Number of samples in train dataset:", len(train_dataset["samples"]))
print("Number of tools in test dataset:", len(test_dataset["tools"]))
print("Number of samples in test dataset:", len(test_dataset["samples"]))
import os
os.makedirs("./datasets/mixed", exist_ok=True)
with open("./datasets/mixed/train.json", "w") as f:
    json.dump(train_dataset, f, indent=2)
with open("./datasets/mixed/test.json", "w") as f:
    json.dump(test_dataset, f, indent=2)
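
# Optional sanity check (a sketch, not part of the original script): every tool id
# referenced by a sample should exist in that split's tool registry.
# with open("./datasets/mixed/train.json") as f:
#     train = json.load(f)
# assert all(
#     tool_id < len(train["tools"]) for s in train["samples"] for tool_id in s["tools"]
# )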