|
import json |
|
from tqdm import tqdm |
|
|
|
# Names of the source datasets; each is read from
# ./datasets/<name>/output.json.
DATASETS = ["apigen", "glaive", "toolace"]

# Samples collected from every source dataset, each tagged with its origin.
data = []

for dataset_name in DATASETS:
    # Use an explicit encoding so loading does not depend on the platform's
    # default locale encoding.
    with open(f"./datasets/{dataset_name}/output.json", encoding="utf-8") as f:
        subdata = json.load(f)

    # Copy every item and record the dataset it came from under "source"
    # (this overrides a "source" key already present on the item).
    subdata = [{**item, "source": dataset_name} for item in subdata]

    data.extend(subdata)
|
|
|
|
|
# Keep only the samples whose used tools are all declared in the sample's own
# "tools" list; a sample referencing an undeclared tool is dropped.
new_data = []

for sample in data:
    # A set makes each membership check O(1) instead of scanning a list.
    declared_names = {tool["name"] for tool in sample["tools"]}

    # The generator lets all() stop at the first undeclared tool rather than
    # building an intermediate list of booleans.
    if all(used_tool in declared_names for used_tool in sample["used_tools"]):
        new_data.append(sample)

data = new_data

print("Number of samples:", len(data))
|
|
|
|
|
def generate_dataset(data):
    """Build a tool registry and id-based samples from raw samples.

    Each distinct tool name found in the samples' "tools" lists receives an
    integer id in order of first appearance, starting at 0. The first
    occurrence of a name fixes the description and source stored for it.

    Args:
        data: Iterable of sample dicts. Each is read for "tools" (entries
            with "name" and "description"), "used_tools" (tool names),
            "instruction" and "source".

    Returns:
        A dict with two keys: "tools", the registry entries in id order, and
        "samples", one dict per input sample holding its "instruction", the
        ids of its used tools under "tools", and its "source".

    Raises:
        KeyError: If a name in "used_tools" has not been registered by this
            or an earlier sample, or if an expected key is missing.
    """
    registry = {}
    converted = []

    for entry in tqdm(data, desc="Processing samples"):
        # Register any tool name that has not been seen before.
        for spec in entry["tools"]:
            name = spec["name"]
            if name not in registry:
                registry[name] = {
                    "name": name,
                    "description": spec["description"],
                    # The current registry size is the next free id.
                    "id": len(registry),
                    "source": entry["source"],
                }

        # Translate the used tool names into their registry ids.
        used_ids = [registry[name]["id"] for name in entry["used_tools"]]

        converted.append(
            {
                "instruction": entry["instruction"],
                "tools": used_ids,
                "source": entry["source"],
            }
        )

    return {"tools": list(registry.values()), "samples": converted}
|
|
|
|
|
from collections import defaultdict |
|
|
|
|
|
def count(data):
    """Tally samples by how many tools they use.

    Args:
        data: Iterable of dicts, each with a sized "used_tools" value.

    Returns:
        A ``defaultdict(int)`` mapping a used-tool count to the number of
        items having that count, keyed in order of first occurrence.
    """
    histogram = defaultdict(int)

    for n_used in (len(entry["used_tools"]) for entry in data):
        histogram[n_used] += 1

    return histogram
|
|
|
|
|
# Report how many samples use each number of tools across the filtered data.
print("Used tools count in dataset:")

usage_histogram = count(data)
for n_tools, n_samples in usage_histogram.items():
    print(f"{n_tools} used tools: {n_samples} samples")
|
|
|
from random import shuffle, seed |
|
|
|
|
|
|
|
|
|
# Partition samples by how many tools they use. Samples using one or two tools
# are split 80/20 between train and test; samples using more than two tools go
# to test only. Samples with no used tools match neither group and are left
# out of both splits.
one_two = []
other = []
for item in data:
    n_used = len(item["used_tools"])
    if n_used in (1, 2):
        one_two.append(item)
    elif n_used > 2:
        other.append(item)

# Re-seed before each shuffle so each ordering is reproducible on its own.
seed(42)
shuffle(one_two)

seed(42)
shuffle(other)

# The first 80% of the shuffled one/two-tool samples form the train split.
split_at = int(len(one_two) * 0.8)
train_samples = one_two[:split_at]
test_samples = one_two[split_at:] + other

print("Train samples count:", len(train_samples))
print("Test samples count:", len(test_samples))
|
|
|
|
|
# Print the used-tool-count distribution of the train split, then the test
# split.
for heading, split in (
    ("Train samples count based on used tools count:", train_samples),
    ("Test samples count based on used tools count:", test_samples),
):
    print(heading)
    for n_tools, n_samples in count(split).items():
        print(f"{n_tools} used tools: {n_samples} samples")
|
|
|
# Convert each split independently, so each one gets its own tool registry
# and id numbering.
train_dataset = generate_dataset(train_samples)
test_dataset = generate_dataset(test_samples)

# Shuffle the converted samples in place, re-seeding before each shuffle
# (train first, then test) so the result is reproducible.
for built in (train_dataset, test_dataset):
    seed(42)
    shuffle(built["samples"])

train_tools, train_rows = train_dataset["tools"], train_dataset["samples"]
print("Number of tools in train dataset:", len(train_tools))
print("Number of samples in train dataset:", len(train_rows))

test_tools, test_rows = test_dataset["tools"], test_dataset["samples"]
print("Number of tools in test dataset:", len(test_tools))
print("Number of samples in test dataset:", len(test_rows))
|
|
|
import os |
|
|
|
# Make sure the output directory exists, then write the train file followed
# by the test file as indented JSON.
os.makedirs("./datasets/mixed", exist_ok=True)

for out_path, payload in (
    ("./datasets/mixed/train.json", train_dataset),
    ("./datasets/mixed/test.json", test_dataset),
):
    with open(out_path, "w") as f:
        json.dump(payload, f, indent=2)
|
|