import json

from tqdm import tqdm

DATASETS = ["apigen", "glaive", "toolace"]

data = []
for dataset_name in DATASETS:
    with open(f"./datasets/{dataset_name}/output.json") as f:
        subdata = json.load(f)
    subdata = [{**item, "source": dataset_name} for item in subdata]
    data.extend(subdata)
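
# Each merged record is assumed to look roughly like the sketch below; the
# concrete values are hypothetical, but the keys "instruction", "tools",
# "used_tools", and the added "source" are the ones this script relies on:
#
# {
#     "instruction": "What's the weather like in Paris today?",
#     "tools": [{"name": "get_weather", "description": "..."}, ...],
#     "used_tools": ["get_weather"],
#     "source": "apigen",
# }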
# Drop samples that reference a tool not present in their own tool list.
new_data = []
for sample in data:
    tools = [tool["name"] for tool in sample["tools"]]
    if any(used_tool not in tools for used_tool in sample["used_tools"]):
        continue
    new_data.append(sample)
data = new_data
print("Number of samples:", len(data))

def generate_dataset(data):
    """Build a deduplicated tool table and map each sample's used tools to tool ids."""
    tools = {}
    dataset = []
    for sample in tqdm(data, desc="Processing samples"):
        for tool in sample["tools"]:
            if tool["name"] in tools:
                continue
            tools[tool["name"]] = {
                "name": tool["name"],
                "description": tool["description"],
                "id": len(tools),
                "source": sample["source"],
            }
        used_tools = []
        for tool_name in sample["used_tools"]:
            used_tools.append(tools[tool_name]["id"])
        new_sample = {
            "instruction": sample["instruction"],
            "tools": used_tools,
            "source": sample["source"],
        }
        dataset.append(new_sample)
    return {"tools": list(tools.values()), "samples": dataset}

from collections import defaultdict


def count(data):
    """Count how many samples use exactly k tools, for each k."""
    used_tools_count = defaultdict(int)
    for item in data:
        used_tools_count[len(item["used_tools"])] += 1
    return used_tools_count


print("Used tools count in dataset:")
for k, v in count(data).items():
    print(f"{k} used tools: {v} samples")
from random import shuffle, seed
# Split the dataset based on used-tools count:
#  - samples with one or two used tools get an 80/20 train/test split
#  - samples with three or more used tools all go into the test set
one_two = [item for item in data if len(item["used_tools"]) in [1, 2]]
other = [item for item in data if len(item["used_tools"]) > 2]
seed(42)
shuffle(one_two)
seed(42)
shuffle(other)
train_samples = one_two[: int(len(one_two) * 0.8)]
test_samples = one_two[int(len(one_two) * 0.8) :] + other
print("Train samples count:", len(train_samples))
print("Test samples count:", len(test_samples))
# count(train_samples), count(test_samples)
print("Train samples count based on used tools count:")
for k, v in count(train_samples).items():
print(f"{k} used tools: {v} samples")
print("Test samples count based on used tools count:")
for k, v in count(test_samples).items():
print(f"{k} used tools: {v} samples")
train_dataset = generate_dataset(train_samples)
test_dataset = generate_dataset(test_samples)
seed(42)
shuffle(train_dataset["samples"])
seed(42)
shuffle(test_dataset["samples"])
print("Number of tools in train dataset:", len(train_dataset["tools"]))
print("Number of samples in train dataset:", len(train_dataset["samples"]))
print("Number of tools in test dataset:", len(test_dataset["tools"]))
print("Number of samples in test dataset:", len(test_dataset["samples"]))
import os
os.makedirs("./datasets/mixed", exist_ok=True)
with open("./datasets/mixed/train.json", "w") as f:
json.dump(train_dataset, f, indent=2)
with open("./datasets/mixed/test.json", "w") as f:
json.dump(test_dataset, f, indent=2)
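
# Optional sanity check, a minimal sketch assuming the files written above:
# reload both splits and verify that every tool id referenced by a sample
# exists in that split's tool table.
for split in ["train", "test"]:
    with open(f"./datasets/mixed/{split}.json") as f:
        ds = json.load(f)
    tool_ids = {tool["id"] for tool in ds["tools"]}
    assert all(tid in tool_ids for sample in ds["samples"] for tid in sample["tools"])
    print(f"{split}: {len(ds['samples'])} samples, {len(ds['tools'])} tools verified")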