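"""Build a mixed tool-calling dataset.

Merges the apigen, glaive, and toolace datasets, drops samples whose
used tools are missing from their own tool list, splits train/test by
used-tool count, and writes ./datasets/mixed/{train,test}.json.
"""
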
import json
import os
from collections import defaultdict
from random import seed, shuffle

from tqdm import tqdm

# each dataset is expected at ./datasets/<name>/output.json
DATASETS = ["apigen", "glaive", "toolace"]

data = []

for dataset_name in DATASETS:
    with open(f"./datasets/{dataset_name}/output.json") as f:
        subdata = json.load(f)

    # tag each sample with its source dataset
    subdata = [{**item, "source": dataset_name} for item in subdata]

    data.extend(subdata)

# drop samples that reference a tool missing from their own tool list
new_data = []

for sample in data:
    tools = [tool["name"] for tool in sample["tools"]]

    if any(used_tool not in tools for used_tool in sample["used_tools"]):
        continue

    new_data.append(sample)

data = new_data

print("Number of samples:", len(data))


def generate_dataset(data):
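    """Assign each unique tool (by name) a sequential integer id and
    emit samples whose 'tools' field holds the ids of the tools they
    use. Ids are local to each call, so the train and test id spaces
    are independent."""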
    tools = {}
    dataset = []

    for sample in tqdm(data, desc="Processing samples"):

        for tool in sample["tools"]:
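            # tools are deduplicated by name; the first description
            # and source seen for a given name win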
            if tool["name"] in tools:
                continue

            tools[tool["name"]] = {
                "name": tool["name"],
                "description": tool["description"],
                "id": len(tools),
                "source": sample["source"],
            }

        used_tools = []

        for tool_name in sample["used_tools"]:
            used_tools.append(tools[tool_name]["id"])

        new_sample = {
            "instruction": sample["instruction"],
            "tools": used_tools,
            "source": sample["source"],
        }

        dataset.append(new_sample)

    return {"tools": list(tools.values()), "samples": dataset}


def count(data):
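    """Count samples grouped by how many tools they use."""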
    used_tools_count = defaultdict(int)

    for item in data:
        used_tools_count[len(item["used_tools"])] += 1

    return used_tools_count


print("Used tools count in dataset:")
for k, v in count(data).items():
    print(f"{k} used tools: {v} samples")

# split the dataset by used-tool count:
# one- and two-tool samples get an 80/20 train/test split;
# samples with three or more used tools all go to the test set
one_two = [item for item in data if len(item["used_tools"]) in [1, 2]]
other = [item for item in data if len(item["used_tools"]) > 2]

# fixed seed so the shuffles, and thus the split, are reproducible
seed(42)
shuffle(one_two)

seed(42)
shuffle(other)

train_samples = one_two[: int(len(one_two) * 0.8)]
test_samples = one_two[int(len(one_two) * 0.8) :] + other

print("Train samples count:", len(train_samples))
print("Test samples count:", len(test_samples))

print("Train samples count based on used tools count:")
for k, v in count(train_samples).items():
    print(f"{k} used tools: {v} samples")

print("Test samples count based on used tools count:")
for k, v in count(test_samples).items():
    print(f"{k} used tools: {v} samples")

train_dataset = generate_dataset(train_samples)
test_dataset = generate_dataset(test_samples)

# reshuffle both splits; in particular, the 3+-tool samples appended
# to the end of the test split get interleaved with the rest
seed(42)
shuffle(train_dataset["samples"])

seed(42)
shuffle(test_dataset["samples"])

print("Number of tools in train dataset:", len(train_dataset["tools"]))
print("Number of samples in train dataset:", len(train_dataset["samples"]))

print("Number of tools in test dataset:", len(test_dataset["tools"]))
print("Number of samples in test dataset:", len(test_dataset["samples"]))

os.makedirs("./datasets/mixed", exist_ok=True)

with open("./datasets/mixed/train.json", "w") as f:
    json.dump(train_dataset, f, indent=2)

with open("./datasets/mixed/test.json", "w") as f:
    json.dump(test_dataset, f, indent=2)