|
import json |
|
import re |
|
|
|
# Load the raw samples from disk. The file is decoded explicitly as UTF-8 so
# that parsing does not depend on the platform's locale default encoding.
with open("./datasets/toolace/data.json", encoding="utf-8") as f:
    data = json.load(f)
|
|
|
|
|
# Leading name/description pair of each tool schema embedded in the system
# prompt. NOTE(review): the lazy groups stop at the first double quote, so a
# description containing an escaped quote (\") is cut short there.
_TOOL_SCHEMA_RE = re.compile(r'\{"name": "(.*?)", "description": "(.*?)"')

# An assistant reply counts as a tool call when it starts with "[" and has a
# later "]" on the same line.
_TOOL_CALL_RE = re.compile(r"\[.*\]")


def _normalize_tool_name(name):
    """Return *name* with spaces and slashes replaced by underscores."""
    return name.replace(" ", "_").replace("/", "_")


def _extract_called_tools(call_text):
    """Return the normalized name of every call in ``[a(...), b(...)]`` text.

    The text is scanned once while tracking bracket depth and quoted strings,
    so brackets, parentheses and commas that appear inside argument values
    are never mistaken for the start of another call.
    """
    names = []
    depth = 0  # 1 = directly inside the outer "[", 2+ = inside call arguments
    quote = None  # quote character of the string literal being skipped, if any
    escaped = False
    current = []  # characters of the candidate tool name collected at depth 1

    for ch in call_text:
        if quote is not None:
            # Inside a quoted argument value: ignore everything up to the
            # matching, unescaped closing quote.
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == quote:
                quote = None
            continue

        # Quotes are only tracked inside arguments so that an apostrophe in a
        # tool name cannot swallow the rest of the text.
        if depth >= 2 and ch in "\"'":
            quote = ch
        elif ch in "([{":
            if ch == "(" and depth == 1:
                name = "".join(current).strip()
                if name:
                    names.append(_normalize_tool_name(name))
            depth += 1
            current = []
        elif ch in ")]}":
            depth = max(depth - 1, 0)  # tolerate unbalanced closers
            current = []
        elif depth == 1:
            if ch == ",":
                current = []  # separator between two calls
            else:
                current.append(ch)

    return names


def process_sample(sample):
    """Turn one sample into instruction / tools / used_tools records.

    Args:
        sample: Mapping with a ``"system"`` string and a ``"conversations"``
            list whose items are mappings with ``"from"`` and ``"value"``.

    Returns:
        A list with one dict per user turn that is immediately followed by an
        assistant turn containing at least one tool call. Each dict has:
        ``"instruction"`` (the user turn's value), ``"tools"`` (name and
        description of every tool schema found in the system string) and
        ``"used_tools"`` (names of all tools called in the assistant reply,
        in call order). Tool names have spaces and ``/`` replaced by ``_``.
        The list is empty when no turn pair qualifies.
    """
    functions = [
        {"name": _normalize_tool_name(name), "description": desc}
        for name, desc in _TOOL_SCHEMA_RE.findall(sample["system"])
    ]

    conversations = sample["conversations"]
    results = []

    # Walk consecutive (turn, next turn) pairs.
    for entry, reply in zip(conversations, conversations[1:]):
        if entry["from"] != "user" or reply["from"] != "assistant":
            continue

        function_call = reply["value"]
        if not _TOOL_CALL_RE.match(function_call):
            continue

        used_tools = _extract_called_tools(function_call)
        if not used_tools:
            continue

        results.append(
            {
                "instruction": entry["value"],
                "tools": functions,
                "used_tools": used_tools,
            }
        )

    return results
|
|
|
|
|
from tqdm import tqdm |
|
|
|
# Run every sample through process_sample and gather the records it yields
# into one flat list, showing a progress bar while iterating.
results = []
for sample in tqdm(data, desc="Processing ToolACE samples"):
    records = process_sample(sample)
    if not records:
        # Nothing was extracted from this sample.
        continue
    results += records

# Write the collected records to disk as indented JSON.
with open("./datasets/toolace/output.json", "w") as f:
    json.dump(results, f, indent=2)
|
|