Spaces:
Paused
Paused
#! /usr/bin/env python
"""Flatten a JSON file of tool-use traces into a JSONL file of conversations.

The input (``--path``) is a JSON list of "settings".  Each setting supplies
``NLDocumentation`` (the tool description) and ``Instances``; every instance
is turned into a list of chat messages.  Instances that cannot be converted
are counted and skipped.  The result is written to
``formatted_data/tool_alpaca.jsonl``, one JSON object per line.
"""
import json
from collections import Counter
from argparse import ArgumentParser
import os

# Exceptions a malformed instance can raise while being converted: missing
# keys / short steps (KeyError, IndexError), wrong container or operand types
# (TypeError, AttributeError), and a step that does not unpack into three
# items or carries invalid JSON parameters (ValueError, which also covers
# json.JSONDecodeError).  Anything else, e.g. KeyboardInterrupt, propagates.
MALFORMED_INSTANCE_ERRORS = (
    KeyError,
    IndexError,
    TypeError,
    ValueError,
    AttributeError,
)

OUTPUT_DIR = "formatted_data"
OUTPUT_FILE = "formatted_data/tool_alpaca.jsonl"


def build_conversation(instance):
    """Return the chat-message list for a single instance.

    The conversation starts with the user message (``instance['input']``).
    Each entry of ``instance['intermediate_steps']`` is read as
    ``((tool_name, params, react), observation)`` and contributes an
    assistant message (the part of ``react`` before ``"Action:"``, stripped)
    followed by a tool message whose ``parameters`` are ``params`` decoded
    from JSON.  The last message is the assistant's
    ``'Final Thought'`` and ``'output'`` joined by a newline.

    Raises one of ``MALFORMED_INSTANCE_ERRORS`` when the instance does not
    have the expected shape.
    """
    conv = [{
        "role": "user",
        "content": instance['input'],
    }]
    for step in instance['intermediate_steps']:
        tool_name, params, react = step[0]
        # Keep only the reasoning text; the "Action:" part is represented by
        # the structured tool message that follows.
        step_thought = react.split("Action:")[0].strip()
        observation = step[1]
        conv.append({
            "role": "assistant",
            "content": step_thought,
        })
        conv.append({
            "role": "tool",
            "name": tool_name,
            "parameters": json.loads(params),
            "observation": observation,
        })
    conv.append({
        "role": "assistant",
        "content": instance['Final Thought'] + "\n" + instance['output'],
    })
    return conv


def format_settings(data):
    """Convert every instance of every setting in *data*.

    Returns a ``(train_examples, err_count)`` tuple: the list of
    ``{"tools": [...], "conversations": [...]}`` records that converted
    cleanly, and the number of instances skipped as malformed.
    """
    train_examples = []
    err_count = 0
    for setting in data:
        api_desc = [setting["NLDocumentation"]]
        for instance in setting["Instances"]:
            try:
                conv = build_conversation(instance)
            except MALFORMED_INSTANCE_ERRORS:
                # Best effort: skip the bad record but keep a tally.
                err_count += 1
            else:
                train_examples.append({
                    "tools": api_desc,
                    "conversations": conv
                })
    return train_examples, err_count


def main(argv=None):
    """Parse ``--path``, convert the file, print statistics, write the JSONL."""
    parser = ArgumentParser()
    parser.add_argument("--path", type=str, required=True)
    args = parser.parse_args(argv)

    # Explicit UTF-8: JSON is UTF-8 by specification, and the output below is
    # written with ensure_ascii=False, so the locale default must not be used.
    with open(args.path, encoding="utf-8") as f:
        data = json.load(f)

    train_examples, err_count = format_settings(data)

    print("err_count:", err_count)
    print("train_examples:", len(train_examples))
    print("conversation distribution:", Counter(len(e["conversations"]) for e in train_examples))

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for e in train_examples:
            f.write(json.dumps(e, ensure_ascii=False) + "\n")


if __name__ == "__main__":
    main()