kevinwang676's picture
Upload folder using huggingface_hub
4721aa1
#! /usr/bin/env python
import json
from collections import Counter
from argparse import ArgumentParser
import os
parser = ArgumentParser()
parser.add_argument("--path", type=str, required=True)
args = parser.parse_args()
with open(args.path) as f:
data = json.load(f)
train_examples = []
err_count = 0
for setting in data:
api_desc = [setting["NLDocumentation"]]
for instance in setting["Instances"]:
try:
conv = [{
"role": "user",
"content": instance['input'],
}]
for step in instance['intermediate_steps']:
tool_name, params, react = step[0]
step_thought = react.split("Action:")[0].strip()
observation = step[1]
conv.append({
"role": "assistant",
"content": step_thought,
})
conv.append({
"role": "tool",
"name": tool_name,
"parameters": json.loads(params),
"observation": observation,
})
conv.append({
"role": "assistant",
"content": instance['Final Thought'] + "\n" + instance['output'],
})
except:
err_count += 1
else:
train_examples.append({
"tools": api_desc,
"conversations": conv
})
print("err_count:", err_count)
print("train_examples:", len(train_examples))
print("conversation distribution:", Counter([len(e["conversations"]) for e in train_examples]))
os.makedirs("formatted_data", exist_ok=True)
with open("formatted_data/tool_alpaca.jsonl", "w") as f:
for e in train_examples:
f.write(json.dumps(e, ensure_ascii=False) + "\n")