Spaces:
Paused
Paused
File size: 1,822 Bytes
4721aa1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
#! /usr/bin/env python
import json
from collections import Counter
from argparse import ArgumentParser
import os
# Exceptions a structurally malformed record can raise while being converted:
# missing keys (KeyError), too-short steps (IndexError), wrong value types
# (TypeError / AttributeError), and bad tuple arity or invalid JSON in the
# tool parameters (ValueError, which json.JSONDecodeError subclasses).
# Anything else (e.g. KeyboardInterrupt) is deliberately NOT swallowed.
MALFORMED_RECORD_ERRORS = (
    KeyError,
    IndexError,
    TypeError,
    ValueError,
    AttributeError,
)


def convert_instance(instance):
    """Turn one instance record into a list of chat messages.

    The record is read through the keys ``input``, ``intermediate_steps``,
    ``Final Thought`` and ``output``.  Each step is indexed as
    ``step[0] == (tool_name, params, react)`` and ``step[1] == observation``,
    where ``params`` is a JSON-encoded string.

    Returns:
        A list starting with the user message, followed by an
        assistant-thought / tool-call pair per step, and ending with an
        assistant message holding the final thought and the output joined
        by a newline.

    Raises:
        One of ``MALFORMED_RECORD_ERRORS`` when the record does not have the
        shape described above.
    """
    conversation = [{
        "role": "user",
        "content": instance["input"],
    }]
    for step in instance["intermediate_steps"]:
        tool_name, params, react = step[0]
        # Keep only the text that precedes the first "Action:" marker.
        thought = react.split("Action:")[0].strip()
        observation = step[1]
        conversation.append({
            "role": "assistant",
            "content": thought,
        })
        conversation.append({
            "role": "tool",
            "name": tool_name,
            "parameters": json.loads(params),
            "observation": observation,
        })
    conversation.append({
        "role": "assistant",
        "content": instance["Final Thought"] + "\n" + instance["output"],
    })
    return conversation


def build_examples(data):
    """Convert every instance of every setting in *data*.

    Each setting is read through its ``NLDocumentation`` and ``Instances``
    keys; a setting missing either key raises ``KeyError`` (only per-instance
    failures are skipped).

    Returns:
        ``(train_examples, err_count)`` where ``train_examples`` is a list of
        ``{"tools": [documentation], "conversations": [...]}`` dicts and
        ``err_count`` is the number of instances skipped as malformed.
    """
    train_examples = []
    err_count = 0
    for setting in data:
        api_desc = [setting["NLDocumentation"]]
        for instance in setting["Instances"]:
            try:
                conversation = convert_instance(instance)
            except MALFORMED_RECORD_ERRORS:
                # Best effort: count the bad record and carry on.
                err_count += 1
            else:
                train_examples.append({
                    "tools": api_desc,
                    "conversations": conversation,
                })
    return train_examples, err_count


def main(argv=None):
    """Command-line entry point.

    Reads the JSON file given by ``--path``, prints the error count, the
    number of converted examples and the distribution of conversation
    lengths, then writes one JSON object per line to
    ``formatted_data/tool_alpaca.jsonl`` (relative to the working directory).

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = ArgumentParser()
    parser.add_argument("--path", type=str, required=True)
    args = parser.parse_args(argv)

    # Explicit UTF-8: do not depend on the platform's locale encoding.
    with open(args.path, encoding="utf-8") as f:
        data = json.load(f)

    train_examples, err_count = build_examples(data)

    print("err_count:", err_count)
    print("train_examples:", len(train_examples))
    print(
        "conversation distribution:",
        Counter(len(e["conversations"]) for e in train_examples),
    )

    os.makedirs("formatted_data", exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened with an encoding that can represent them on every platform.
    with open("formatted_data/tool_alpaca.jsonl", "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(e, ensure_ascii=False) + "\n" for e in train_examples
        )


if __name__ == "__main__":
    main()