NeuralChat-LLAMA-POC / fastchat /data /alpaca-converter.py
lvkaokao
update codes.
5a7ab71
raw
history blame
1.95 kB
import argparse
import json
import pathlib
# Prompt templates taken from Stanford Alpaca's training script.
# Each template is filled in with ``str.format_map`` from a dataset record.
PROMPT_DICT = {
    # Used for records that carry a non-empty "input" field.
    "prompt_input": (
        "Below is an instruction that describes a task, "
        "paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Input:\n{input}\n\n"
        "### Response:"
    ),
    # Used for records whose "input" field is empty or absent.
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Response:"
    ),
}
def main(args):
    """Convert an Alpaca-style JSON dataset into conversation-style JSON.

    Loads the JSON document at ``args.data_path`` and iterates over its
    records. Each record's prompt is rendered from ``PROMPT_DICT``: the
    ``prompt_input`` template when the record has a non-empty ``"input"``
    value, otherwise ``prompt_no_input``. The result, written to
    ``args.output_path`` with a 2-space indent, is a list of objects of the
    form ``{"id": "<1-based index>", "conversations": [human, gpt]}`` where
    the human turn holds the rendered prompt and the gpt turn holds the
    record's ``"output"``.

    Args:
        args: Object exposing ``data_path`` and ``output_path`` attributes
            (e.g. an ``argparse.Namespace``).

    Raises:
        KeyError: If a record lacks a key required by the chosen template
            or has no ``"output"`` key.
    """
    data_path = pathlib.Path(args.data_path)
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with data_path.open(encoding="utf-8") as f:
        data = json.load(f)

    new_data = []
    # Conversation ids are the 1-based position of the record, as strings.
    for idx, example in enumerate(data, start=1):
        if example.get("input", "") != "":
            template = PROMPT_DICT["prompt_input"]
        else:
            template = PROMPT_DICT["prompt_no_input"]
        new_data.append(
            {
                "id": str(idx),
                "conversations": [
                    {
                        "from": "human",
                        "value": template.format_map(example),
                    },
                    {
                        "from": "gpt",
                        "value": example["output"],
                    },
                ],
            }
        )

    # A context manager guarantees the output is flushed and closed, rather
    # than leaving the handle to be reclaimed by garbage collection.
    with open(args.output_path, "w", encoding="utf-8") as f:
        json.dump(new_data, f, indent=2)
if __name__ == "__main__":
    # Script entry point: read the input/output paths from the command line
    # (both have defaults) and run the conversion.
    cli = argparse.ArgumentParser()
    cli.add_argument("--data_path", default="alpaca-data.json", type=str)
    cli.add_argument(
        "--output_path",
        default="alpaca-data-conversation.json",
        type=str,
    )
    main(cli.parse_args())