"""Convert an Alpaca-style instruction dataset into a conversation-style JSON file.

The input is a JSON array of objects carrying ``instruction``, ``output`` and an
optional ``input`` field. Each object becomes one record with a string ``id``
(counting from 1) and a two-turn ``conversations`` list: a ``human`` turn holding
the formatted prompt and a ``gpt`` turn holding the original ``output``.
"""
import argparse
import json
import pathlib

# Prompt from stanford alpaca's training script
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
}


def main(args):
    """Read the dataset at ``args.data_path`` and write the converted records.

    Args:
        args: Any object exposing ``data_path`` (JSON file to read) and
            ``output_path`` (JSON file to write), e.g. an ``argparse.Namespace``.

    Raises:
        KeyError: If an example lacks ``output``, or lacks a field referenced by
            the prompt template selected for it (``instruction``, ``input``).
        OSError: If the input cannot be opened or the output cannot be written.
        json.JSONDecodeError: If the input file is not valid JSON.
    """
    data_path = pathlib.Path(args.data_path)
    # JSON interchange files are UTF-8; do not depend on the platform locale.
    with data_path.open(encoding="utf-8") as f:
        data = json.load(f)

    prompt_input = PROMPT_DICT["prompt_input"]
    prompt_no_input = PROMPT_DICT["prompt_no_input"]

    new_data = []
    for idx, example in enumerate(data, start=1):
        # A missing or empty "input" selects the template without an Input section.
        if example.get("input", "") != "":
            template = prompt_input
        else:
            template = prompt_no_input
        new_data.append(
            {
                "id": str(idx),
                "conversations": [
                    {
                        "from": "human",
                        "value": template.format_map(example),
                    },
                    {
                        "from": "gpt",
                        "value": example["output"],
                    },
                ],
            }
        )

    # The context manager guarantees the output is flushed and closed, even if
    # serialization fails part-way through.
    with open(args.output_path, "w", encoding="utf-8") as out_file:
        json.dump(new_data, out_file, indent=2)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, default="alpaca-data.json")
    parser.add_argument(
        "--output_path", type=str, default="alpaca-data-conversation.json"
    )
    args = parser.parse_args()
    main(args)