|
import argparse |
|
import json |
|
import pathlib |
|
|
|
|
|
# Prompt templates, filled in with ``str.format_map`` from one example dict.
# Each template ends at "### Response:" so the expected answer can follow it.
PROMPT_DICT = {
    # Template with an extra "### Input:" section; needs both the
    # ``instruction`` and ``input`` fields.
    "prompt_input": (
        "Below is an instruction that describes a task, "
        "paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Input:\n{input}\n\n"
        "### Response:"
    ),
    # Template without the input section; needs only ``instruction``.
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Response:"
    ),
}
|
|
|
|
|
def _to_conversation(example, conv_id):
    """Build one conversation record from a single instruction example.

    The "input" template is used only when the example has a non-empty
    ``input`` field; otherwise the shorter no-input template is used.
    """
    if example.get("input", "") != "":
        template = PROMPT_DICT["prompt_input"]
    else:
        template = PROMPT_DICT["prompt_no_input"]

    return {
        "id": str(conv_id),
        "conversations": [
            {"from": "human", "value": template.format_map(example)},
            {"from": "gpt", "value": example["output"]},
        ],
    }


def main(args):
    """Convert an instruction dataset JSON file into conversation format.

    Reads a JSON list of example dicts from ``args.data_path``, renders each
    example's prompt with the matching ``PROMPT_DICT`` template, and writes a
    JSON list of ``{"id", "conversations"}`` records to ``args.output_path``.
    Ids are the 1-based position of each example, as strings.

    Args:
        args: Object exposing ``data_path`` and ``output_path`` attributes
            (e.g. an ``argparse.Namespace``).

    Raises:
        KeyError: If an example lacks ``output`` or a field the selected
            template references.
    """
    data_path = pathlib.Path(args.data_path)
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with data_path.open(encoding="utf-8") as f:
        data = json.load(f)

    # Build every record before touching the output file, so a malformed
    # example fails without truncating an existing output.
    new_data = [
        _to_conversation(example, conv_id)
        for conv_id, example in enumerate(data, start=1)
    ]

    # Context manager guarantees the output is flushed and closed.
    with open(args.output_path, "w", encoding="utf-8") as out_file:
        json.dump(new_data, out_file, indent=2)
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: both options are plain strings with default file
    # names, so the script can be run without any arguments.
    parser = argparse.ArgumentParser()
    for option, default_file in (
        ("--data_path", "alpaca-data.json"),
        ("--output_path", "alpaca-data-conversation.json"),
    ):
        parser.add_argument(option, type=str, default=default_file)

    args = parser.parse_args()
    main(args)
|
|