""" Convert alpaca dataset into sharegpt format. Usage: python3 -m fastchat.data.convert_alpaca --in alpaca_data.json """ import argparse import json from transformers import AutoTokenizer, AutoModelForCausalLM import numpy as np if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--in-file", type=str) parser.add_argument("--out-file", type=str) args = parser.parse_args() content = json.load(open(args.in_file, "r")) new_content = [] for i, c in enumerate(content): if len(c["input"].strip()) > 1: q, a = c["instruction"] + "\nInput:\n" + c["input"], c["output"] else: q, a = c["instruction"], c["output"] new_content.append( { "id": f"alpaca_{i}", "conversations": [ {"from": "human", "value": q}, {"from": "gpt", "value": a}, ], } ) print(f"#out: {len(new_content)}") json.dump(new_content, open(args.out_file, "w"), indent=2, ensure_ascii=False)