|
import argparse |
|
import jsonlines |
|
import json |
|
from tqdm import tqdm |
|
import uuid |
|
|
|
parser = argparse.ArgumentParser() |
|
parser.add_argument("--in-file", type=str, default="flan1m-alpaca-uncensored.jsonl") |
|
parser.add_argument("--out-file", type=str, default="flan1m-sharegpt-deduped.json") |
|
args = parser.parse_args() |
|
in_file = args.in_file |
|
out_file = args.out_file |
|
|
|
f = open(out_file, "w", encoding="utf-8") |
|
|
|
questions = {} |
|
|
|
out = [] |
|
with jsonlines.open(in_file) as reader: |
|
for obj in tqdm(reader): |
|
if questions.get(obj["instruction"] + obj["input"]) is None: |
|
questions[obj["instruction"] + obj["input"]] = True |
|
out.append( |
|
{ |
|
"id": f"{uuid.uuid4()}", |
|
"bot": "dolphin", |
|
"training": obj["instruction"], |
|
"conversations": [ |
|
{"from": "human", "value": obj["input"]}, |
|
{"from": "gpt", "value": obj["output"]}, |
|
], |
|
} |
|
) |
|
json.dump(out, f, ensure_ascii=False) |
|
f.close() |
|
|