Text Generation
Transformers
English
mixtral
legal
conversational
Inference Endpoints
redactable-dolphin-mixtral / dedupeToShareGpt.py
d-delaurier's picture
Upload 11 files
30e605a
raw
history blame
1.08 kB
import argparse
import jsonlines
import json
from tqdm import tqdm
import uuid
parser = argparse.ArgumentParser()
parser.add_argument("--in-file", type=str, default="flan1m-alpaca-uncensored.jsonl")
parser.add_argument("--out-file", type=str, default="flan1m-sharegpt-deduped.json")
args = parser.parse_args()
in_file = args.in_file
out_file = args.out_file
f = open(out_file, "w", encoding="utf-8")
questions = {}
out = []
with jsonlines.open(in_file) as reader:
for obj in tqdm(reader):
if questions.get(obj["instruction"] + obj["input"]) is None:
questions[obj["instruction"] + obj["input"]] = True
out.append(
{
"id": f"{uuid.uuid4()}",
"bot": "dolphin",
"training": obj["instruction"],
"conversations": [
{"from": "human", "value": obj["input"]},
{"from": "gpt", "value": obj["output"]},
],
}
)
json.dump(out, f, ensure_ascii=False)
f.close()