import json
from typing import Any, Dict, Iterable, List, Optional

from tqdm import tqdm


# Convert a JSONL dump of multi-turn conversations into a single JSON array of
# records with "instruction" / "input" / "output" / "history" keys.

jsonl_file_path = 'common_zh_70k.jsonl'
output_file_path = './sharegpt-70k.json'


def build_record(conversation: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    """Build one output record from a list of conversation turns.

    Each turn is indexed with the keys ``'human'`` and ``'assistant'``. The
    last turn becomes the record's instruction/output pair and every earlier
    turn is stored, in order, as a ``[human, assistant]`` pair in ``history``.
    All values are passed through ``str()``.

    Args:
        conversation: The turns of one conversation, oldest first.

    Returns:
        The record dict, or ``None`` when ``conversation`` is empty (such
        entries produce no output record).

    Raises:
        KeyError: If a turn lacks the ``'human'`` or ``'assistant'`` key.
    """
    if not conversation:
        return None

    final_turn = conversation[-1]
    # A single-turn conversation simply yields an empty history, so one code
    # path covers both the one-turn and the multi-turn case.
    history = [
        [str(turn['human']), str(turn['assistant'])]
        for turn in conversation[:-1]
    ]
    return {
        "instruction": str(final_turn['human']),
        "input": "",
        "output": str(final_turn['assistant']),
        "history": history,
    }


def convert_lines(lines: Iterable[str]) -> List[Dict[str, Any]]:
    """Parse JSONL lines and return the converted records.

    Args:
        lines: An iterable of text lines, each holding one JSON object with a
            ``'conversation'`` key.

    Returns:
        One record per non-empty conversation, in input order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an object lacks the ``'conversation'`` key.
    """
    results = []
    for line in lines:
        stripped = line.strip()
        if not stripped:
            # Blank lines (e.g. a trailing newline at end of file) are not
            # JSON documents; skip them instead of crashing in json.loads.
            continue
        json_object = json.loads(stripped)
        record = build_record(json_object['conversation'])
        if record is not None:
            results.append(record)
    return results


def main() -> None:
    """Read ``jsonl_file_path``, convert it, and write ``output_file_path``."""
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        # tqdm only wraps the line iterator to display progress.
        results = convert_lines(tqdm(file))

    with open(output_file_path, 'w', encoding="utf-8") as f1:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump(results, f1, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    main()