Text Generation
Safetensors
Chinese
English
conversational
File size: 1,584 Bytes
40d90bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import json
from tqdm import tqdm

# Convert a JSON Lines conversation dump into one JSON array of
# instruction/input/output/history records.
#
# Every non-blank input line is parsed as a JSON object whose 'conversation'
# value is read as a list of turns, each carrying 'human' and 'assistant'
# keys. The final turn becomes the instruction/output pair; all earlier turns
# are kept, in order, as [human, assistant] pairs under 'history'.
jsonl_file_path = 'common_zh_70k.jsonl'

results = []
with open(jsonl_file_path, 'r', encoding='utf-8') as file:
    for line in tqdm(file):
        line = line.strip()
        # A blank line (for example trailing newlines at the end of the file)
        # is not a JSON document; json.loads would raise on it, so skip it
        # instead of aborting the whole conversion.
        if not line:
            continue

        turns = json.loads(line)['conversation']
        # A record without any turn has nothing to emit.
        if not turns:
            continue

        # Everything before the last turn is context; this is an empty list
        # for single-turn conversations.
        history = [
            [str(turn['human']), str(turn['assistant'])]
            for turn in turns[:-1]
        ]
        last_turn = turns[-1]
        results.append({
            "instruction": str(last_turn['human']),
            "input": "",
            "output": str(last_turn['assistant']),
            "history": history,
        })

# ensure_ascii=False writes non-ASCII text as-is rather than as \u escapes.
with open('./sharegpt-70k.json', 'w', encoding="utf-8") as f1:
    json.dump(results, f1, ensure_ascii=False, indent=4)