Text Generation
Safetensors
Chinese
English
conversational
wangrongsheng committed on
Commit
40d90bf
1 Parent(s): a547042
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/alpaca_data_zh_51k-clean.json filter=lfs diff=lfs merge=lfs -text
37
+ data/alpaca_gpt4_data_zh-clean.json filter=lfs diff=lfs merge=lfs -text
38
+ data/sharegpt-70k.json filter=lfs diff=lfs merge=lfs -text
data/alpaca_data_zh_51k-clean.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1c1962ed88f95f87ecbe70addd816fa3ade0ee5494a220a3c4972429e7cf111
3
+ size 18810090
data/alpaca_gpt4_data_zh-clean.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:951f1331cacabc7b5de2a5d72592a103be0676daba8d92ae7c67b061639e0f46
3
+ size 35100511
data/read_data.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Convert a JSON Lines conversation dump into an instruction-tuning JSON file.

Each input line is a JSON object with a ``conversation`` list whose items
carry ``human`` and ``assistant`` entries. Every non-empty conversation
becomes one output record: the last turn supplies ``instruction`` and
``output``, and all earlier turns are stored in ``history``.
"""
import json

try:
    from tqdm import tqdm
except ImportError:  # tqdm only draws a progress bar; the conversion works without it
    def tqdm(iterable):
        """Fallback for a missing tqdm: return *iterable* unchanged."""
        return iterable

jsonl_file_path = 'common_zh_70k.jsonl'
output_file_path = './sharegpt-70k.json'


def convert_conversation(conversation):
    """Build one training record from a list of conversation turns.

    Args:
        conversation: Sequence of turns, each a mapping with ``human`` and
            ``assistant`` keys.

    Returns:
        A dict with ``instruction``, ``input``, ``output`` and ``history``
        keys, or ``None`` when *conversation* is empty. ``history`` holds
        ``[human, assistant]`` pairs for every turn before the last one, so
        it is an empty list for a single-turn conversation.

    Raises:
        KeyError: If a turn lacks the ``human`` or ``assistant`` key.
    """
    if not conversation:
        return None

    *earlier_turns, last_turn = conversation
    return {
        "instruction": str(last_turn['human']),
        "input": "",
        "output": str(last_turn['assistant']),
        "history": [
            [str(turn['human']), str(turn['assistant'])]
            for turn in earlier_turns
        ],
    }


def convert_lines(lines):
    """Parse JSON Lines text and return the list of converted records.

    Args:
        lines: Iterable of strings, one JSON object per string.

    Returns:
        A list with one record per non-empty conversation, in input order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an object has no ``conversation`` key.
    """
    results = []
    for line in lines:
        stripped = line.strip()
        if not stripped:
            # A blank line (e.g. a stray newline at the end of the file)
            # would make json.loads raise and abort the whole conversion.
            continue
        record = convert_conversation(json.loads(stripped)['conversation'])
        if record is not None:
            results.append(record)
    return results


def main():
    """Read ``jsonl_file_path``, convert it, and write ``output_file_path``."""
    # Iterate the file line by line; tqdm reports progress while reading.
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        results = convert_lines(tqdm(file))

    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    with open(output_file_path, 'w', encoding="utf-8") as f1:
        json.dump(results, f1, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    main()
data/sharegpt-70k.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32bc6e7016fbdab5ee97a97bfb275246a5514b1326d8abfd71f1307b64e9ea8f
3
+ size 287978587