wangrongsheng
committed on
Commit
•
40d90bf
1
Parent(s):
a547042
add data
Browse files- .gitattributes +3 -0
- data/alpaca_data_zh_51k-clean.json +3 -0
- data/alpaca_gpt4_data_zh-clean.json +3 -0
- data/read_data.py +45 -0
- data/sharegpt-70k.json +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
data/alpaca_data_zh_51k-clean.json filter=lfs diff=lfs merge=lfs -text
|
37 |
+
data/alpaca_gpt4_data_zh-clean.json filter=lfs diff=lfs merge=lfs -text
|
38 |
+
data/sharegpt-70k.json filter=lfs diff=lfs merge=lfs -text
|
data/alpaca_data_zh_51k-clean.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f1c1962ed88f95f87ecbe70addd816fa3ade0ee5494a220a3c4972429e7cf111
|
3 |
+
size 18810090
|
data/alpaca_gpt4_data_zh-clean.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:951f1331cacabc7b5de2a5d72592a103be0676daba8d92ae7c67b061639e0f46
|
3 |
+
size 35100511
|
data/read_data.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
from tqdm import tqdm
|
3 |
+
|
4 |
+
# Input: a JSON Lines file with one object per line, each carrying a
# 'conversation' list of turns that have 'human' and 'assistant' keys.
jsonl_file_path = 'common_zh_70k.jsonl'
# Output: a single JSON array of instruction/input/output/history records.
output_file_path = './sharegpt-70k.json'


def conversation_to_record(conversation):
    """Convert one multi-turn conversation into an instruction record.

    The last turn becomes the ``instruction``/``output`` pair and every
    earlier turn is kept, in order, as a ``[human, assistant]`` pair in
    ``history``. Values are coerced with ``str()``.

    Args:
        conversation: list of dicts, each with 'human' and 'assistant' keys.

    Returns:
        The record dict, or ``None`` when the conversation has no turns.

    Raises:
        KeyError: if a turn lacks the 'human' or 'assistant' key.
    """
    if not conversation:
        return None

    # A single-turn conversation is the general case with an empty history,
    # so one code path covers both situations.
    *earlier_turns, final_turn = conversation
    return {
        "instruction": str(final_turn['human']),
        "input": "",
        "output": str(final_turn['assistant']),
        "history": [
            [str(turn['human']), str(turn['assistant'])]
            for turn in earlier_turns
        ],
    }


def convert_lines(lines):
    """Parse an iterable of JSON Lines strings into instruction records.

    Blank or whitespace-only lines are skipped instead of being handed to
    ``json.loads`` (which would raise on an empty string). Objects whose
    'conversation' list is empty produce no record.

    Args:
        lines: iterable of strings, one JSON object per non-blank line.

    Returns:
        List of record dicts in input order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if an object has no 'conversation' key.
    """
    records = []
    for raw_line in lines:
        stripped = raw_line.strip()
        if not stripped:
            continue
        json_object = json.loads(stripped)
        record = conversation_to_record(json_object['conversation'])
        if record is not None:
            records.append(record)
    return records


def main():
    """Read the JSONL input file and write the converted records as JSON."""
    # Iterate the file line by line; tqdm only adds a progress display.
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        results = convert_lines(tqdm(file))

    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    with open(output_file_path, 'w', encoding="utf-8") as f1:
        json.dump(results, f1, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    main()
|
data/sharegpt-70k.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:32bc6e7016fbdab5ee97a97bfb275246a5514b1326d8abfd71f1307b64e9ea8f
|
3 |
+
size 287978587
|