Tongjilibo committed
Commit 3e5558b
1 Parent(s): 782bf57
init commit

Files changed:
- README.md +5 -0
- bert4torch_config.json +16 -0
- convert.py +113 -0
- pytorch_model.bin +3 -0
- vocab.txt +0 -0
README.md
CHANGED
@@ -1,3 +1,8 @@
 ---
 license: apache-2.0
 ---
+
+
+- Source project: https://github.com/ZhuiyiTechnology/t5-pegasus
+- The weights in this repo were obtained by downloading the TF weights and converting them with the convert script; they can be downloaded and used directly
+- These weights are only compatible with the bert4torch project
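Since the README says the weights only work with bert4torch, a minimal loading sketch may help. It assumes bert4torch is installed and that the repo files were downloaded to a local directory (the path below is hypothetical); depending on the bert4torch version, the architecture may be read from the config's "model": "mt5.1.1" key or may need to be passed explicitly.

    # Minimal sketch (not part of this repo): load the converted weights with bert4torch.
    # Assumes the files live in ./chinese_t5_pegasus_small (hypothetical local path).
    from bert4torch.models import build_transformer_model

    config_path = './chinese_t5_pegasus_small/bert4torch_config.json'
    checkpoint_path = './chinese_t5_pegasus_small/pytorch_model.bin'

    # Newer bert4torch versions pick up the architecture from the config's "model" key;
    # older ones may require passing model='mt5.1.1' explicitly.
    model = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path)
    model.eval()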
bert4torch_config.json
ADDED
@@ -0,0 +1,16 @@
{
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 512,
    "initializer_range": 0.02,
    "intermediate_size": 1024,
    "num_attention_heads": 6,
    "attention_head_size": 64,
    "num_hidden_layers": 8,
    "vocab_size": 50000,
    "relative_attention_num_buckets": 32,
    "attention_scale": false,
    "is_dropout": true,
    "model": "mt5.1.1",
    "segment_vocab_size": 0
}
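One detail worth noting about these values: in T5.1.1/mT5-style models the attention projection width is num_attention_heads × attention_head_size (here 6 × 64 = 384), which is deliberately smaller than hidden_size (512), so attention_head_size must be listed explicitly rather than derived. A quick sanity check (a sketch, assuming a local copy of the config):

    import json

    # Sketch: read the shipped config (local path assumed) and compute the attention width.
    with open('bert4torch_config.json', 'r', encoding='utf-8') as f:
        cfg = json.load(f)

    inner_dim = cfg['num_attention_heads'] * cfg['attention_head_size']
    print(inner_dim, cfg['hidden_size'])  # 384 512 -- q/k/v projection width vs. model hidden size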
convert.py
ADDED
@@ -0,0 +1,113 @@
# Convert t5_pegasus from the TF checkpoint to a PyTorch version adapted for bert4torch
# Weights: https://github.com/ZhuiyiTechnology/t5-pegasus
import torch
import tensorflow as tf
import json

choice = 'small'

if choice == 'small':
    ckpt_dir = 'E:/pretrain_ckpt/t5/sushen@chinese_t5_pegasus_small_torch/'
    tf_dir = 'E:/pretrain_ckpt/t5/sushen@chinese_t5_pegasus_small_tf/'
    torch_path = ckpt_dir + 'pytorch_model.bin'
elif choice == 'base':
    ckpt_dir = 'E:/pretrain_ckpt/t5/sushen@chinese_t5_pegasus_base_torch/'
    tf_dir = 'E:/pretrain_ckpt/t5/sushen@chinese_t5_pegasus_base_tf/'
    torch_path = ckpt_dir + 'pytorch_model.bin'
else:
    raise ValueError(f'{choice} not in pre-maintained choices')


tf_path = tf_dir + 'model.ckpt'
with open(tf_dir + 'config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)
num_layers = config['num_hidden_layers']
torch_state_dict = {}

# Mapping from TF variable names to bert4torch parameter names;
# a ##T suffix is a custom marker meaning the tensor must be transposed.
mapping = {
    'shared/embedding': 'shared.weight',
    'encoder/block_000/layer_000/SelfAttention/relative_attention_bias': 'encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight##T',
    'encoder/rms_norm/scale': 'encoder.final_layer_norm.weight',
    'decoder/block_000/layer_000/SelfAttention/relative_attention_bias': 'decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight##T',
    'decoder/rms_norm/scale': 'decoder.final_layer_norm.weight',
    'decoder/logits/kernel': 'lm_head.weight##T'
}


for i in range(num_layers):
    i1 = str(i).rjust(3, '0')
    mapping.update({
        f'encoder/block_{i1}/layer_000/SelfAttention/q': f'encoder.block.{i}.layer.0.SelfAttention.q.weight##T',
        f'encoder/block_{i1}/layer_000/SelfAttention/k': f'encoder.block.{i}.layer.0.SelfAttention.k.weight##T',
        f'encoder/block_{i1}/layer_000/SelfAttention/v': f'encoder.block.{i}.layer.0.SelfAttention.v.weight##T',
        f'encoder/block_{i1}/layer_000/SelfAttention/o': f'encoder.block.{i}.layer.0.SelfAttention.o.weight##T',
        f'encoder/block_{i1}/layer_000/rms_norm/scale': f'encoder.block.{i}.layer.0.layer_norm.weight',
        f'encoder/block_{i1}/layer_001/DenseReluDense/wi_0/kernel': f'encoder.block.{i}.layer.1.DenseReluDense.wi_0.weight##T',
        f'encoder/block_{i1}/layer_001/DenseReluDense/wi_1/kernel': f'encoder.block.{i}.layer.1.DenseReluDense.wi_1.weight##T',
        f'encoder/block_{i1}/layer_001/DenseReluDense/wo/kernel': f'encoder.block.{i}.layer.1.DenseReluDense.wo.weight##T',
        f'encoder/block_{i1}/layer_001/rms_norm/scale': f'encoder.block.{i}.layer.1.layer_norm.weight',
        f'decoder/block_{i1}/layer_000/SelfAttention/q': f'decoder.block.{i}.layer.0.SelfAttention.q.weight##T',
        f'decoder/block_{i1}/layer_000/SelfAttention/k': f'decoder.block.{i}.layer.0.SelfAttention.k.weight##T',
        f'decoder/block_{i1}/layer_000/SelfAttention/v': f'decoder.block.{i}.layer.0.SelfAttention.v.weight##T',
        f'decoder/block_{i1}/layer_000/SelfAttention/o': f'decoder.block.{i}.layer.0.SelfAttention.o.weight##T',
        f'decoder/block_{i1}/layer_000/rms_norm/scale': f'decoder.block.{i}.layer.0.layer_norm.weight',
        f'decoder/block_{i1}/layer_001/EncDecAttention/q': f'decoder.block.{i}.layer.1.EncDecAttention.q.weight##T',
        f'decoder/block_{i1}/layer_001/EncDecAttention/k': f'decoder.block.{i}.layer.1.EncDecAttention.k.weight##T',
        f'decoder/block_{i1}/layer_001/EncDecAttention/v': f'decoder.block.{i}.layer.1.EncDecAttention.v.weight##T',
        f'decoder/block_{i1}/layer_001/EncDecAttention/o': f'decoder.block.{i}.layer.1.EncDecAttention.o.weight##T',
        f'decoder/block_{i1}/layer_001/rms_norm/scale': f'decoder.block.{i}.layer.1.layer_norm.weight',
        f'decoder/block_{i1}/layer_002/DenseReluDense/wi_0/kernel': f'decoder.block.{i}.layer.2.DenseReluDense.wi_0.weight##T',
        f'decoder/block_{i1}/layer_002/DenseReluDense/wi_1/kernel': f'decoder.block.{i}.layer.2.DenseReluDense.wi_1.weight##T',
        f'decoder/block_{i1}/layer_002/DenseReluDense/wo/kernel': f'decoder.block.{i}.layer.2.DenseReluDense.wo.weight##T',
        f'decoder/block_{i1}/layer_002/rms_norm/scale': f'decoder.block.{i}.layer.2.layer_norm.weight',
    })

for k, v in mapping.items():
    # Load the TF variable and convert it to a torch tensor
    ts = torch.from_numpy(tf.train.load_variable(tf_path, k))
    if v.endswith('##T'):
        # Strip the ##T marker and transpose the TF kernel to PyTorch's (out, in) layout.
        # Note: str.rstrip('##T') would strip a character set rather than the suffix, so slice instead.
        torch_state_dict[v[:-len('##T')]] = ts.T
    else:
        torch_state_dict[v] = ts

torch.save(torch_state_dict, torch_path)

if choice == 'base':
    config = {
        "hidden_act": "gelu",
        "hidden_dropout_prob": 0.1,
        "hidden_size": 768,
        "initializer_range": 0.02,
        "intermediate_size": 2048,
        "num_attention_heads": 12,
        "attention_head_size": 64,
        "num_hidden_layers": 12,
        "vocab_size": 50000,
        "relative_attention_num_buckets": 32,
        "attention_scale": False,
        "is_dropout": True
    }
elif choice == 'small':
    config = {
        "hidden_act": "gelu",
        "hidden_dropout_prob": 0.1,
        "hidden_size": 512,
        "initializer_range": 0.02,
        "intermediate_size": 1024,
        "num_attention_heads": 6,
        "attention_head_size": 64,
        "num_hidden_layers": 8,
        "vocab_size": 50000,
        "relative_attention_num_buckets": 32,
        "attention_scale": False,
        "is_dropout": True
    }

with open(ckpt_dir + '/bert4torch_config.json', 'w') as f:
    f.write(json.dumps(config, indent=4))
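After running convert.py it is worth confirming that the saved checkpoint loads and that the ##T-marked kernels come out in PyTorch's (out_features, in_features) orientation. A minimal check, assuming the small model was converted and the output file is available locally (path hypothetical):

    import torch

    # Sketch: inspect the checkpoint written by convert.py (choice = 'small'); local path assumed.
    state_dict = torch.load('pytorch_model.bin', map_location='cpu')

    print(len(state_dict))                    # total number of converted tensors
    print(state_dict['shared.weight'].shape)  # embedding table, expected (50000, 512) for the small model
    # q projection after the ##T transpose: expected (384, 512), i.e. (num_heads*head_size, hidden_size)
    print(state_dict['encoder.block.0.layer.0.SelfAttention.q.weight'].shape)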
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1deb82235e25b61ff64c61f0dae488b511f32ced08ce73f991111ccbbbff8a86
size 381109253
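The actual weights are stored through Git LFS, so only this pointer appears in the diff. A downloaded copy can be checked against the pointer's size and sha256 oid with a short script (a sketch; the local path is assumed):

    import hashlib
    import os

    path = 'pytorch_model.bin'  # hypothetical local copy of the LFS object
    print(os.path.getsize(path))  # should be 381109253, the size recorded in the pointer

    h = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            h.update(chunk)
    print(h.hexdigest())  # should equal the oid: 1deb82235e25b61ff64c61f0dae488b511f32ced08ce73f991111ccbbbff8a86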
vocab.txt
ADDED
The diff for this file is too large to render.