aapot committed on
Commit
3323445
1 Parent(s): eb43aa0

Add convbert generator model

config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "architectures": [
+     "ConvBertForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "conv_kernel_size": 9,
+   "embedding_size": 768,
+   "eos_token_id": 2,
+   "head_ratio": 2,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 256,
+   "initializer_range": 0.02,
+   "intermediate_size": 1024,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "convbert",
+   "num_attention_heads": 4,
+   "num_groups": 1,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "torch_dtype": "float32",
+   "transformers_version": "4.17.0.dev0",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
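The config above defines the ELECTRA-style generator: it keeps 768-dimensional embeddings but shrinks the transformer to hidden_size 256 with 4 attention heads, so ConvBERT inserts an embeddings projection between the two (which is why the conversion script below maps convbert.embeddings_project). A minimal sketch of instantiating the still randomly initialized generator from this file, assuming it is saved locally as config.json:

from transformers import ConvBertConfig, ConvBertForMaskedLM

config = ConvBertConfig.from_json_file("config.json")  # local path is an assumption
model = ConvBertForMaskedLM(config)  # random weights until the TF checkpoint is converted
print(f"{model.num_parameters():,} parameters")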
convert_original_convbert_tf_checkpoint_to_generator_pytorch.py ADDED
@@ -0,0 +1,150 @@
+ # Adapted from https://github.com/huggingface/transformers/issues/9920#issuecomment-770970712
+
+ import torch
+ import os
+
+ import tensorflow as tf
+
+ from transformers import ConvBertConfig, ConvBertForMaskedLM, ConvBertPreTrainedModel
+ from transformers.utils import logging
+ from operator import attrgetter
+
+ logger = logging.get_logger(__name__)
+
+ config_file = "/researchdisk/convbert-base-generator-finnish/config.json"
+ tf_path = "/researchdisk/convbert-base-finnish/renamed-model.ckpt"
+ pytorch_dump_path = "/researchdisk/convbert-base-generator-finnish"
+ config = ConvBertConfig.from_json_file(config_file)
+
+ model = ConvBertForMaskedLM(config)
+
+ def load_tf_weights_in_convbert(model, config, tf_checkpoint_path):
+     """Load TF checkpoint weights into a PyTorch ConvBERT model."""
+     try:
+         import tensorflow as tf
+     except ImportError:
+         logger.error(
+             "Loading a TensorFlow model in PyTorch requires TensorFlow to be installed. Please see "
+             "https://www.tensorflow.org/install/ for installation instructions."
+         )
+         raise
+     tf_path = os.path.abspath(tf_checkpoint_path)
+     logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
+     # Load all variables from the TF checkpoint into a name -> numpy array dict
+     init_vars = tf.train.list_variables(tf_path)
+     tf_data = {}
+     for name, shape in init_vars:
+         logger.info("Loading TF weight {} with shape {}".format(name, shape))
+         array = tf.train.load_variable(tf_path, name)
+         tf_data[name] = array
+
+     param_mapping = {  # PyTorch parameter name -> TF variable name
+         "convbert.embeddings.word_embeddings.weight": "electra/embeddings/word_embeddings",
+         "convbert.embeddings.position_embeddings.weight": "electra/embeddings/position_embeddings",
+         "convbert.embeddings.token_type_embeddings.weight": "electra/embeddings/token_type_embeddings",
+         "convbert.embeddings.LayerNorm.weight": "electra/embeddings/LayerNorm/gamma",
+         "convbert.embeddings.LayerNorm.bias": "electra/embeddings/LayerNorm/beta",
+         "convbert.embeddings_project.weight": "generator/embeddings_project/kernel",
+         "convbert.embeddings_project.bias": "generator/embeddings_project/bias",
+         "generator_predictions.LayerNorm.weight": "generator_predictions/LayerNorm/gamma",
+         "generator_predictions.LayerNorm.bias": "generator_predictions/LayerNorm/beta",
+         "generator_predictions.dense.weight": "generator_predictions/dense/kernel",
+         "generator_predictions.dense.bias": "generator_predictions/dense/bias",
+         "generator_lm_head.bias": "generator_predictions/output_bias",
+     }
+     if config.num_groups > 1:
+         group_dense_name = "g_dense"
+     else:
+         group_dense_name = "dense"
+
+     for j in range(config.num_hidden_layers):
+         param_mapping[
+             f"convbert.encoder.layer.{j}.attention.self.query.weight"
+         ] = f"generator/encoder/layer_{j}/attention/self/query/kernel"
+         param_mapping[
+             f"convbert.encoder.layer.{j}.attention.self.query.bias"
+         ] = f"generator/encoder/layer_{j}/attention/self/query/bias"
+         param_mapping[
+             f"convbert.encoder.layer.{j}.attention.self.key.weight"
+         ] = f"generator/encoder/layer_{j}/attention/self/key/kernel"
+         param_mapping[
+             f"convbert.encoder.layer.{j}.attention.self.key.bias"
+         ] = f"generator/encoder/layer_{j}/attention/self/key/bias"
+         param_mapping[
+             f"convbert.encoder.layer.{j}.attention.self.value.weight"
+         ] = f"generator/encoder/layer_{j}/attention/self/value/kernel"
+         param_mapping[
+             f"convbert.encoder.layer.{j}.attention.self.value.bias"
+         ] = f"generator/encoder/layer_{j}/attention/self/value/bias"
+         param_mapping[
+             f"convbert.encoder.layer.{j}.attention.self.key_conv_attn_layer.depthwise.weight"
+         ] = f"generator/encoder/layer_{j}/attention/self/conv_attn_key/depthwise_kernel"
+         param_mapping[
+             f"convbert.encoder.layer.{j}.attention.self.key_conv_attn_layer.pointwise.weight"
+         ] = f"generator/encoder/layer_{j}/attention/self/conv_attn_key/pointwise_kernel"
+         param_mapping[
+             f"convbert.encoder.layer.{j}.attention.self.key_conv_attn_layer.bias"
+         ] = f"generator/encoder/layer_{j}/attention/self/conv_attn_key/bias"
+         param_mapping[
+             f"convbert.encoder.layer.{j}.attention.self.conv_kernel_layer.weight"
+         ] = f"generator/encoder/layer_{j}/attention/self/conv_attn_kernel/kernel"
+         param_mapping[
+             f"convbert.encoder.layer.{j}.attention.self.conv_kernel_layer.bias"
+         ] = f"generator/encoder/layer_{j}/attention/self/conv_attn_kernel/bias"
+         param_mapping[
+             f"convbert.encoder.layer.{j}.attention.self.conv_out_layer.weight"
+         ] = f"generator/encoder/layer_{j}/attention/self/conv_attn_point/kernel"
+         param_mapping[
+             f"convbert.encoder.layer.{j}.attention.self.conv_out_layer.bias"
+         ] = f"generator/encoder/layer_{j}/attention/self/conv_attn_point/bias"
+         param_mapping[
+             f"convbert.encoder.layer.{j}.attention.output.dense.weight"
+         ] = f"generator/encoder/layer_{j}/attention/output/dense/kernel"
+         param_mapping[
+             f"convbert.encoder.layer.{j}.attention.output.LayerNorm.weight"
+         ] = f"generator/encoder/layer_{j}/attention/output/LayerNorm/gamma"
+         param_mapping[
+             f"convbert.encoder.layer.{j}.attention.output.dense.bias"
+         ] = f"generator/encoder/layer_{j}/attention/output/dense/bias"
+         param_mapping[
+             f"convbert.encoder.layer.{j}.attention.output.LayerNorm.bias"
+         ] = f"generator/encoder/layer_{j}/attention/output/LayerNorm/beta"
+         param_mapping[
+             f"convbert.encoder.layer.{j}.intermediate.dense.weight"
+         ] = f"generator/encoder/layer_{j}/intermediate/{group_dense_name}/kernel"
+         param_mapping[
+             f"convbert.encoder.layer.{j}.intermediate.dense.bias"
+         ] = f"generator/encoder/layer_{j}/intermediate/{group_dense_name}/bias"
+         param_mapping[
+             f"convbert.encoder.layer.{j}.output.dense.weight"
+         ] = f"generator/encoder/layer_{j}/output/{group_dense_name}/kernel"
+         param_mapping[
+             f"convbert.encoder.layer.{j}.output.dense.bias"
+         ] = f"generator/encoder/layer_{j}/output/{group_dense_name}/bias"
+         param_mapping[
+             f"convbert.encoder.layer.{j}.output.LayerNorm.weight"
+         ] = f"generator/encoder/layer_{j}/output/LayerNorm/gamma"
+         param_mapping[f"convbert.encoder.layer.{j}.output.LayerNorm.bias"] = f"generator/encoder/layer_{j}/output/LayerNorm/beta"
+
+     for param in model.named_parameters():  # copy each TF array into the matching PyTorch parameter
+         param_name = param[0]
+         retriever = attrgetter(param_name)
+         result = retriever(model)
+         tf_name = param_mapping[param_name]
+         value = torch.from_numpy(tf_data[tf_name])
+         logger.info(f"TF: {tf_name}, PT: {param_name}")
+         if tf_name.endswith("/kernel"):
+             if not tf_name.endswith("/intermediate/g_dense/kernel"):
+                 if not tf_name.endswith("/output/g_dense/kernel"):
+                     value = value.T
+         if tf_name.endswith("/depthwise_kernel"):
+             value = value.permute(1, 2, 0)  # reorder depthwise conv kernel axes to the PyTorch layout
+         if tf_name.endswith("/pointwise_kernel"):
+             value = value.permute(2, 1, 0)  # reorder pointwise conv kernel axes to the PyTorch layout
+         if tf_name.endswith("/conv_attn_key/bias"):
+             value = value.unsqueeze(-1)
+         result.data = value
+     return model
+
+ model = load_tf_weights_in_convbert(model, config, tf_path)
+ model.save_pretrained(pytorch_dump_path)
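As an illustrative check (not part of this commit), the converted generator can be loaded back and exercised with a fill-mask pipeline. The path matches pytorch_dump_path from the script above, it is assumed to also contain the tokenizer files from this repo, and the Finnish example sentence is only a placeholder:

from transformers import AutoTokenizer, ConvBertForMaskedLM, pipeline

path = "/researchdisk/convbert-base-generator-finnish"  # pytorch_dump_path from the script above
model = ConvBertForMaskedLM.from_pretrained(path)
tokenizer = AutoTokenizer.from_pretrained(path)  # assumes the repo's tokenizer files are present there
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
print(fill_mask("Helsinki on Suomen [MASK]."))  # hypothetical example sentence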
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:403378cbb9fb6cf606824f4d46ccf64fa7564787ed8054dee65570e792656df7
+ size 194453503
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-cased", "tokenizer_class": "BertTokenizer"}
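The tokenizer_config above selects a cased BertTokenizer with model_max_length 512, built on the vocab.txt below. A minimal sketch of loading it, assuming this repo's tokenizer files are in the current directory:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(".")  # "." is an assumption; point at this repo's files
print(tokenizer.tokenize("Tämä on esimerkkilause."))  # illustrative Finnish sentence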
vocab.txt ADDED
The diff for this file is too large to render. See raw diff