stas committed
Commit: d3cc9ec
Parent: 122a043

model corrections

config.json CHANGED
@@ -11,12 +11,12 @@
   "gap_size": 0,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
-  "hidden_size": 256,
+  "hidden_size": 32,
   "initializer_range": 0.02,
   "inner_group_num": 1,
   "intermediate_size": 128,
   "layer_norm_eps": 1e-12,
-  "max_position_embeddings": 64,
+  "max_position_embeddings": 256,
   "model_type": "albert",
   "net_structure_type": 0,
   "num_attention_heads": 2,
make-tiny-albert.py CHANGED
@@ -66,10 +66,11 @@ import os
 # workaround for fast tokenizer protobuf issue, and it's much faster too!
 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
 
-from transformers import AlbertTokenizerFast, AlbertConfig, AlbertForMaskedLM
+from transformers import AlbertTokenizer, AlbertTokenizerFast, AlbertConfig, AlbertForMaskedLM
 
 mname_orig = "albert-base-v1"
 mname_tiny = "tiny-albert"
+model_max_length = 256
 
 ### Tokenizer
 
@@ -82,7 +83,7 @@ if 1: # set to 0 to skip this after running once to speed things up during tune
     # HACK: need the sentencepiece source to get sentencepiece_model_pb2, as it doesn't get installed
     sys.path.append("../sentencepiece/python/src/sentencepiece")
     import sentencepiece_model_pb2 as model
-    tokenizer_orig = AlbertTokenizerFast.from_pretrained(mname_orig)
+    tokenizer_orig = AlbertTokenizer.from_pretrained(mname_orig)
     tokenizer_orig.save_pretrained(tmp_dir)
     with open(vocab_orig_path, 'rb') as f: data = f.read()
     # adapted from https://blog.ceshine.net/post/trim-down-sentencepiece-vocabulary/
@@ -94,7 +95,10 @@ if 1: # set to 0 to skip this after running once to speed things up during tune
     with open(vocab_short_path, 'wb') as f: f.write(m.SerializeToString())
     m = None
 
-tokenizer_fast_tiny = AlbertTokenizerFast(vocab_file=vocab_short_path)
+# albert breaks without having tokenizer.model_max_length match config.max_position_embeddings
+# these values are hardcoded in the source for official models, so we have to explicitly set those here
+tokenizer_fast_tiny = AlbertTokenizerFast(vocab_file=vocab_short_path,
+                                          model_max_length=model_max_length)
 
 
 ### Config
@@ -103,11 +107,11 @@ config_tiny = AlbertConfig.from_pretrained(mname_orig)
 print(config_tiny)
 # remember to update this to the actual config as each model is different and then shrink the numbers
 config_tiny.update(dict(
-    vocab_size=vocab_keep_items+12,
+    vocab_size=vocab_keep_items,
     embedding_size=64,
-    hidden_size=256,
+    hidden_size=32,
     intermediate_size=128,
-    max_position_embeddings=64,
+    max_position_embeddings=model_max_length,
     num_attention_heads=2,
     num_hidden_groups=1,
     num_hidden_layers=2,
@@ -122,6 +126,7 @@ model_tiny.resize_token_embeddings(len(tokenizer_fast_tiny))
 
 # Test
 inputs = tokenizer_fast_tiny("The capital of France is [MASK].", return_tensors="pt")
+#print(inputs)
 outputs = model_tiny(**inputs)
 print("Test with normal tokenizer:", len(outputs.logits[0]))
 
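Taken together with the config.json change above, the script edits encode one invariant: the tiny tokenizer's model_max_length must equal the model's max_position_embeddings (both 256 now), and the resized embedding matrix must line up with the trimmed vocabulary now that the +12 fudge on vocab_size is gone. The following is a minimal verification sketch, not part of the repo, assuming a local clone of this model in the current directory with transformers and torch installed; adjust the hard-coded numbers if the model is regenerated with different settings.

from transformers import AlbertConfig, AlbertForMaskedLM, AlbertTokenizerFast

# load the committed artifacts from a local clone of this repo ("." is illustrative)
tokenizer = AlbertTokenizerFast.from_pretrained(".")
config = AlbertConfig.from_pretrained(".")
model = AlbertForMaskedLM.from_pretrained(".")

# the invariant this commit enforces
assert tokenizer.model_max_length == config.max_position_embeddings == 256
assert config.hidden_size == 32

# embeddings were resized to the trimmed sentencepiece vocab
assert model.get_input_embeddings().num_embeddings == config.vocab_size

# same smoke test as at the end of make-tiny-albert.py
inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
outputs = model(**inputs)
print(outputs.logits.shape)  # (batch, sequence_length, vocab_size)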
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a9a411609d5818e28a2070e85ffcb65a482bfbe0d434024622d54a318fa53fa8
-size 1396878
+oid sha256:e8c50a0be432afd0e75428e41f98be8937fc3f234d99844b1a46603b91e14c62
+size 730318
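The LFS pointer shows the checkpoint dropping from roughly 1.4 MB to roughly 730 KB after the shrink. To see what actually landed in the new weights file, a small inspection sketch (assumes a local clone and torch; the path is illustrative):

import torch

# count tensors and parameters in the committed checkpoint
state_dict = torch.load("pytorch_model.bin", map_location="cpu")
n_params = sum(t.numel() for t in state_dict.values())
print(f"{len(state_dict)} tensors, {n_params:,} parameters")

# print a few entries to confirm the shapes follow the new config
for name, tensor in list(state_dict.items())[:5]:
    print(name, tuple(tensor.shape))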
tokenizer_config.json CHANGED
@@ -1 +1 @@
-{"do_lower_case": true, "remove_space": true, "keep_accents": false, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "sp_model_kwargs": {}, "tokenizer_class": "AlbertTokenizer"}
+{"do_lower_case": true, "remove_space": true, "keep_accents": false, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "model_max_length": 256, "sp_model_kwargs": {}, "tokenizer_class": "AlbertTokenizer"}
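The only change here is the added "model_max_length": 256; since tokenizer_config.json is what from_pretrained reads back, a freshly loaded tokenizer now truncates to the same length the tiny position-embedding table supports. A short sketch of the effect, again assuming a local clone in the current directory:

from transformers import AlbertTokenizerFast

tokenizer = AlbertTokenizerFast.from_pretrained(".")
print(tokenizer.model_max_length)  # 256, picked up from tokenizer_config.json

# with truncation enabled, long inputs are capped at model_max_length
ids = tokenizer("paris " * 1000, truncation=True)["input_ids"]
print(len(ids))  # 256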