stas committed
Commit: d3cc9ec
Parent: 122a043

model corrections

config.json CHANGED
@@ -11,12 +11,12 @@
   "gap_size": 0,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
-  "hidden_size": 256,
+  "hidden_size": 32,
   "initializer_range": 0.02,
   "inner_group_num": 1,
   "intermediate_size": 128,
   "layer_norm_eps": 1e-12,
-  "max_position_embeddings": 64,
+  "max_position_embeddings": 256,
   "model_type": "albert",
   "net_structure_type": 0,
   "num_attention_heads": 2,
make-tiny-albert.py CHANGED
@@ -66,10 +66,11 @@ import os
 # workaround for fast tokenizer protobuf issue, and it's much faster too!
 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
 
-from transformers import AlbertTokenizerFast, AlbertConfig, AlbertForMaskedLM
+from transformers import AlbertTokenizer, AlbertTokenizerFast, AlbertConfig, AlbertForMaskedLM
 
 mname_orig = "albert-base-v1"
 mname_tiny = "tiny-albert"
+model_max_length = 256
 
 ### Tokenizer
 
@@ -82,7 +83,7 @@ if 1: # set to 0 to skip this after running once to speed things up during tune
     # HACK: need the sentencepiece source to get sentencepiece_model_pb2, as it doesn't get installed
     sys.path.append("../sentencepiece/python/src/sentencepiece")
     import sentencepiece_model_pb2 as model
-    tokenizer_orig = AlbertTokenizerFast.from_pretrained(mname_orig)
+    tokenizer_orig = AlbertTokenizer.from_pretrained(mname_orig)
     tokenizer_orig.save_pretrained(tmp_dir)
     with open(vocab_orig_path, 'rb') as f: data = f.read()
     # adapted from https://blog.ceshine.net/post/trim-down-sentencepiece-vocabulary/
@@ -94,7 +95,10 @@ if 1: # set to 0 to skip this after running once to speed things up during tune
     with open(vocab_short_path, 'wb') as f: f.write(m.SerializeToString())
     m = None
 
-tokenizer_fast_tiny = AlbertTokenizerFast(vocab_file=vocab_short_path)
+# albert breaks without having tokenizer.model_max_length match config.max_position_embeddings
+# these values are hardcoded in the source for official models, so we have to explicitly set those here
+tokenizer_fast_tiny = AlbertTokenizerFast(vocab_file=vocab_short_path,
+                                          model_max_length=model_max_length)
 
 
 ### Config
@@ -103,11 +107,11 @@ config_tiny = AlbertConfig.from_pretrained(mname_orig)
 print(config_tiny)
 # remember to update this to the actual config as each model is different and then shrink the numbers
 config_tiny.update(dict(
-    vocab_size=vocab_keep_items+12,
+    vocab_size=vocab_keep_items,
     embedding_size=64,
-    hidden_size=256,
+    hidden_size=32,
     intermediate_size=128,
-    max_position_embeddings=64,
+    max_position_embeddings=model_max_length,
     num_attention_heads=2,
     num_hidden_groups=1,
     num_hidden_layers=2,
@@ -122,6 +126,7 @@ model_tiny.resize_token_embeddings(len(tokenizer_fast_tiny))
 
 # Test
 inputs = tokenizer_fast_tiny("The capital of France is [MASK].", return_tensors="pt")
+#print(inputs)
 outputs = model_tiny(**inputs)
 print("Test with normal tokenizer:", len(outputs.logits[0]))
 
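Taken together with the config.json change above, the script edits encode one invariant: the tiny tokenizer's model_max_length must equal the model's max_position_embeddings (both 256 now), and the resized embedding matrix must line up with the trimmed vocabulary now that the +12 fudge on vocab_size is gone. The following is a minimal verification sketch, not part of the repo, assuming a local clone of this model in the current directory with transformers and torch installed; adjust the hard-coded numbers if the model is regenerated with different settings.

from transformers import AlbertConfig, AlbertForMaskedLM, AlbertTokenizerFast

# load the committed artifacts from a local clone of this repo ("." is illustrative)
tokenizer = AlbertTokenizerFast.from_pretrained(".")
config = AlbertConfig.from_pretrained(".")
model = AlbertForMaskedLM.from_pretrained(".")

# the invariant this commit enforces
assert tokenizer.model_max_length == config.max_position_embeddings == 256
assert config.hidden_size == 32

# embeddings were resized to the trimmed sentencepiece vocab
assert model.get_input_embeddings().num_embeddings == config.vocab_size

# same smoke test as at the end of make-tiny-albert.py
inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
outputs = model(**inputs)
print(outputs.logits.shape)  # (batch, sequence_length, vocab_size)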
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a9a411609d5818e28a2070e85ffcb65a482bfbe0d434024622d54a318fa53fa8
-size 1396878
+oid sha256:e8c50a0be432afd0e75428e41f98be8937fc3f234d99844b1a46603b91e14c62
+size 730318
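The LFS pointer shows the checkpoint dropping from roughly 1.4 MB to roughly 730 KB after the shrink. To see what actually landed in the new weights file, a small inspection sketch (assumes a local clone and torch; the path is illustrative):

import torch

# count tensors and parameters in the committed checkpoint
state_dict = torch.load("pytorch_model.bin", map_location="cpu")
n_params = sum(t.numel() for t in state_dict.values())
print(f"{len(state_dict)} tensors, {n_params:,} parameters")

# print a few entries to confirm the shapes follow the new config
for name, tensor in list(state_dict.items())[:5]:
    print(name, tuple(tensor.shape))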
tokenizer_config.json CHANGED
@@ -1 +1 @@
-{"do_lower_case": true, "remove_space": true, "keep_accents": false, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "sp_model_kwargs": {}, "tokenizer_class": "AlbertTokenizer"}
+{"do_lower_case": true, "remove_space": true, "keep_accents": false, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "<pad>", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "model_max_length": 256, "sp_model_kwargs": {}, "tokenizer_class": "AlbertTokenizer"}
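The only change here is the added "model_max_length": 256; since tokenizer_config.json is what from_pretrained reads back, a freshly loaded tokenizer now truncates to the same length the tiny position-embedding table supports. A short sketch of the effect, again assuming a local clone in the current directory:

from transformers import AlbertTokenizerFast

tokenizer = AlbertTokenizerFast.from_pretrained(".")
print(tokenizer.model_max_length)  # 256, picked up from tokenizer_config.json

# with truncation enabled, long inputs are capped at model_max_length
ids = tokenizer("paris " * 1000, truncation=True)["input_ids"]
print(len(ids))  # 256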