stas committed
Commit 679cd9b
1 Parent(s): 1165d92

smaller vocab
config.json CHANGED
@@ -23,5 +23,5 @@
   "torch_dtype": "float16",
   "transformers_version": "4.9.0.dev0",
   "type_vocab_size": 2,
-  "vocab_size": 30522
+  "vocab_size": 5120
 }
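
Note (not part of the commit): after the script below runs, the truncated tokenizer and the config must agree on the vocabulary size. A minimal sanity-check sketch, assuming the tiny model and tokenizer have been saved to the current directory as the script does:

```python
from transformers import AutoConfig, AutoTokenizer

# load the freshly saved tiny artifacts from the current directory
config = AutoConfig.from_pretrained(".")
tokenizer = AutoTokenizer.from_pretrained(".")

# config.json's vocab_size (5120 above) must match the truncated tokenizer,
# otherwise token ids can index past the embedding matrix
assert config.vocab_size == len(tokenizer), (config.vocab_size, len(tokenizer))
print("vocab_size OK:", config.vocab_size)
```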
make-tiny-electra.py CHANGED
@@ -63,15 +63,30 @@
 import sys
 import os
 
-from transformers import ElectraTokenizer, ElectraTokenizerFast, ElectraConfig, ElectraForMaskedLM
+from transformers import ElectraTokenizerFast, ElectraConfig, ElectraForMaskedLM
 
 mname_orig = "google/electra-small-generator"
 mname_tiny = "tiny-electra"
 
+
 ### Tokenizer
 
-tokenizer_fast_tiny = ElectraTokenizerFast.from_pretrained(mname_orig)
-tokenizer_tiny = ElectraTokenizer.from_pretrained(mname_orig)
+# Shrink the orig vocab to keep things small (just enough to tokenize any word, so letters+symbols)
+# ElectraTokenizerFast is fully defined by a tokenizer.json, which contains the vocab and the ids, so we just need to truncate it wisely
+import subprocess
+tokenizer_fast = ElectraTokenizerFast.from_pretrained(mname_orig)
+vocab_keep_items = 5120
+tmp_dir = f"/tmp/{mname_tiny}"
+tokenizer_fast.save_pretrained(tmp_dir)
+# resize tokenizer.json (vocab.txt will be automatically resized on save_pretrained)
+# perl -pi -e 's|(2999).*|$1}}}|' tokenizer.json # 0-indexed, so vocab_keep_items-1!
+closing_pat = "}}}"
+cmd = (f"perl -pi -e s|({vocab_keep_items-1}).*|$1{closing_pat}| {tmp_dir}/tokenizer.json").split()
+result = subprocess.run(cmd, capture_output=True, text=True)
+# reload with modified tokenizer
+tokenizer_fast_tiny = ElectraTokenizerFast.from_pretrained(tmp_dir)
+# it seems that ElectraTokenizer is not needed and ElectraTokenizerFast does the job
+
 
 ### Config
 
@@ -85,20 +100,17 @@ config_tiny.update(dict(
     max_position_embeddings=512,
     num_attention_heads=2,
     num_hidden_layers=2,
+    vocab_size=vocab_keep_items,
 ))
 print("New config", config_tiny)
 
+
 ### Model
 
 model_tiny = ElectraForMaskedLM(config_tiny)
-print(f"{mname_tiny}: num of params {model_tiny.num_parameters()}")
-model_tiny.resize_token_embeddings(len(tokenizer_tiny))
-
 
-inputs = tokenizer_tiny("The capital of France is [MASK].", return_tensors="pt")
-labels = tokenizer_tiny("The capital of France is Paris.", return_tensors="pt")["input_ids"]
-outputs = model_tiny(**inputs, labels=labels)
-print("Test with normal tokenizer:", len(outputs.logits[0]))
+print(f"{mname_tiny}: num of params {model_tiny.num_parameters()}")
+model_tiny.resize_token_embeddings(len(tokenizer_fast_tiny))
 
 inputs = tokenizer_fast_tiny("The capital of France is [MASK].", return_tensors="pt")
 labels = tokenizer_fast_tiny("The capital of France is Paris.", return_tensors="pt")["input_ids"]
@@ -108,9 +120,10 @@ print("Test with normal tokenizer:", len(outputs.logits[0]))
 # Save
 model_tiny.half() # makes it smaller
 model_tiny.save_pretrained(".")
-tokenizer_tiny.save_pretrained(".")
 tokenizer_fast_tiny.save_pretrained(".")
 
+#print(model_tiny)
+
 readme = "README.md"
 if not os.path.exists(readme):
     with open(readme, "w") as f:
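
The committed script truncates tokenizer.json textually with a perl one-liner. A hedged alternative sketch in pure Python, assuming the WordPiece vocab sits under `model.vocab` in tokenizer.json (the layout current `tokenizers` files use for this tokenizer type, not guaranteed for others):

```python
import json

vocab_keep_items = 5120
tokenizer_json = "/tmp/tiny-electra/tokenizer.json"  # the tmp_dir used by the script

with open(tokenizer_json, encoding="utf-8") as f:
    tok = json.load(f)

# keep only ids 0..vocab_keep_items-1; WordPiece stores the vocab as a token -> id mapping
tok["model"]["vocab"] = {
    token: idx for token, idx in tok["model"]["vocab"].items() if idx < vocab_keep_items
}

with open(tokenizer_json, "w", encoding="utf-8") as f:
    json.dump(tok, f, ensure_ascii=False)
```

Unlike the regex, this does not rely on the vocab being the last section of the file.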
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:16490204d439317f20cd9f9376475bec315b0085d79908523dabcf03e512654f
-size 4163300
+oid sha256:329472d5ca2d08a2af5798faadab3bebe29fdba4c26ba737240f7bb711f48080
+size 861028
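
The checkpoint shrinks from ~4.2 MB to ~861 KB because the embedding matrix dominates a model this small. As a rough cross-check (a sketch only, ignoring pickle metadata and non-parameter buffers), the fp16 file size should be close to two bytes per parameter:

```python
from transformers import ElectraForMaskedLM

# load the tiny checkpoint saved by make-tiny-electra.py from the current directory
model = ElectraForMaskedLM.from_pretrained(".")
approx_size = model.num_parameters() * 2  # float16 = 2 bytes per parameter
print(f"params: {model.num_parameters()}, approx fp16 size: {approx_size} bytes")
```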
tokenizer.json CHANGED
The diff for this file is too large to render.
tokenizer_config.json CHANGED
@@ -1 +1 @@
-{"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "google/electra-small-generator", "tokenizer_class": "ElectraTokenizer"}
+{"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "/tmp/tiny-electra", "tokenizer_class": "ElectraTokenizer"}
vocab.txt CHANGED
The diff for this file is too large to render.
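
vocab.txt is regenerated by save_pretrained from the truncated tokenizer, so its line count should now match the new vocab_size. A quick check, assuming the slow-tokenizer convention of one token per line:

```python
# count the tokens in the regenerated vocab.txt
with open("vocab.txt", encoding="utf-8") as f:
    num_tokens = sum(1 for _ in f)
print(num_tokens)  # expected: 5120, matching vocab_size in config.json
```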