stas committed
Commit 915669c
Parent: 1e39576
Files changed (1): make-tiny-xlm-roberta.py +14 -17
make-tiny-xlm-roberta.py CHANGED
@@ -29,15 +29,15 @@
 # 3. clone
 # git clone https://huggingface.co/hf-internal-testing/tiny-xlm-roberta
 # cd tiny-xlm-roberta
-
+#
 # 4. start with some pre-existing script from one of the https://huggingface.co/hf-internal-testing/ tiny model repos, e.g.
-# wget https://huggingface.co/hf-internal-testing/tiny-xlm-roberta/raw/main/make-tiny-xlm-roberta.py
-# chmod a+x ./make-tiny-xlm-roberta.py
-# mv ./make-tiny-xlm-roberta.py ./make-tiny-xlm-roberta.py
+# wget https://huggingface.co/hf-internal-testing/tiny-albert/raw/main/make-tiny-albert.py
+# chmod a+x ./make-tiny-albert.py
+# mv ./make-tiny-albert.py ./make-tiny-xlm-roberta.py
 #
 # 5. automatically rename things from the old names to new ones
-# perl -pi -e 's|MT5|XLMRoberta|g' make-tiny-xlm-roberta.py
-# perl -pi -e 's|mt5|xlm-roberta|g' make-tiny-xlm-roberta.py
+# perl -pi -e 's|Albert|XLMRoberta|g' make-tiny-xlm-roberta.py
+# perl -pi -e 's|albert|xlm-roberta|g' make-tiny-xlm-roberta.py
 #
 # 6. edit and re-run this script while fixing it up
 # ./make-tiny-xlm-roberta.py
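For reference, the step 5 perl substitutions are plain global renames, so a line of the copied Albert script such as the following (a hypothetical example, not from this commit) is rewritten in place:

    # before the rename:
    tokenizer_orig = AlbertTokenizerFast.from_pretrained(mname_orig)
    # after the rename:
    tokenizer_orig = XLMRobertaTokenizerFast.from_pretrained(mname_orig)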
@@ -63,7 +63,10 @@
 import sys
 import os
 
-from transformers import XLMRobertaTokenizer, XLMRobertaTokenizerFast, XLMRobertaConfig, XLMRobertaForCausalLM
+# workaround for fast tokenizer protobuf issue, and it's much faster too!
+os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+
+from transformers import XLMRobertaTokenizerFast, XLMRobertaConfig, XLMRobertaForCausalLM
 
 mname_orig = "xlm-roberta-base"
 mname_tiny = "tiny-xlm-roberta"
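Note where the workaround lands in the new version: PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION is read by protobuf when it is first imported, so it has to be set before transformers (which imports protobuf) is pulled in; that is presumably why this commit moves it above the import instead of leaving it at its old spot further down the script. A minimal sketch of the required ordering:

    import os
    # must be set before anything imports google.protobuf, or it is ignored
    os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
    from transformers import XLMRobertaTokenizerFast  # safe: env var already set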
@@ -75,13 +78,11 @@ vocab_keep_items = 5000
 tmp_dir = f"/tmp/{mname_tiny}"
 vocab_orig_path = f"{tmp_dir}/sentencepiece.bpe.model"
 vocab_short_path = f"{tmp_dir}/spiece-short.model"
-# workaround for fast tokenizer protobuf issue, and it's much faster too!
-os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
 if 1: # set to 0 to skip this after running once to speed things up during tune up
     # HACK: need the sentencepiece source to get sentencepiece_model_pb2, as it doesn't get installed
     sys.path.append("../sentencepiece/python/src/sentencepiece")
     import sentencepiece_model_pb2 as model
-    tokenizer_orig = XLMRobertaTokenizer.from_pretrained(mname_orig)
+    tokenizer_orig = XLMRobertaTokenizerFast.from_pretrained(mname_orig)
     tokenizer_orig.save_pretrained(tmp_dir)
     with open(vocab_orig_path, 'rb') as f: data = f.read()
     # adapted from https://blog.ceshine.net/post/trim-down-sentencepiece-vocabulary/
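The trimming code itself falls outside the diff context. A minimal sketch of the technique from the linked blog post, assuming the sentencepiece_model_pb2 proto API (data, vocab_keep_items and vocab_short_path come from the surrounding script; the rest is an illustration, not this commit's code):

    m = model.ModelProto()
    m.ParseFromString(data)                  # parse sentencepiece.bpe.model
    print(f"original vocab size: {len(m.pieces)}")
    while len(m.pieces) > vocab_keep_items:  # keep the first N pieces,
        m.pieces.pop()                       # dropping the rare tail
    print(f"trimmed vocab size: {len(m.pieces)}")
    with open(vocab_short_path, 'wb') as f:
        f.write(m.SerializeToString())       # write spiece-short.model

The m = None visible as context in the next hunk then releases the parsed proto.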
@@ -94,11 +95,11 @@ if 1: # set to 0 to skip this after running once to speed things up during tune
     m = None
 
 tokenizer_fast_tiny = XLMRobertaTokenizerFast(vocab_file=vocab_short_path)
-tokenizer_tiny = XLMRobertaTokenizer(vocab_file=vocab_short_path)
 
 ### Config
 
 config_tiny = XLMRobertaConfig.from_pretrained(mname_orig)
+print(config_tiny)
 # remember to update this to the actual config as each model is different and then shrink the numbers
 config_tiny.update(dict(
     vocab_size=vocab_keep_items+12,
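The config update is cut off mid-call by the diff context; only vocab_size=vocab_keep_items+12 is visible (the +12 presumably leaves room for special tokens, though that is an inference). Hypothetical values for the remaining keys, purely to illustrate the shrinking; the real numbers are in the full script:

    config_tiny.update(dict(
        vocab_size=vocab_keep_items+12,  # from the diff
        # illustrative tiny values, not from this commit:
        hidden_size=32,
        intermediate_size=37,
        num_attention_heads=2,
        num_hidden_layers=5,
        max_position_embeddings=512,
    ))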
@@ -121,12 +122,9 @@ print("New config", config_tiny)
 
 model_tiny = XLMRobertaForCausalLM(config_tiny)
 print(f"{mname_tiny}: num of params {model_tiny.num_parameters()}")
-model_tiny.resize_token_embeddings(len(tokenizer_tiny))
-
-inputs = tokenizer_tiny("hello", return_tensors="pt")
-outputs = model_tiny(**inputs)
-print("Test with normal tokenizer:", len(outputs.logits[0]))
+model_tiny.resize_token_embeddings(len(tokenizer_fast_tiny))
 
+# Test
 inputs = tokenizer_fast_tiny("hello", return_tensors="pt")
 outputs = model_tiny(**inputs)
 print("Test with fast tokenizer:", len(outputs.logits[0]))
@@ -134,7 +132,6 @@ print("Test with fast tokenizer:", len(outputs.logits[0]))
 # Save
 model_tiny.half() # makes it smaller
 model_tiny.save_pretrained(".")
-tokenizer_tiny.save_pretrained(".")
 tokenizer_fast_tiny.save_pretrained(".")
 
 readme = "README.md"
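Once pushed, the tiny checkpoint is meant for fast tests; a hypothetical consumer, with the repo name taken from the clone step above:

    from transformers import AutoTokenizer, AutoModelForCausalLM

    mname = "hf-internal-testing/tiny-xlm-roberta"
    tokenizer = AutoTokenizer.from_pretrained(mname)
    model = AutoModelForCausalLM.from_pretrained(mname)
    outputs = model(**tokenizer("hello", return_tensors="pt"))
    print(outputs.logits.shape)  # tiny dimensions, quick to download and load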