stas committed on
Commit
1e39576
1 Parent(s): d361e26
Files changed (2) hide show
  1. make-tiny-xlm-roberta.py +2 -6
  2. pytorch_model.bin +1 -1
make-tiny-xlm-roberta.py CHANGED
@@ -60,17 +60,11 @@
60
  # git commit -m "new tiny model"
61
  # git push
62
 
63
- from pathlib import Path
64
- import json
65
- import tempfile
66
  import sys
67
  import os
68
 
69
  from transformers import XLMRobertaTokenizer, XLMRobertaTokenizerFast, XLMRobertaConfig, XLMRobertaForCausalLM
70
 
71
- # workaround for fast tokenizer protobuffer issue, and it's much faster too!
72
- os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
73
-
74
  mname_orig = "xlm-roberta-base"
75
  mname_tiny = "tiny-xlm-roberta"
76
 
@@ -81,6 +75,8 @@ vocab_keep_items = 5000
81
  tmp_dir = f"/tmp/{mname_tiny}"
82
  vocab_orig_path = f"{tmp_dir}/sentencepiece.bpe.model"
83
  vocab_short_path = f"{tmp_dir}/spiece-short.model"
 
 
84
  if 1: # set to 0 to skip this after running once to speed things up during tune up
85
  # HACK: need the sentencepiece source to get sentencepiece_model_pb2, as it doesn't get installed
86
  sys.path.append("../sentencepiece/python/src/sentencepiece")
 
60
  # git commit -m "new tiny model"
61
  # git push
62
 
 
 
 
63
  import sys
64
  import os
65
 
66
  from transformers import XLMRobertaTokenizer, XLMRobertaTokenizerFast, XLMRobertaConfig, XLMRobertaForCausalLM
67
 
 
 
 
68
  mname_orig = "xlm-roberta-base"
69
  mname_tiny = "tiny-xlm-roberta"
70
 
 
75
  tmp_dir = f"/tmp/{mname_tiny}"
76
  vocab_orig_path = f"{tmp_dir}/sentencepiece.bpe.model"
77
  vocab_short_path = f"{tmp_dir}/spiece-short.model"
78
+ # workaround for fast tokenizer protobuf issue, and it's much faster too!
79
+ os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
80
  if 1: # set to 0 to skip this after running once to speed things up during tune up
81
  # HACK: need the sentencepiece source to get sentencepiece_model_pb2, as it doesn't get installed
82
  sys.path.append("../sentencepiece/python/src/sentencepiece")
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b9c6cf79904c41a0ee0dd56366d40b9d2762235495edc6d04693ca3f41c50052
3
  size 4334436
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb0bccafb4bee811f2138956ea9e94596e1bfdfc868b5364d7b678fac4b2d559
3
  size 4334436