stas committed on
Commit
d361e26
1 Parent(s): c652096
Files changed (1) hide show
  1. make-tiny-xlm-roberta.py +5 -4
make-tiny-xlm-roberta.py CHANGED
@@ -33,13 +33,14 @@
33
  # 4. start with some pre-existing script from one of the https://huggingface.co/hf-internal-testing/ tiny model repos, e.g.
34
  # wget https://huggingface.co/hf-internal-testing/tiny-xlm-roberta/raw/main/make-tiny-xlm-roberta.py
35
  # chmod a+x ./make-tiny-xlm-roberta.py
 
36
  #
37
  # 5. automatically rename things from the old names to new ones
38
  # perl -pi -e 's|MT5|XLMRoberta|g' make-tiny-xlm-roberta.py
39
  # perl -pi -e 's|mt5|xlm-roberta|g' make-tiny-xlm-roberta.py
40
  #
41
  # 6. edit and re-run this script while fixing it up
42
- # ./make-tiny-xlm-roberta.py .
43
  #
44
  # 7. add/commit/push
45
  # git add *
@@ -53,7 +54,7 @@
53
  # cd tiny-xlm-roberta
54
  #
55
  # 2. edit and re-run this script after doing whatever changes are needed
56
- # ./make-tiny-xlm-roberta.py .
57
  #
58
  # 3. commit/push
59
  # git commit -m "new tiny model"
@@ -72,12 +73,12 @@ os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
72
 
73
  mname_orig = "xlm-roberta-base"
74
  mname_tiny = "tiny-xlm-roberta"
75
- tmp_dir = f"/tmp/{mname_tiny}"
76
 
77
  ### Tokenizer
78
 
79
  # Shrink the orig vocab to keep things small
80
  vocab_keep_items = 5000
 
81
  vocab_orig_path = f"{tmp_dir}/sentencepiece.bpe.model"
82
  vocab_short_path = f"{tmp_dir}/spiece-short.model"
83
  if 1: # set to 0 to skip this after running once to speed things up during tune up
@@ -143,6 +144,6 @@ tokenizer_fast_tiny.save_pretrained(".")
143
  readme = "README.md"
144
  if not os.path.exists(readme):
145
  with open(readme, "w") as f:
146
- f.write(f"This is a tiny random {mname_tiny} model to be used for basic testing.")
147
 
148
  print(f"Generated {mname_tiny}")
33
  # 4. start with some pre-existing script from one of the https://huggingface.co/hf-internal-testing/ tiny model repos, e.g.
34
  # wget https://huggingface.co/hf-internal-testing/tiny-xlm-roberta/raw/main/make-tiny-xlm-roberta.py
35
  # chmod a+x ./make-tiny-xlm-roberta.py
36
+ # mv ./make-tiny-xlm-roberta.py ./make-tiny-xlm-roberta.py
37
  #
38
  # 5. automatically rename things from the old names to new ones
39
  # perl -pi -e 's|MT5|XLMRoberta|g' make-tiny-xlm-roberta.py
40
  # perl -pi -e 's|mt5|xlm-roberta|g' make-tiny-xlm-roberta.py
41
  #
42
  # 6. edit and re-run this script while fixing it up
43
+ # ./make-tiny-xlm-roberta.py
44
  #
45
  # 7. add/commit/push
46
  # git add *
54
  # cd tiny-xlm-roberta
55
  #
56
  # 2. edit and re-run this script after doing whatever changes are needed
57
+ # ./make-tiny-xlm-roberta.py
58
  #
59
  # 3. commit/push
60
  # git commit -m "new tiny model"
73
 
74
  mname_orig = "xlm-roberta-base"
75
  mname_tiny = "tiny-xlm-roberta"
 
76
 
77
  ### Tokenizer
78
 
79
  # Shrink the orig vocab to keep things small
80
  vocab_keep_items = 5000
81
+ tmp_dir = f"/tmp/{mname_tiny}"
82
  vocab_orig_path = f"{tmp_dir}/sentencepiece.bpe.model"
83
  vocab_short_path = f"{tmp_dir}/spiece-short.model"
84
  if 1: # set to 0 to skip this after running once to speed things up during tune up
144
  readme = "README.md"
145
  if not os.path.exists(readme):
146
  with open(readme, "w") as f:
147
+ f.write(f"This is a {mname_tiny} random model to be used for basic testing.\n")
148
 
149
  print(f"Generated {mname_tiny}")