Narsil (HF staff) committed
Commit 559b155
1 parent: 66af19f

Update README.md

Files changed (1):
  1. README.md +17 -17

README.md
@@ -1,18 +1,18 @@
  ```python
- def DummyTok(model_max_length=4):
-     import tempfile
-
-     from tokenizers import Tokenizer, models
-     from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
-
-     vocab = [(chr(i), i) for i in range(256)]
-     tokenizer = Tokenizer(models.Unigram(vocab))
-     with tempfile.NamedTemporaryFile() as f:
-         tokenizer.save(f.name)
-         real_tokenizer = PreTrainedTokenizerFast(
-             tokenizer_file=f.name, model_max_length=model_max_length
-         )
-
-     real_tokenizer.save("dummy_tokenizer.json")
-     return real_tokenizer
- ```
+ import tempfile
+
+ from tokenizers import Tokenizer, models
+ from transformers import PreTrainedTokenizerFast
+
+ model_max_length = 4
+ vocab = [(chr(i), i) for i in range(256)]
+ tokenizer = Tokenizer(models.Unigram(vocab))
+ with tempfile.NamedTemporaryFile() as f:
+     tokenizer.save(f.name)
+     real_tokenizer = PreTrainedTokenizerFast(tokenizer_file=f.name, model_max_length=model_max_length)
+
+ real_tokenizer._tokenizer.save("dummy/tokenizer.json")
+
+ ```
+
+ config uses Albert which works with a minimal `config.json`
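
The `config.json` the last line refers to is not part of this diff. As a rough sketch of how such a minimal Albert config could be generated with `transformers`, assuming default `AlbertConfig` values are sufficient and that `vocab_size` should match the 256-symbol Unigram vocab built in the snippet (the `dummy` output directory simply mirrors the tokenizer path above and is an assumption, not part of the commit):

```python
# Hedged sketch, not part of the commit: write a minimal Albert config.json.
# Assumes AlbertConfig defaults are enough for a dummy checkpoint and that
# vocab_size should match the 256-entry Unigram vocab used for the tokenizer.
from transformers import AlbertConfig

config = AlbertConfig(vocab_size=256)
config.save_pretrained("dummy")  # writes dummy/config.json next to dummy/tokenizer.json
```

Any field left out of such a file falls back to the `AlbertConfig` defaults, which is why a near-empty `config.json` can still be loaded.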