jonatanklosko committed on
Commit
e94f2b1
1 Parent(s): c919139

Upload tokenizer.json

Browse files

The persisted `tokenizer.json` does not include the `TemplateProcessing` post-processor that adds special tokens. `transformers` overrides the post-processor on load, but when `tokenizer.json` is loaded directly with the Rust `tokenizers` library, it is helpful to have the processor already present in the file (as has been the case for other models so far). This commit re-saves the tokenizer so that the file matches exactly what `transformers` loads.

---

Generated with:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-1.3b-base")
assert tokenizer.is_fast
tokenizer.save_pretrained("...")
```

Files changed (1) hide show
  1. tokenizer.json +52 -4
tokenizer.json CHANGED
@@ -254,10 +254,58 @@
254
  ]
255
  },
256
  "post_processor": {
257
- "type": "ByteLevel",
258
- "add_prefix_space": true,
259
- "trim_offsets": false,
260
- "use_regex": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  },
262
  "decoder": {
263
  "type": "ByteLevel",
 
254
  ]
255
  },
256
  "post_processor": {
257
+ "type": "TemplateProcessing",
258
+ "single": [
259
+ {
260
+ "SpecialToken": {
261
+ "id": "<|begin▁of▁sentence|>",
262
+ "type_id": 0
263
+ }
264
+ },
265
+ {
266
+ "Sequence": {
267
+ "id": "A",
268
+ "type_id": 0
269
+ }
270
+ }
271
+ ],
272
+ "pair": [
273
+ {
274
+ "SpecialToken": {
275
+ "id": "<|begin▁of▁sentence|>",
276
+ "type_id": 0
277
+ }
278
+ },
279
+ {
280
+ "Sequence": {
281
+ "id": "A",
282
+ "type_id": 0
283
+ }
284
+ },
285
+ {
286
+ "SpecialToken": {
287
+ "id": "<|begin▁of▁sentence|>",
288
+ "type_id": 1
289
+ }
290
+ },
291
+ {
292
+ "Sequence": {
293
+ "id": "B",
294
+ "type_id": 1
295
+ }
296
+ }
297
+ ],
298
+ "special_tokens": {
299
+ "<|begin▁of▁sentence|>": {
300
+ "id": "<|begin▁of▁sentence|>",
301
+ "ids": [
302
+ 32013
303
+ ],
304
+ "tokens": [
305
+ "<|begin▁of▁sentence|>"
306
+ ]
307
+ }
308
+ }
309
  },
310
  "decoder": {
311
  "type": "ByteLevel",