livinNector committed on
Commit
347dec7
1 Parent(s): 13bb0e3

Upload tokenizer

Browse files
Files changed (2) hide show
  1. tokenizer.json +8 -2
  2. tokenizer_config.json +1 -1
tokenizer.json CHANGED
@@ -265,7 +265,9 @@
265
  "special": true
266
  }
267
  ],
268
- "normalizer": null,
 
 
269
  "pre_tokenizer": {
270
  "type": "Whitespace"
271
  },
@@ -344,7 +346,11 @@
344
  }
345
  }
346
  },
347
- "decoder": null,
 
 
 
 
348
  "model": {
349
  "type": "WordPiece",
350
  "unk_token": "[UNK]",
 
265
  "special": true
266
  }
267
  ],
268
+ "normalizer": {
269
+ "type": "NFKD"
270
+ },
271
  "pre_tokenizer": {
272
  "type": "Whitespace"
273
  },
 
346
  }
347
  }
348
  },
349
+ "decoder": {
350
+ "type": "WordPiece",
351
+ "prefix": "##",
352
+ "cleanup": true
353
+ },
354
  "model": {
355
  "type": "WordPiece",
356
  "unk_token": "[UNK]",
tokenizer_config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
  "clean_up_tokenization_spaces": true,
3
- "model_max_length": 512,
4
  "tokenizer_class": "PreTrainedTokenizerFast"
5
  }
 
1
  {
2
  "clean_up_tokenization_spaces": true,
3
+ "model_max_length": 1000000000000000019884624838656,
4
  "tokenizer_class": "PreTrainedTokenizerFast"
5
  }