Matttttttt committed on
Commit
9f6b9b2
1 Parent(s): c94e6d5

update tokenizer and settings to fix a minor bug

Browse files
sentencepiece.bpe.model CHANGED
Binary files a/sentencepiece.bpe.model and b/sentencepiece.bpe.model differ
 
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,5 +1,6 @@
1
  {
2
  "bos_token": "<s>",
 
3
  "cls_token": "<s>",
4
  "eos_token": "</s>",
5
  "mask_token": {
@@ -10,6 +11,7 @@
10
  "rstrip": false,
11
  "single_word": false
12
  },
 
13
  "pad_token": "<pad>",
14
  "sep_token": "</s>",
15
  "sp_model_kwargs": {},
 
1
  {
2
  "bos_token": "<s>",
3
+ "clean_up_tokenization_spaces": true,
4
  "cls_token": "<s>",
5
  "eos_token": "</s>",
6
  "mask_token": {
 
11
  "rstrip": false,
12
  "single_word": false
13
  },
14
+ "model_max_length": 1000000000000000019884624838656,
15
  "pad_token": "<pad>",
16
  "sep_token": "</s>",
17
  "sp_model_kwargs": {},