Update tokenizer_config.json

This uses the tokenizer config from [NeuralBeagle](https://huggingface.co/mlabonne/NeuralBeagle14-7B), with `model_max_length` updated to Mistral's 32,768-token context length.

Files changed (1) hide show

tokenizer_config.json +25 -6

tokenizer_config.json CHANGED Viewed

@@ -11,7 +11,7 @@
       "special": true
     },
     "1": {
-      "content": "<s>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -19,7 +19,23 @@
       "special": true
     },
     "2": {
-      "content": "</s>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -28,15 +44,18 @@
     }
   },
   "additional_special_tokens": [],
-  "bos_token": "<s>",
   "clean_up_tokenization_spaces": false,
-  "eos_token": "</s>",
   "legacy": true,
-  "model_max_length": 1000000000000000019884624838656,
   "pad_token": null,
   "sp_model_kwargs": {},
   "spaces_between_special_tokens": false,
   "tokenizer_class": "LlamaTokenizer",
   "unk_token": "<unk>",
-  "use_default_system_prompt": false
 }

       "special": true
     },
     "1": {
+      "content": "<|im_start|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "special": true
     },
     "2": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32000": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|im_start|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
     }
   },
   "additional_special_tokens": [],
+  "bos_token": "<|im_start|>",
+  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
   "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
   "legacy": true,
+  "model_max_length": 32768,
   "pad_token": null,
   "sp_model_kwargs": {},
   "spaces_between_special_tokens": false,
   "tokenizer_class": "LlamaTokenizer",
+  "trust_remote_code": false,
   "unk_token": "<unk>",
+  "use_default_system_prompt": true,
+  "use_fast": true
 }