Upload tokenizer
Browse files
- special_tokens_map.json +7 -1
- tokenizer.json +2 -2
- tokenizer_config.json +3 -66
special_tokens_map.json
CHANGED
|
@@ -17,7 +17,13 @@
|
|
| 17 |
"rstrip": false,
|
| 18 |
"single_word": false
|
| 19 |
},
|
| 20 |
-
"pad_token":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
"unk_token": {
|
| 22 |
"content": "<unk>",
|
| 23 |
"lstrip": false,
|
|
|
|
| 17 |
"rstrip": false,
|
| 18 |
"single_word": false
|
| 19 |
},
|
| 20 |
+
"pad_token": {
|
| 21 |
+
"content": "<pad>",
|
| 22 |
+
"lstrip": false,
|
| 23 |
+
"normalized": false,
|
| 24 |
+
"rstrip": false,
|
| 25 |
+
"single_word": false
|
| 26 |
+
},
|
| 27 |
"unk_token": {
|
| 28 |
"content": "<unk>",
|
| 29 |
"lstrip": false,
|
tokenizer.json
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5f7eee611703c5ce5d1eee32d9cdcfe465647b8aff0c1dfb3bed7ad7dbb05060
|
| 3 |
+
size 34362873
|
tokenizer_config.json
CHANGED
|
@@ -1993,70 +1993,6 @@
|
|
| 1993 |
"rstrip": false,
|
| 1994 |
"single_word": false,
|
| 1995 |
"special": false
|
| 1996 |
-
},
|
| 1997 |
-
"256000": {
|
| 1998 |
-
"content": "<thinking>",
|
| 1999 |
-
"lstrip": false,
|
| 2000 |
-
"normalized": true,
|
| 2001 |
-
"rstrip": false,
|
| 2002 |
-
"single_word": false,
|
| 2003 |
-
"special": false
|
| 2004 |
-
},
|
| 2005 |
-
"256001": {
|
| 2006 |
-
"content": "<reflection>",
|
| 2007 |
-
"lstrip": false,
|
| 2008 |
-
"normalized": true,
|
| 2009 |
-
"rstrip": false,
|
| 2010 |
-
"single_word": false,
|
| 2011 |
-
"special": false
|
| 2012 |
-
},
|
| 2013 |
-
"256002": {
|
| 2014 |
-
"content": "<cite>",
|
| 2015 |
-
"lstrip": false,
|
| 2016 |
-
"normalized": true,
|
| 2017 |
-
"rstrip": false,
|
| 2018 |
-
"single_word": false,
|
| 2019 |
-
"special": false
|
| 2020 |
-
},
|
| 2021 |
-
"256003": {
|
| 2022 |
-
"content": "<output>",
|
| 2023 |
-
"lstrip": false,
|
| 2024 |
-
"normalized": true,
|
| 2025 |
-
"rstrip": false,
|
| 2026 |
-
"single_word": false,
|
| 2027 |
-
"special": false
|
| 2028 |
-
},
|
| 2029 |
-
"256004": {
|
| 2030 |
-
"content": "</thinking>",
|
| 2031 |
-
"lstrip": false,
|
| 2032 |
-
"normalized": true,
|
| 2033 |
-
"rstrip": false,
|
| 2034 |
-
"single_word": false,
|
| 2035 |
-
"special": false
|
| 2036 |
-
},
|
| 2037 |
-
"256005": {
|
| 2038 |
-
"content": "</reflection>",
|
| 2039 |
-
"lstrip": false,
|
| 2040 |
-
"normalized": true,
|
| 2041 |
-
"rstrip": false,
|
| 2042 |
-
"single_word": false,
|
| 2043 |
-
"special": false
|
| 2044 |
-
},
|
| 2045 |
-
"256006": {
|
| 2046 |
-
"content": "</cite>",
|
| 2047 |
-
"lstrip": false,
|
| 2048 |
-
"normalized": true,
|
| 2049 |
-
"rstrip": false,
|
| 2050 |
-
"single_word": false,
|
| 2051 |
-
"special": false
|
| 2052 |
-
},
|
| 2053 |
-
"256007": {
|
| 2054 |
-
"content": "</output>",
|
| 2055 |
-
"lstrip": false,
|
| 2056 |
-
"normalized": true,
|
| 2057 |
-
"rstrip": false,
|
| 2058 |
-
"single_word": false,
|
| 2059 |
-
"special": false
|
| 2060 |
}
|
| 2061 |
},
|
| 2062 |
"additional_special_tokens": [
|
|
@@ -2067,8 +2003,9 @@
|
|
| 2067 |
"chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% if message['role'] == 'assistant' %}{{ eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<start_of_turn>model\n' }}{% endif %}",
|
| 2068 |
"clean_up_tokenization_spaces": false,
|
| 2069 |
"eos_token": "<eos>",
|
| 2070 |
-
"model_max_length":
|
| 2071 |
-
"pad_token": "<
|
|
|
|
| 2072 |
"sp_model_kwargs": {},
|
| 2073 |
"spaces_between_special_tokens": false,
|
| 2074 |
"tokenizer_class": "GemmaTokenizer",
|
|
|
|
| 1993 |
"rstrip": false,
|
| 1994 |
"single_word": false,
|
| 1995 |
"special": false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1996 |
}
|
| 1997 |
},
|
| 1998 |
"additional_special_tokens": [
|
|
|
|
| 2003 |
"chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% if message['role'] == 'assistant' %}{{ eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<start_of_turn>model\n' }}{% endif %}",
|
| 2004 |
"clean_up_tokenization_spaces": false,
|
| 2005 |
"eos_token": "<eos>",
|
| 2006 |
+
"model_max_length": 8192,
|
| 2007 |
+
"pad_token": "<pad>",
|
| 2008 |
+
"padding_side": "left",
|
| 2009 |
"sp_model_kwargs": {},
|
| 2010 |
"spaces_between_special_tokens": false,
|
| 2011 |
"tokenizer_class": "GemmaTokenizer",
|