Qubitium committed
Commit
3a0d911
1 Parent(s): 4cd58d3

add pad and extra_N special tokens

Files changed (6):
  1. .gitattributes +1 -0
  2. README.md +16 -4
  3. special_tokens_map.json +1 -4
  4. tokenizer.json +0 -0
  5. tokenizer_config.json +111 -12
  6. vocab.json +16 -3
.gitattributes ADDED
@@ -0,0 +1 @@
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -5,17 +5,22 @@ tags:
  - tokenizers
  ---

- ## Why should you use this and not the titotken included in the orignal model?
- Original tokenizer pad vocabulary to correct size with `<extra_N>` tokens but encoder never uses them causing inconsistency and deterimental to training code that may want to use the unused `<extra_N>` tokens.
+ ## Why should you use this and not the tiktoken included in the original model?
+ 1. The original tokenizer pads the vocabulary to the correct size with `<extra_N>` tokens, but the encoder never uses them.
+ 2. `len(tokenizer) != tokenizer.vocab_size` as a result of 1.
+ 3. The original tokenizer uses eos as the pad token, which may lead trainers to mask out eos so the model never learns to emit it.
+

  modified from original code @ https://huggingface.co/Xenova/dbrx-instruct-tokenizer

- ```
+ ```json
  Changes:
  1. Remove non-base model tokens
  2. Keep/Add `<|pad|>` special token to make sure padding can be differentiated from eos/bos.
+ 3. Expose 15 unused/reserved `<|extra_N|>` tokens for use

- "100277": {
+ # pad token
+ "100256": {
  "content": "<|pad|>",
  "lstrip": false,
  "normalized": false,
@@ -23,6 +28,13 @@ Changes:
  "single_word": false,
  "special": true
  },
+
+ # 15 unused/reserved extra tokens
+ "<|extra_0|>": 100261
+ "<|extra_1|>": 100262
+ ...
+ "<|extra_14|>": 100275
+
  ```

  # DBRX Instruct Tokenizer
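
A quick sanity check of these README claims, as a rough sketch: it assumes `transformers` is installed and that this tokenizer repo is cloned locally (the `./dbrx-instruct-tokenizer` path is a placeholder, not part of the commit).

```python
from transformers import AutoTokenizer

# Placeholder path: point this at a local clone of this tokenizer repo.
tok = AutoTokenizer.from_pretrained("./dbrx-instruct-tokenizer")

# 1. Padding now has its own token/id, so it can be told apart from eos/bos.
assert tok.pad_token == "<|pad|>"
assert tok.pad_token_id != tok.eos_token_id

# 2. The reserved extra tokens are exposed and map to the ids listed above.
assert tok.convert_tokens_to_ids("<|extra_0|>") == 100261
assert tok.convert_tokens_to_ids("<|extra_14|>") == 100275

# 3. With every id reachable through the encoder, the two sizes should agree.
print(len(tok), tok.vocab_size)
```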
special_tokens_map.json CHANGED
@@ -1,8 +1,5 @@
  {
- "additional_special_tokens": [
- "<|im_start|>",
- "<|im_end|>"
- ],
+ "additional_special_tokens": [],
  "bos_token": {
  "content": "<|endoftext|>",
  "lstrip": false,
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -11,7 +11,7 @@
  "single_word": false,
  "special": true
  },
- "100277": {
+ "100256": {
  "content": "<|pad|>",
  "lstrip": false,
  "normalized": false,
@@ -19,16 +19,120 @@
  "single_word": false,
  "special": true
  },
- "100278": {
- "content": "<|im_start|>",
+ "100261": {
+ "content": "<|extra_0|>",
  "lstrip": false,
  "normalized": false,
  "rstrip": false,
  "single_word": false,
  "special": true
  },
- "100279": {
- "content": "<|im_end|>",
+ "100262": {
+ "content": "<|extra_1|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100263": {
+ "content": "<|extra_2|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100264": {
+ "content": "<|extra_3|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100265": {
+ "content": "<|extra_4|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100266": {
+ "content": "<|extra_5|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100267": {
+ "content": "<|extra_6|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100268": {
+ "content": "<|extra_7|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100269": {
+ "content": "<|extra_8|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100270": {
+ "content": "<|extra_9|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100271": {
+ "content": "<|extra_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100272": {
+ "content": "<|extra_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100273": {
+ "content": "<|extra_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100274": {
+ "content": "<|extra_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "100275": {
+ "content": "<|extra_14|>",
  "lstrip": false,
  "normalized": false,
  "rstrip": false,
@@ -36,10 +140,7 @@
  "special": true
  }
  },
- "additional_special_tokens": [
- "<|im_start|>",
- "<|im_end|>"
- ],
+ "additional_special_tokens": [],
  "bos_token": "<|endoftext|>",
  "clean_up_tokenization_spaces": true,
  "encoding_name": null,
@@ -49,7 +150,5 @@
  "model_name": "gpt-4",
  "pad_token": "<|pad|>",
  "tokenizer_class": "GPT2Tokenizer",
- "unk_token": "<|endoftext|>",
- "use_default_system_prompt": true,
- "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not 'system' in messages[0]['role'] %}{% set loop_messages = messages %}{% set system_message = 'You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\nYOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\nYou assist with various tasks, from writing to coding (using markdown for code blocks — remember to use ``` with code, JSON, and tables).\n(You do not have real-time data access or code execution capabilities. You avoid stereotyping and provide balanced perspectives on controversial topics. You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\nThis is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER\\'S QUERY.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if system_message != false %}{{ '<|im_start|>system\n' + system_message.strip() + '<|im_end|>\n'}}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% else %}{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% endif %}{% if (add_generation_prompt == true and loop.last) %}{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}{% endif %}{% endfor %}"
+ "unk_token": "<|endoftext|>"
  }
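
Why a dedicated pad id (100256) matters is easiest to see in a training collator: pad positions can be dropped from the loss without also silencing eos. A rough sketch, assuming `transformers` plus PyTorch and the same placeholder local path:

```python
import torch
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./dbrx-instruct-tokenizer")  # placeholder path

texts = [
    "short prompt" + tok.eos_token,
    "a longer prompt that forces the first one to be padded" + tok.eos_token,
]
batch = tok(texts, padding=True, return_tensors="pt")

# Mask only the pad positions in the labels. Because <|pad|> (100256) and
# <|endoftext|> are different ids, the eos the model must learn to emit
# still contributes to the loss.
labels = batch["input_ids"].clone()
labels[labels == tok.pad_token_id] = -100
print(labels)
```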
vocab.json CHANGED
@@ -100260,7 +100260,20 @@
  "<|fim_middle|>": 100259,
  "<|fim_suffix|>": 100260,
  "<|endofprompt|>": 100276,
- "<|pad|>": 100277,
- "<|im_start|>": 100278,
- "<|im_end|>": 100279
+ "<|pad|>": 100256,
+ "<|extra_0|>": 100261,
+ "<|extra_1|>": 100262,
+ "<|extra_2|>": 100263,
+ "<|extra_3|>": 100264,
+ "<|extra_4|>": 100265,
+ "<|extra_5|>": 100266,
+ "<|extra_6|>": 100267,
+ "<|extra_7|>": 100268,
+ "<|extra_8|>": 100269,
+ "<|extra_9|>": 100270,
+ "<|extra_10|>": 100271,
+ "<|extra_11|>": 100272,
+ "<|extra_12|>": 100273,
+ "<|extra_13|>": 100274,
+ "<|extra_14|>": 100275
  }
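
Because the `<|extra_N|>` ids already exist in vocab.json, a fine-tune can claim one of them as a custom marker without resizing the embedding table. A hedged sketch under the same placeholder-path assumption (using `<|extra_0|>` as a tool marker is purely illustrative):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./dbrx-instruct-tokenizer")  # placeholder path

# Hypothetical: repurpose the first reserved token as a custom control marker.
MARKER = "<|extra_0|>"
marker_id = tok.convert_tokens_to_ids(MARKER)
assert marker_id == 100261  # matches the vocab.json entry added in this commit

# The encoder now actually emits the reserved id when the marker appears in text.
ids = tok("call the tool here " + MARKER + " then continue")["input_ids"]
print(marker_id in ids, ids)
```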