HyperAccel committed on
Commit
c966327
·
verified ·
1 Parent(s): 543258e

Upload tokenizer from kimi_linear

Browse files
chat_template.jinja ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
{#- Render one message's content. A plain string passes through verbatim; a
    list of parts renders each image part as the media placeholder token run
    and each other part as its 'text'. All added comments use "-" trimming on
    both sides so the rendered output is unchanged. -#}
{% macro render_content(msg) -%}
{%- set c = msg.get('content') -%}
{%- if c is string -%}
{{ c }}
{%- elif c is not none -%}
{% for content in c -%}
{% if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
<|media_start|>image<|media_content|><|media_pad|><|media_end|>
{% else -%}
{{ content['text'] }}
{%- endif -%}
{%- endfor -%}
{%- endif -%}
{%- endmacro %}


{#- If tools are supplied, declare them once in a system turn as compact JSON.
    Then emit every conversation turn between role markers; assistant tool
    calls and tool results get dedicated delimited sections. -#}
{%- if tools -%}
<|im_system|>tool_declare<|im_middle|>{{ tools | tojson(separators=(',', ':')) }}<|im_end|>
{%- endif -%}
{% for message in messages %}
{%- set role_name = message.get('name') or message['role'] -%}
{%- if message['role'] == 'user' -%}
<|im_user|>{{role_name}}<|im_middle|>
{%- elif message['role'] == 'assistant' -%}
<|im_assistant|>{{role_name}}<|im_middle|>
{%- else -%}
<|im_system|>{{role_name}}<|im_middle|>
{%- endif -%}

{%- if message['role'] == 'assistant' and message.get('tool_calls') -%}
{{render_content(message)}}<|tool_calls_section_begin|>
{%- for tool_call in message['tool_calls'] -%}
{%- set formatted_id = tool_call['id'] -%}
<|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{% if tool_call['function']['arguments'] is string %}{{ tool_call['function']['arguments'] }}{% else %}{{ tool_call['function']['arguments'] | tojson }}{% endif %}<|tool_call_end|>
{%- endfor -%}
<|tool_calls_section_end|>
{%- elif message['role'] == 'tool' -%}
{%- set tool_call_id = message.tool_call_id -%}
## Return of {{ tool_call_id }}
{{render_content(message)}}
{%- elif message['content'] is not none -%}
{{render_content(message)}}
{%- endif -%}
<|im_end|>
{%- endfor -%}
{%- if add_generation_prompt -%}
<|im_assistant|>assistant<|im_middle|>
{%- endif -%}
tiktoken.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6c497a7469b33ced9c38afb1ad6e47f03f5e5dc05f15930799210ec050c5103
3
+ size 2795286
tokenization_kimi.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tiktoken
3
+
4
+ from logging import getLogger
5
+ from pathlib import Path
6
+ from typing import (
7
+ cast,
8
+ Tuple,
9
+ Dict,
10
+ Iterator,
11
+ List,
12
+ Union,
13
+ Optional,
14
+ )
15
+ from shutil import copyfile
16
+ from tiktoken.load import load_tiktoken_bpe
17
+ from tokenizers import AddedToken, pre_tokenizers, Regex
18
+ from transformers.tokenization_utils import PreTrainedTokenizer
19
+ from transformers.convert_slow_tokenizer import bytes_to_unicode
20
+ from typing import Any
21
+

logger = getLogger(__name__)
# PreTrainedTokenizer machinery: init kwarg name -> on-disk vocabulary filename
# (also used by save_vocabulary below).
VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}


class TikTokenTokenizer(PreTrainedTokenizer):
    """
    Tokenizing and encoding/decoding text using the Tiktoken tokenizer.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the
    main methods. Users should refer to this superclass for more information
    regarding those methods.

    Args:
        vocab_file (`str`):
            The path to the Tiktoken model file.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[BOS]"`):
            The beginning of sequence token that was used during pretraining.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[EOS]"`):
            The end of sequence token.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*):
            The unknown token. Tokens not in the vocabulary are mapped to this
            token's id by `_convert_token_to_id`.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*):
            The token used for padding, for example when batching sequences of
            different lengths.
        additional_special_tokens (list of `str`, *optional*):
            Tokens marked as `special`, meaning they will be skipped when
            decoding if `skip_special_tokens` is set to `True`.
        added_tokens_decoder (`dict`, *optional*):
            Mapping of token id -> `AddedToken` (normally loaded from
            `tokenizer_config.json`) used to name the reserved special-token
            id slots appended after the BPE vocabulary.
    """

    vocab_files_names = VOCAB_FILES_NAMES

    model_input_names = ["input_ids", "attention_mask"]

    # name -> id for the special-token id range appended after the BPE vocab.
    special_tokens: Dict[str, int]

    num_reserved_special_tokens = 256

    # Tiktoken split regex: Han runs, case-aware word pieces excluding Han
    # (with English contraction suffixes), 1-3 digit groups, punctuation runs,
    # and whitespace handling.
    pat_str = "|".join(
        [
            r"""[\p{Han}]+""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""\p{N}{1,3}""",
            r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
            r"""\s*[\r\n]+""",
            r"""\s+(?!\S)""",
            r"""\s+""",
        ]
    )

    def __init__(
        self,
        vocab_file,
        bos_token: Union[str, AddedToken] = "[BOS]",
        eos_token: Union[str, AddedToken] = "[EOS]",
        unk_token: Union[str, AddedToken, None] = None,
        pad_token: Union[str, AddedToken, None] = None,
        additional_special_tokens: Optional[List[str]] = None,
        added_tokens_decoder: Optional[dict] = None,
        **kwargs,
    ):
        # Raise instead of `assert`: asserts are stripped under `python -O`.
        if not os.path.isfile(vocab_file):
            raise FileNotFoundError(f"Tiktoken vocab file not found: {vocab_file}")

        if additional_special_tokens is None:
            additional_special_tokens = [
                "<|im_end|>",
                "<|im_user|>",
                "<|im_assistant|>",
                "<|start_header_id|>",
                "<|end_header_id|>",
                "[EOT]",
                "<|im_system|>",
                "<|im_middle|>",
            ]

        # The original iterated `added_tokens_decoder` unconditionally, so the
        # declared default of None crashed with TypeError; treat it as empty.
        if added_tokens_decoder is None:
            added_tokens_decoder = {}
        special_tokens_mapping = {
            i: added_tokens_decoder[i].content for i in added_tokens_decoder
        }

        self.vocab_file = vocab_file
        mergeable_ranks = load_tiktoken_bpe(vocab_file)
        num_base_tokens = len(mergeable_ranks)
        # Ids after the BPE table are special slots; unnamed ones become
        # "<|reserved_token_i|>" placeholders.
        self.special_tokens = {
            special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
            for i in range(
                num_base_tokens, num_base_tokens + self.num_reserved_special_tokens + 2
            )
        }

        self.model = tiktoken.Encoding(
            name=Path(vocab_file).name,
            pat_str=self.pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        logger.info("Reloaded tiktoken model from %s", vocab_file)

        self.n_words: int = self.model.n_vocab
        # BOS / EOS token IDs
        self.bos_id: int = self.special_tokens[str(bos_token)]
        self.eos_id: int = self.special_tokens[str(eos_token)]
        logger.info(
            "#words: %d - BOS ID: %d - EOS ID: %d",
            self.n_words, self.bos_id, self.eos_id,
        )

        # The original did an unconditional lookup here, so leaving pad/unk at
        # their default None raised KeyError('None'); resolve only when given.
        self.pad_id: Optional[int] = (
            self.special_tokens[str(pad_token)] if pad_token is not None else None
        )
        self.unk_id: Optional[int] = (
            self.special_tokens[str(unk_token)] if unk_token is not None else None
        )

        # GPT-2 style byte <-> printable-unicode tables so byte-level tokens can
        # be exposed as strings (see convert_tokens_to_string).
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}

        # id -> printable token string.
        self.decoder: Dict[int, str] = {}
        for i in range(self.n_words):
            # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
            decoding = ''.join([
                self.byte_encoder[ord(char)] for char in
                self.model.decode_single_token_bytes(i).decode('latin-1')
            ])
            self.decoder[i] = decoding

        # printable token string -> id. Every id in range was inserted into
        # self.decoder above, so the original `if i in self.decoder` check was
        # redundant.
        self.encoder: Dict[str, int] = {
            token: i for i, token in self.decoder.items()
        }

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )
        self.all_special_ids_set = set(self.all_special_ids)

    def encode(
        self,
        text: str,
        allow_special_tokens: bool = True,
        **kwargs
    ) -> List[int]:
        """
        Encodes a string into a list of token IDs.

        Args:
            text (str): The input string to be encoded.
            allow_special_tokens (bool): When True, special-token text in the
                input is encoded to its special id; otherwise it is treated as
                plain text.

        Returns:
            list[int]: A list of token IDs.
        """
        # If there are other args, delegate to super().encode, which contains a
        # lot of handling for them and ultimately calls _tokenize and
        # _convert_token_to_id.
        # NOTE: this encode method is not fully compatible with super().encode,
        # e.g. split_special_tokens' default is True in this encode method.
        if len(kwargs) > 0:
            logger.warning("Calling super().encode with %s", kwargs)
            return super().encode(text, **kwargs)

        if not isinstance(text, str):
            raise TypeError(f"expected str, got {type(text).__name__}")

        # The tiktoken tokenizer can handle <=400k chars without
        # pyo3_runtime.PanicException.
        TIKTOKEN_MAX_ENCODE_CHARS = 400_000

        # https://github.com/openai/tiktoken/issues/195
        # Here we iterate over subsequences and split if we exceed the limit
        # of max consecutive non-whitespace or whitespace characters.
        MAX_NO_WHITESPACES_CHARS = 25_000

        texts = self.pre_tokenizer_process(text)

        all_substrs = []
        for part in texts:  # renamed from `text`, which shadowed the parameter
            substrs = (
                substr
                for i in range(0, len(part), TIKTOKEN_MAX_ENCODE_CHARS)
                for substr in self._split_whitespaces_or_nonwhitespaces(
                    part[i: i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
                )
            )
            all_substrs.extend(substrs)

        # Choose the special-token policy once instead of re-branching per chunk.
        encode_kwargs = (
            {"allowed_special": "all"}
            if allow_special_tokens
            else {"disallowed_special": ()}
        )
        t: List[int] = []
        for substr in all_substrs:
            t.extend(self.model.encode(substr, **encode_kwargs))

        return t

    def decode(
        self,
        token_ids: Union[int, List[int]],
        **kwargs
    ) -> str:
        """
        Decodes a list of token IDs into a string.

        Args:
            token_ids (int or List[int]): The token ID(s) to be decoded.

        Returns:
            str: The decoded string.
        """
        # If there are other args, delegate to super().decode, which ultimately
        # calls convert_tokens_to_string and _convert_id_to_token.
        if len(kwargs) > 0:
            return super().decode(token_ids, **kwargs)

        if isinstance(token_ids, int):
            token_ids = [token_ids]

        return self.model.decode(cast(List[int], token_ids))

    @staticmethod
    def _split_whitespaces_or_nonwhitespaces(
        s: str, max_consecutive_slice_len: int
    ) -> Iterator[str]:
        """
        Splits the string `s` so that each substring contains no more than
        `max_consecutive_slice_len` consecutive whitespaces or consecutive
        non-whitespaces.
        """
        current_slice_len = 0
        current_slice_is_space = s[0].isspace() if len(s) > 0 else False
        slice_start = 0

        for i in range(len(s)):
            is_now_space = s[i].isspace()

            if current_slice_is_space ^ is_now_space:
                # Character class flipped: reset the run counter.
                current_slice_len = 1
                current_slice_is_space = is_now_space
            else:
                current_slice_len += 1
                if current_slice_len > max_consecutive_slice_len:
                    yield s[slice_start:i]
                    slice_start = i
                    current_slice_len = 1
        yield s[slice_start:]

    def pre_tokenizer_process(self, text: str) -> List[str]:
        """
        Pre-tokenizes the input text into a list of chunks.

        Hook for splitting the input into smaller chunks for internal
        processing; the base implementation returns the text unchanged.
        """
        return [text]

    # ----- Below are the abstract methods required by PreTrainedTokenizer -----
    @property
    def vocab_size(self) -> int:
        """Total vocabulary size, including the reserved special-token slots."""
        return self.n_words

    def get_vocab(self) -> Dict[str, int]:
        """Return the printable-token -> id mapping."""
        return self.encoder

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        """Tokenize `text` into printable token strings."""
        return [
            self.decoder[t]
            for t in self.encode(text)
        ]

    def _convert_token_to_id(self, token: str) -> int:
        """Map a printable token string to its id, falling back to unk_id."""
        return self.encoder.get(token, self.unk_id)

    def _convert_id_to_token(self, index: int) -> Optional[str]:
        """Map an id back to its printable token string (None if unknown)."""
        return self.decoder.get(index)

    @staticmethod
    def clean_up_tokenization(out_string: str) -> str:
        """No-op: byte-level decoding already reproduces the exact text."""
        return out_string

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Join printable tokens and decode the underlying bytes as UTF-8."""
        text = ''.join(tokens)
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', 'replace')
        return text

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Copy the tiktoken model file into `save_directory`; return its path."""
        if not os.path.isdir(save_directory):
            raise ValueError(f"vocabulary path ({save_directory}) should be a directory")
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)

    def apply_chat_template(
        self,
        conversation,
        tools: Optional[List[dict]] = None,
        tokenize: bool = False,
        add_generation_prompt: bool = True,
        **kwargs
    ):
        """Render the chat template after canonicalizing `tools` key order."""
        # deep_sort_dict (module level, defined below) makes the tool JSON key
        # order deterministic so the serialized declaration is stable.
        tools = deep_sort_dict(tools)
        return super().apply_chat_template(
            conversation,
            tools=tools,
            tokenize=tokenize,
            add_generation_prompt=add_generation_prompt,
            **kwargs,
        )
341
+
def deep_sort_dict(obj: Any) -> Any:
    """Return a deep copy of *obj* with every dict's keys in sorted order.

    Lists are copied element-wise with the same transformation applied;
    any other value is returned unchanged. Used by apply_chat_template to
    canonicalize the `tools` payload before serialization.
    """
    if isinstance(obj, list):
        return [deep_sort_dict(entry) for entry in obj]
    if not isinstance(obj, dict):
        return obj
    return {key: deep_sort_dict(obj[key]) for key in sorted(obj)}
tokenizer_config.json ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "163584": {
4
+ "content": "[BOS]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "163585": {
12
+ "content": "[EOS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "163586": {
20
+ "content": "<|im_end|>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "163587": {
28
+ "content": "<|im_user|>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "163588": {
36
+ "content": "<|im_assistant|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "163590": {
44
+ "content": "<|start_header_id|>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "163591": {
52
+ "content": "<|end_header_id|>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "163593": {
60
+ "content": "[EOT]",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "163594": {
68
+ "content": "<|im_system|>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "163601": {
76
+ "content": "<|im_middle|>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "163838": {
84
+ "content": "[UNK]",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "163839": {
92
+ "content": "[PAD]",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ }
99
+ },
100
+ "auto_map": {
101
+ "AutoTokenizer": [
102
+ "tokenization_kimi.TikTokenTokenizer",
103
+ null
104
+ ]
105
+ },
106
+ "backend": "custom",
107
+ "bos_token": "[BOS]",
108
+ "clean_up_tokenization_spaces": false,
109
+ "eos_token": "[EOS]",
110
+ "extra_special_tokens": [
111
+ "<|im_end|>",
112
+ "<|im_user|>",
113
+ "<|im_assistant|>",
114
+ "<|start_header_id|>",
115
+ "<|end_header_id|>",
116
+ "[EOT]",
117
+ "<|im_system|>",
118
+ "<|im_middle|>"
119
+ ],
120
+ "is_local": false,
121
+ "model_max_length": 1000000000000000019884624838656,
122
+ "pad_token": "[PAD]",
123
+ "tokenizer_class": "TikTokenTokenizer",
124
+ "unk_token": "[UNK]"
125
+ }