rajammanabrolu committed
Commit • bc6270a
1 Parent(s): 32210b9

Update tiktoken.py

Files changed: tiktoken.py (+78 -89)

tiktoken.py
CHANGED
@@ -1,8 +1,7 @@
 # Copyright 2022 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
-
-import warnings
-from typing import Any, Dict, List, Optional, Tuple, Union
+from functools import lru_cache
+from typing import Any, Dict, List, Optional, Tuple
 
 import torch
 from transformers import PreTrainedTokenizer
@@ -10,6 +9,38 @@ from transformers import PreTrainedTokenizer
 DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible."""
 
 
+# Taken from
+# https://github.com/huggingface/transformers/blob/8aca43bdb3cb9a5020f6d57589d85679dc873b1c/src/transformers/models/gpt2/tokenization_gpt2.py#L62-L84
+@lru_cache()
+def bytes_to_unicode():
+    """Returns list of utf-8 byte and a mapping to unicode strings.
+
+    We specifically avoids mapping to whitespace/control characters the bpe code
+    barfs on.
+
+    The reversible bpe codes work on unicode strings. This means you need a
+    large # of unicode characters in your vocab if you want to avoid UNKs. When
+    you're at something like a 10B token dataset you end up needing around 5K
+    for decent coverage. This is a significant percentage of your normal, say,
+    32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and
+    unicode strings.
+    """
+    bs = (list(range(ord('!'),
+                     ord('~') + 1)) + list(range(ord('¡'),
+                                                 ord('¬') + 1)) +
+          list(range(ord('®'),
+                     ord('ÿ') + 1)))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
 class TiktokenTokenizerWrapper(PreTrainedTokenizer):
     """A thin wrapper around tiktoken to make it compatible with Hugging Face.
 
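For intuition, here is a small sketch (not part of the commit) of what this lookup table produces; the specific values follow the standard GPT-2 byte-level convention, e.g. the space byte becomes 'Ġ':

mapping = bytes_to_unicode()
assert mapping[ord('A')] == 'A'  # printable ASCII bytes map to themselves
assert mapping[32] == 'Ġ'        # the space byte is remapped to a printable character
assert mapping[10] == 'Ċ'        # newline as well
# The table covers all 256 byte values exactly once, so it can be inverted:
byte_decoder = {v: k for k, v in mapping.items()}
assert len(byte_decoder) == 256

Because every byte gets a printable stand-in, any token's raw bytes can be rendered as a plain string and recovered later.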
@@ -93,6 +124,28 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
         self.add_eos_token = add_eos_token
         self.use_default_system_prompt = use_default_system_prompt
 
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+
+        self.decoder = {}
+        for i in range(self.encoding.n_vocab):
+            try:
+                self.encoding.decode_single_token_bytes(i)
+            except KeyError:
+                continue
+            # Taken from
+            # https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
+            decoding = ''.join([
+                bytes_to_unicode()[ord(char)] for char in
+                self.encoding.decode_single_token_bytes(i).decode('latin-1')
+            ])
+            self.decoder[i] = decoding
+
+        self.encoder = {}
+        for i in range(self.encoding.n_vocab):
+            if i in self.decoder:
+                self.encoder[self.decoder[i]] = i
+
         super().__init__(model_name=model_name,
                          encoding_name=encoding_name,
                          add_bos_token=add_bos_token,
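To see what the new decoder holds, here is a hedged sketch of the same per-token transformation done outside the class; it assumes the tiktoken package is installed and uses the 'gpt2' encoding purely for illustration:

import tiktoken

enc = tiktoken.get_encoding('gpt2')
byte_encoder = bytes_to_unicode()

token_id = enc.encode(' the')[0]                    # ' the' is a single BPE token in this encoding
raw = enc.decode_single_token_bytes(token_id)       # b' the'
token_str = ''.join(byte_encoder[b] for b in raw)   # iterating bytes yields ints; same result as the latin-1 trick above
print(token_str)                                    # 'Ġthe'

The loop in __init__ does this for every id up to self.encoding.n_vocab, skipping ids tiktoken cannot decode, so self.decoder maps ids to printable token strings and self.encoder is its inverse.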
@@ -140,117 +193,53 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
         Note: This function does not work properly due to difference in assumptions between tiktoken and Hugging Face tokenizers.
         Most uses do not need to use get_vocab, so this is not a priority to fix.
         """
-        warnings.warn(
-            'get_vocab does not work properly with TiktokenTokenizerWrapper. Please do not rely on it being perfectly correct.'
-            +
-            ' It will be called once init just to get the size of the vocab inside the base class.'
-        )
-
-        vocab = {}
-        for i in range(self.vocab_size):
-            try:
-                # need to try this first, so that we get a proper KeyError,
-                # otherwise it crashes in the rust code
-                _ = self.encoding.decode_single_token_bytes(i)
-                vocab[self.encoding.decode([i])] = i
-            except KeyError:
-                pass
-
         # As far as I can tell, we don't require get_vocab to completely work,
         # but when using additional_special_tokens, Hugging Face determines the next
         # token index to add with len(self.get_vocab()) so we need the _size_ of this dictionary to be correct.
+        vocab_clone = self.encoder.copy()
         extra_id_index = 0
         candidate_extra_id = f'<extra_id_{extra_id_index}>'
         indices_to_fill_in = {i for i in range(self.vocab_size)} - set(
-            vocab.values())
+            vocab_clone.values())
 
         # Add enough indices to make get_vocab() the right length
         for index_to_add in indices_to_fill_in:
             # Make sure we don't overwrite a token that already exists
-            while candidate_extra_id in vocab:
+            while candidate_extra_id in vocab_clone:
                 extra_id_index += 1
                 candidate_extra_id = f'<extra_id_{extra_id_index}>'
 
             # Get an index to add and add the item
-            vocab[candidate_extra_id] = index_to_add
+            vocab_clone[candidate_extra_id] = index_to_add
 
-        return vocab
+        return vocab_clone
 
-    def _tokenize(self, text: str) -> List[int]:
-        """Returns a tokenized string.
-
-        Note: We have slightly redefined the expected contract between this method and
-        the _convert_token_to_id method. Typically, this method turns a string into a list of strings,
-        and then the _convert_token_to_id method turns that list of strings into a list of integers.
-        However, not all vocab indices can be decoded into a string, so instead we just return the integers
-        from this function, and have adjusted the _convert_token_to_id method to handle integers as well as strings.
-        The only use of _tokenize that I could find was in this way, so this _should_ be safe.
-        """
+    def _tokenize(self, text: str) -> List[str]:
+        """Returns a tokenized string."""
         if not isinstance(text, str):
             raise ValueError(
                 f'Expected a string input to _tokenize but got {type(text)}.')
 
-        tokens = [t for t in self.encoding.encode(text, allowed_special='all')]
+        tokens = [
+            self.decoder[t]
+            for t in self.encoding.encode(text, allowed_special='all')
+        ]
 
         return tokens
 
-    def _convert_token_to_id(self, token: Union[int, str]) -> int:
-        """Converts a token (str) in an id using the vocab."""
-        if isinstance(token, int):
-            return token
-
-        return self.encoding.encode(token, allowed_special='all')[0]
+    def _convert_token_to_id(self, token: str):
+        """Converts a token (str) in an id using the vocab."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
 
-    def _convert_id_to_token(self, index: int) -> str:
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.encoding.decode([index])
+    def _convert_id_to_token(self, index: int):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.decoder.get(index)
 
-    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+    def convert_tokens_to_string(self, tokens: List[str]):
         """Converts a sequence of tokens (string) in a single string."""
-        return ''.join(tokens)
-
-    def convert_ids_to_tokens(
-            self,
-            ids: Union[int, List[int]],
-            skip_special_tokens: bool = False) -> Union[str, List[str]]:
-        """Converts a single index or a sequence of indices into a token or a.
-
-        sequence of tokens, using the vocabulary and added tokens.
-
-        Args:
-            ids (`int` or `List[int]`):
-                The token id (or token ids) to convert to tokens.
-            skip_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not to remove special tokens in the decoding.
-
-        Returns:
-            `str` or `List[str]`: The decoded token(s).
-        """
-        if isinstance(ids, int):
-            if ids in self.added_tokens_decoder:
-                return str(self.added_tokens_decoder[ids])
-
-            return self._convert_id_to_token(ids)
-
-        # current_stream will collect multiple tokens, and then separately add items
-        # for each added token. This is done so that decode works properly with token ids
-        # that cannot be represented naively in utf-8.
-        tokens = []
-        current_stream = []
-        for index in ids:
-            if skip_special_tokens and index in self.all_special_ids:
-                continue
-
-            if index in self.added_tokens_decoder:
-                tokens.append(self.encoding.decode(current_stream))
-                current_stream = []
-                tokens.append(str(self.added_tokens_decoder[index]))
-            else:
-                current_stream.append(index)
-
-        if len(current_stream) > 0:
-            tokens.append(self.encoding.decode(current_stream))
-        return tokens
+        text = ''.join(tokens)
+        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8')
+        return text
 
     def build_inputs_with_special_tokens(
             self,
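Taken together, the reworked methods give a byte-level round trip. A hedged, standalone sketch of that path (again assuming the tiktoken package and the 'gpt2' encoding, without instantiating the wrapper):

import tiktoken

enc = tiktoken.get_encoding('gpt2')
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

# Rebuild the id -> printable-string table the way __init__ does above.
decoder = {}
for i in range(enc.n_vocab):
    try:
        raw = enc.decode_single_token_bytes(i)
    except KeyError:
        continue
    decoder[i] = ''.join(byte_encoder[b] for b in raw)

text = 'hello world'
# _tokenize: encode to ids, then map each id to its printable token string.
tokens = [decoder[t] for t in enc.encode(text, allowed_special='all')]
# convert_tokens_to_string: map printable characters back to bytes, then decode utf-8.
roundtrip = bytearray([byte_decoder[c] for c in ''.join(tokens)]).decode('utf-8')
assert roundtrip == text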
@@ -360,4 +349,4 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
         return tensor
 
 
-TiktokenTokenizerWrapper.register_for_auto_class()
+TiktokenTokenizerWrapper.register_for_auto_class()
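Since the class registers itself for the auto classes, a checkpoint repository that ships this tiktoken.py can hand the wrapper back through the standard loading path. A hedged usage sketch; the repository id is a placeholder, not a real model:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    'your-org/your-tiktoken-model',  # hypothetical repo that includes this tiktoken.py
    trust_remote_code=True,          # needed so the custom tokenizer class is imported from the repo
)
ids = tokenizer('hello world')['input_ids']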