rajammanabrolu committed
Commit e1eee7f
1 Parent(s): bc6270a

Update tiktoken.py

Files changed (1)
  1. tiktoken.py +29 -15
tiktoken.py CHANGED
@@ -127,7 +127,7 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
         self.byte_encoder = bytes_to_unicode()
         self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
 
-        self.decoder = {}
+        self.decoder: Dict[int, str] = {}
         for i in range(self.encoding.n_vocab):
             try:
                 self.encoding.decode_single_token_bytes(i)
@@ -141,7 +141,7 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
             ])
             self.decoder[i] = decoding
 
-        self.encoder = {}
+        self.encoder: Dict[str, int] = {}
         for i in range(self.encoding.n_vocab):
             if i in self.decoder:
                 self.encoder[self.decoder[i]] = i
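The first two hunks only add type annotations, but the maps they annotate carry the whole byte-level scheme: every decodable tiktoken id becomes a printable string via the GPT-2 bytes_to_unicode table. A minimal sketch of that mapping for one id, not part of the commit (the cl100k_base encoding name and the sample id are illustrative assumptions):

import tiktoken
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode

encoding = tiktoken.get_encoding('cl100k_base')  # illustrative encoding name
byte_encoder = bytes_to_unicode()  # byte value -> printable unicode char

# One id -> printable token string, as the decoder loop above does for the
# whole vocab. Ids with no byte representation raise inside
# decode_single_token_bytes, which is why the loop wraps it in try/except.
token_id = 9906  # an arbitrary regular token id
token_bytes = encoding.decode_single_token_bytes(token_id)
token_str = ''.join(byte_encoder[b] for b in token_bytes)
print(token_id, '->', token_str)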
 
@@ -173,12 +173,30 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
         Pinning default Chat ML template in case defaults change.
         """
         template = (
-            "{% set system_message = '' %}"
-            '{% if USE_DEFAULT_PROMPT == true %}'
-            "{{'<|im_start|>system\n' + 'DEFAULT_SYSTEM_PROMPT'}}"
+            "{% if messages[0]['role'] == 'system' %}"
+            '{% set loop_messages = messages[1:] %}'
+            "{% set system_message = messages[0]['content'] %}"
+            "{% elif USE_DEFAULT_PROMPT == true and not 'system' in messages[0]['role'] %}"
+            '{% set loop_messages = messages %}'
+            "{% set system_message = 'DEFAULT_SYSTEM_PROMPT' %}"
+            '{% else %}'
+            '{% set loop_messages = messages %}'
+            '{% set system_message = false %}'
+            '{% endif %}'
+            '{% for message in loop_messages %}'
+            '{% if loop.index0 == 0 %}'
+            '{% if system_message != false %}'
+            "{{ '<|im_start|>system\n' + system_message.strip() + '\n'}}"
+            '{% endif %}'
+            "{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}"
+            '{% else %}'
+            "{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}"
+            '{% endif %}'
+            '{% if (add_generation_prompt == true) %}'
+            "{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}"
+            "{% elif (message['role'] == 'assistant') %}"
+            '{{ eos_token }}'
             '{% endif %}'
-            '{% for message in messages %}'
-            "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
             '{% endfor %}')
         template = template.replace(
             'USE_DEFAULT_PROMPT',
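The easiest way to judge the new template is to render it. A minimal sketch, not part of the commit, that pushes the string above through jinja2 roughly the way apply_chat_template would; the messages, the eos_token value, and the USE_DEFAULT_PROMPT -> 'false' substitution (the use_default_system_prompt == False case) are illustrative assumptions:

from jinja2 import Template

# The template string built above, with the wrapper's placeholder
# substitution already applied.
template = (
    "{% if messages[0]['role'] == 'system' %}"
    '{% set loop_messages = messages[1:] %}'
    "{% set system_message = messages[0]['content'] %}"
    "{% elif USE_DEFAULT_PROMPT == true and not 'system' in messages[0]['role'] %}"
    '{% set loop_messages = messages %}'
    "{% set system_message = 'DEFAULT_SYSTEM_PROMPT' %}"
    '{% else %}'
    '{% set loop_messages = messages %}'
    '{% set system_message = false %}'
    '{% endif %}'
    '{% for message in loop_messages %}'
    '{% if loop.index0 == 0 %}'
    '{% if system_message != false %}'
    "{{ '<|im_start|>system\n' + system_message.strip() + '\n'}}"
    '{% endif %}'
    "{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}"
    '{% else %}'
    "{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}"
    '{% endif %}'
    '{% if (add_generation_prompt == true) %}'
    "{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}"
    "{% elif (message['role'] == 'assistant') %}"
    '{{ eos_token }}'
    '{% endif %}'
    '{% endfor %}').replace('USE_DEFAULT_PROMPT', 'false')

messages = [
    {'role': 'system', 'content': 'You are terse.'},
    {'role': 'user', 'content': 'Hi'},
    {'role': 'assistant', 'content': 'Hello!'},
]
print(Template(template).render(messages=messages,
                                add_generation_prompt=False,
                                eos_token='<|endoftext|>'))
# Expected output:
# <|im_start|>system
# You are terse.
# <|im_start|>user
# Hi<|im_end|>
# <|im_start|>assistant
# Hello!<|im_end|><|endoftext|>

The render shows the two behavioral changes: an explicit system message now takes precedence over the default and is emitted once at the top rather than as an ordinary turn, and eos_token is appended after each assistant turn unless add_generation_prompt asks for a fresh assistant header instead.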
 
@@ -188,11 +206,7 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
         return template
 
     def get_vocab(self) -> Dict[str, int]:
-        """Returns vocab as a dict.
-
-        Note: This function does not work properly due to difference in assumptions between tiktoken and Hugging Face tokenizers.
-        Most uses do not need to use get_vocab, so this is not a priority to fix.
-        """
+        """Returns vocab as a dict."""
         # As far as I can tell, we don't require get_vocab to completely work,
         # but when using additional_special_tokens, Hugging Face determines the next
         # token index to add with len(self.get_vocab()) so we need the _size_ of this dictionary to be correct.
 
@@ -227,15 +241,15 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
 
         return tokens
 
-    def _convert_token_to_id(self, token: str):
+    def _convert_token_to_id(self, token: str) -> Optional[int]:
         """Converts a token (str) in an id using the vocab."""
         return self.encoder.get(token, self.encoder.get(self.unk_token))
 
-    def _convert_id_to_token(self, index: int):
+    def _convert_id_to_token(self, index: int) -> Optional[str]:
         """Converts an index (integer) in a token (str) using the vocab."""
         return self.decoder.get(index)
 
-    def convert_tokens_to_string(self, tokens: List[str]):
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
         """Converts a sequence of tokens (string) in a single string."""
         text = ''.join(tokens)
         text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8')
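The last hunk only adds return types, but convert_tokens_to_string is the inverse of the decoder construction in the first hunks, and a round trip makes that concrete. A minimal sketch, not part of the commit (encoding name and sample text are illustrative assumptions):

import tiktoken
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode

encoding = tiktoken.get_encoding('cl100k_base')  # illustrative
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

ids = encoding.encode('héllo world')
# Printable token strings, as stored in self.decoder at __init__ time.
tokens = [
    ''.join(byte_encoder[b] for b in encoding.decode_single_token_bytes(i))
    for i in ids
]
# convert_tokens_to_string: undo the byte-to-unicode map, then utf-8 decode.
text = bytearray([byte_decoder[c] for c in ''.join(tokens)]).decode('utf-8')
assert text == 'héllo world'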