rajammanabrolu committed on
Commit
cd5239e
1 Parent(s): fd5557b

Update tiktoken.py

Browse files
Files changed (1) hide show
  1. tiktoken.py +6 -3
tiktoken.py CHANGED
@@ -198,7 +198,7 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
198
  '{% else %}'
199
  "{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}"
200
  '{% endif %}'
201
- '{% if (add_generation_prompt == true) %}'
202
  "{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}"
203
  "{% elif (message['role'] == 'assistant') %}"
204
  '{{ eos_token }}'
@@ -253,7 +253,10 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
253
 
254
  def _convert_id_to_token(self, index: int) -> Optional[str]:
255
  """Converts an index (integer) in a token (str) using the vocab."""
256
- return self.decoder.get(index)
 
 
 
257
 
258
  def convert_tokens_to_string(self, tokens: List[str]) -> str:
259
  """Converts a sequence of tokens (string) in a single string."""
@@ -370,4 +373,4 @@ class TiktokenTokenizerWrapper(PreTrainedTokenizer):
370
  return tensor
371
 
372
 
373
- TiktokenTokenizerWrapper.register_for_auto_class()
 
198
  '{% else %}'
199
  "{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}"
200
  '{% endif %}'
201
+ '{% if (add_generation_prompt == true and loop.last) %}'
202
  "{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}"
203
  "{% elif (message['role'] == 'assistant') %}"
204
  '{{ eos_token }}'
 
253
 
254
  def _convert_id_to_token(self, index: int) -> Optional[str]:
255
  """Converts an index (integer) in a token (str) using the vocab."""
256
+ # For tokens in either the gap in ids in the tokenizer, or beyond the range of the tokenizer,
257
+ # we return empty string. This matches the behavior of Hugging Face fast tokenizers,
258
+ # but not slow tokenizers.
259
+ return self.decoder.get(index, '')
260
 
261
  def convert_tokens_to_string(self, tokens: List[str]) -> str:
262
  """Converts a sequence of tokens (string) in a single string."""
 
373
  return tensor
374
 
375
 
376
+ TiktokenTokenizerWrapper.register_for_auto_class()