tokenizer-arena / vocab /gpt_35_turbo /tiktoken_decode_test.py
eson's picture
add more tokenizers
f4973d4
raw
history blame
No virus
965 Bytes
"""
gpt_35_turbo decode UnicodeDecodeError 99413 b' \xe6\xb5'
gpt_35_turbo decode UnicodeDecodeError 99742 b'\x8c\xa8'
gpt_35_turbo decode UnicodeDecodeError 99834 b'\xad\x90'
gpt_35_turbo decode UnicodeDecodeError 100112 b'\xe0\xae\xbf\xe0\xae'
gpt_35_turbo decode KeyError 100256
gpt_35_turbo decode KeyError 100261
gpt_35_turbo decode KeyError 100262
gpt_35_turbo decode KeyError 100263
"""
import json
import tiktoken
def _probe(label, decode_fn, arg):
    """Call ``decode_fn(arg)`` and print either its result or the error raised.

    Args:
        label: Short name of the decode API being exercised (used in output).
        decode_fn: Callable taking a single argument (token id or id list).
        arg: The token id, or list of token ids, handed to ``decode_fn``.

    Returns:
        Whatever ``decode_fn`` returned, or ``None`` if it raised.
    """
    try:
        decoded = decode_fn(arg)
    except Exception as err:
        # This is a diagnostic probe: record the failure and keep going so the
        # remaining decode APIs are still exercised. Unlike a bare ``except:``,
        # KeyboardInterrupt / SystemExit still propagate.
        print(f"gpt_35_turbo {label} {type(err).__name__} {arg} {err}")
        return None
    print(f"gpt_35_turbo {label} ok {arg} {decoded!r}")
    return decoded


tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
tokens = [100263, 99834]

# Whole-sequence decoding. These calls were previously unguarded (and the
# first one was duplicated), so a failure on any id aborted the script before
# the per-token probes below could run.
_probe("decode", tokenizer.decode, tokens)
_probe(
    "_core_bpe.decode_bytes",
    lambda ids: tokenizer._core_bpe.decode_bytes(ids).decode("utf-8", errors="replace"),
    tokens,
)

# Per-token decoding through each public bytes-level API.
# NOTE(review): the original comment listed special tokens as
# "200257-100260 100276"; 200257 looks like a typo for 100257 - confirm
# against the encoding's special-token table.
for token_id in tokens:
    _probe("decode_tokens_bytes", tokenizer.decode_tokens_bytes, [token_id])
    _probe("decode_single_token_bytes", tokenizer.decode_single_token_bytes, token_id)
    _probe("decode_bytes", tokenizer.decode_bytes, [token_id])