In [1]:
from transformers import AutoTokenizer

 from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# Load the tokenizer published under 'Qwen/Qwen-7B'.
# NOTE(review): trust_remote_code=True presumably opts in to running tokenizer code
# shipped with that model repository — confirm the source is trusted and consider
# pinning `revision=` so the executed code cannot change between runs.
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen-7B', trust_remote_code=True)

# Encode and Decode

In [3]:
# Default behaviour: surface forms of special tokens found in the text are encoded
# as the actual special tokens. This is unsafe, but it is the default in order to
# stay compatible with other projects.
# Equivalent to:
#   tokenizer.encode("print('<|endoftext|>')<|endoftext|>", allowed_special='all', disallowed_special=())
tokenizer.encode("print('<|endoftext|>')<|endoftext|>")

[1350, 492, 151643, 863, 151643]

In [4]:
# Decode the ids produced by the default encoding in the previous cell;
# the result reproduces the original input string.
tokenizer.decode([1350, 492, 151643, 863, 151643])

"print('<|endoftext|>')<|endoftext|>"

In [5]:
# Safe variant: with no allowed special tokens and the disallowed check switched off,
# the text is encoded purely as text, which avoids injection attacks.
# The end-of-document id is then appended explicitly.
plain_text_ids = tokenizer.encode(
    "print('<|endoftext|>')",
    allowed_special=set(),
    disallowed_special=(),
)
plain_text_ids + [tokenizer.eod_id]

[1350, 11146, 91, 8691, 723, 427, 91, 79865, 151643]

In [6]:
# Decode the text-only encoding from the previous cell. The decoded string is the same
# as for the special-token encoding earlier, even though the ids differ: here only the
# trailing 151643 is the special token, the inner '<|endoftext|>' is ordinary text pieces.
tokenizer.decode([1350, 11146, 91, 8691, 723, 427, 91, 79865, 151643])

"print('<|endoftext|>')<|endoftext|>"

In [7]:
# Strict variant: treat texts just as texts (avoids injection attacks) and raise an
# error if the surface form of any special token is ever encountered in the input.
#
# The ValueError is the point of this demonstration, so it is caught and displayed
# instead of being left uncaught: an uncaught exception would abort
# "Restart Kernel -> Run All" before the remaining cells get executed.
try:
    strict_ids = tokenizer.encode(
        "print('<|endoftext|>')",
        allowed_special=set(),
        disallowed_special='all',
    ) + [tokenizer.eod_id]
except ValueError as err:
    # Same "ValueError: <message>" text the traceback's final line would show.
    print(f"{type(err).__name__}: {err}")
else:
    # Only reached if the tokenizer did not reject the input; show the ids as before.
    print(strict_ids)


ValueError: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


In [None]:
# Fine-grained control. Just keep this in mind:
#   - tokens listed in allowed_special are encoded as special tokens
#   - tokens listed in disallowed_special raise errors when encountered
#   - allowed_special has higher priority than disallowed_special
# '<|extra_0|>' is in neither collection here, so it is encoded as ordinary text.
chat_text = "<|im_start|>print('<|extra_0|>')<|im_end|>"
chat_ids = tokenizer.encode(
    chat_text,
    allowed_special={'<|im_start|>', '<|im_end|>'},
    disallowed_special=['<|endoftext|>'],
)
chat_ids + [tokenizer.eod_id]

[151644, 1350, 11146, 91, 15460, 62, 15, 91, 79865, 151645, 151643]

In [None]:
# Same text as the previous cell, but '<|extra_0|>' is now also allowed,
# so it is encoded as a single special token instead of text pieces.
chat_text_with_extra = "<|im_start|>print('<|extra_0|>')<|im_end|>"
chat_ids_with_extra = tokenizer.encode(
    chat_text_with_extra,
    allowed_special={'<|im_start|>', '<|im_end|>', '<|extra_0|>'},
    disallowed_special=['<|endoftext|>'],
)
chat_ids_with_extra + [tokenizer.eod_id]

[151644, 1350, 492, 151646, 863, 151645, 151643]

# Special Token Management

In [8]:
# The Hugging Face tokenizer has its own special-token mechanism, and so does tiktoken.
# Only the tiktoken mechanism is used for special tokens here, which means many
# properties of the Hugging Face tokenizer will be None.
tokenizer.unk_token

Using unk_token, but it is not set yet.


In [9]:
# No value is displayed for this Hugging Face property; use tokenizer.eod_id instead.
tokenizer.eos_token_id

In [10]:
# Likewise, no value is displayed for the Hugging Face pad token id.
tokenizer.pad_token_id 

In [11]:
# Use one of the extra special tokens, such as <|extra_0|>, instead
# (presumably as a stand-in where a pad/unk-style token is required — confirm for your use case).
# tokenizer.special_tokens maps a special token's surface form to its id.
tokenizer.special_tokens['<|extra_0|>']

151646

# Utility Methods

In [12]:
# convert_ids_to_tokens returns special tokens as str and regular tokens as bytes
# (since tiktoken operates on the bytes level).
# `ids` is the text-only encoding of "print('<|endoftext|>')" followed by the
# end-of-document id, as produced earlier in this notebook.
ids = [1350, 11146, 91, 8691, 723, 427, 91, 79865, 151643]
tokenizer.convert_ids_to_tokens(ids)

[b'print', b"('<", b'|', b'endo', b'ft', b'ext', b'|', b">')", '<|endoftext|>']

In [13]:
# Round trip: ids -> tokens (bytes / str) -> a single string.
tokens = tokenizer.convert_ids_to_tokens(ids)
tokenizer.convert_tokens_to_string(tokens)

"print('<|endoftext|>')<|endoftext|>"

In [14]:
# Encode a snippet that mixes special tokens, ASCII code, and non-ASCII text,
# then append the end-of-document id.
mixed_text = "<|im_start|>print('我是一只猫<|extra_0|>')\n#喵喵喵<|im_end|>"
mixed_ids = tokenizer.encode(
    mixed_text,
    allowed_special={'<|im_start|>', '<|im_end|>', '<|extra_0|>'},
    disallowed_special=['<|endoftext|>'],
)
ids = mixed_ids + [tokenizer.eod_id]

In [15]:
# Inspect the individual tokens: non-ASCII text shows up as raw byte sequences,
# while the allowed special tokens and the trailing end-of-document token are str.
tokenizer.convert_ids_to_tokens(ids)

['<|im_start|>',
 b'print',
 b"('",
 b'\xe6\x88\x91',
 b'\xe6\x98\xaf\xe4\xb8\x80',
 b'\xe5\x8f\xaa',
 b'\xe7\x8c\xab',
 '<|extra_0|>',
 b"')\n",
 b'#',
 b'\xe5\x96\xb5',
 b'\xe5\x96\xb5',
 b'\xe5\x96\xb5',
 '<|im_end|>',
 '<|endoftext|>']

In [16]:
# Join the byte-level and special tokens back into one readable string.
tokens = tokenizer.convert_ids_to_tokens(ids)
tokenizer.convert_tokens_to_string(tokens)

"<|im_start|>print('我是一只猫<|extra_0|>')\n#喵喵喵<|im_end|><|endoftext|>"

In [17]:
# Look up the token for the last id, len(tokenizer) - 1; the output shows it is an extra special token.
# NOTE(review): _convert_id_to_token is underscore-prefixed (private by convention) —
# consider whether the public convert_ids_to_tokens would serve here; confirm equivalence first.
tokenizer._convert_id_to_token(len(tokenizer)-1)

'<|extra_204|>'

In [18]:
# Reverse lookup: map the surface form of the last extra special token back to its id.
# NOTE(review): _convert_token_to_id is underscore-prefixed (private by convention) —
# confirm whether a public equivalent should be used instead.
tokenizer._convert_token_to_id('<|extra_204|>')

151850