""" 最简单的tokenizer """ import json from tokenizers import Tokenizer tokenizer = Tokenizer.from_file("20B_tokenizer.json") print("vocab_size with added_tokens:", tokenizer.get_vocab_size(with_added_tokens=True)) print("vocab_size without added_tokens:", tokenizer.get_vocab_size(with_added_tokens=False)) vocab = tokenizer.get_vocab() def test_single_token(): """ 单个字符的编码(一个字符可能会编码成多个id) """ for word in "发大厦三分赛中国解决方法黑白侗鸩,。!?;": encoding = tokenizer.encode(word) for token_id in encoding.ids: decode_str = tokenizer.decode([token_id]) # 特殊字符解码后会统一变成 �,对应 "\ufffd" token = tokenizer.id_to_token(token_id) print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token)) def test_long_token(): """ """ words = [ "//----------------------------------------------------------------", # 代码里有 "--------------------------", "-------------------------", "-----------------------", ] for word in words: encoding = tokenizer.encode(word) for token_id in encoding.ids: decode_str = tokenizer.decode([token_id]) # token = tokenizer.id_to_token(token_id) print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token)) def test_encode(): text = "中国解决方法黑白侗鸩,。!?;一个人去哪里 一 个" encoding = tokenizer.encode(text) for token_id in encoding.ids: decode_str = tokenizer.decode([token_id]) # 特殊字符解码后会统一变成 �,对应 "\ufffd" token = tokenizer.id_to_token(token_id) print(token_id, decode_str, json.dumps(decode_str), token, json.dumps(token)) test_single_token() # test_long_token() # test_encode()