""" num_text_tokens: 50000 num_tokens: 50009 command_tokens: ['<|endoftext|>', '[SEP]', '[CLS]', '[MASK]', '[UNK]', '<|startofpiece|>', '<|endofpiece|>', '[gMASK]', '[sMASK]'] {'pad': 50000, 'eos': 50000, 'sep': 50001, 'ENC': 50002, 'MASK': 50003, 'unk': 50004, 'sop': 50006, 'eop': 50007, 'gMASK': 50007, 'sMASK': 50008} padded vocab (size: 50009) with 39 dummy tokens (new size: 50048) ## 注意区分 12758 ▁凯 45012 凯 这两个语义一样,id却完全不同,是bug吧。 ## ss """ from tokenization import make_tokenizer add_sentinel_token = 0 tokenizer = make_tokenizer("ChineseSPTokenizer", None, "tokenizer.model", "50048", None, add_block_symbols=True, cache_dir="cache", add_sentinel_token=add_sentinel_token, add_task_mask=True, add_decoder_mask=False, fix_command_token=False) text = "凯凯凯旋门位于意大利米兰市古城堡旁。1807年为纪念[MASK]而建,门高25米,顶上矗立两武士青铜古兵车铸像。" text = " 凯凯凯旋门位于意大利米兰市古城堡旁。1807年为纪念[MASK]而建,门高25米,顶上矗立两武士青铜古兵车铸像。" def bad_casse(): tokens = tokenizer.text_tokenizer.tokenize(text) tokens_id = tokenizer.text_tokenizer.encode(text) decode_text = tokenizer.text_tokenizer.decode(tokens_id) def good_case(): tokens_id = tokenizer.EncodeAsIds(text).tokenization # tokens_id = [12758, 44916, 43696, 801, 1435, 5140, 1876, 2669, 50003, 13702, 43359, 44095, 48151, 44321, 44537, # 6687, 34043, 44291, 43361, 43362, 26124, 43377, 2366, 50003, 1934, 7004, 44771, 44362, 43795, 5837, # 2790, 43359, 43437, 180, 2426, 43388, 14682, 7866, 12845, 43361] # # tokens_id = [50002, 27318, 43359, 17, 43785, 44326, 277, 14821, 44460, 44773, # 43389, 23, 1344, 43383, 43398, 348, 128, 403, 513, 4258, # 44460, 44773, 8471, 43689, 43389, 320, 439, 43383, 682, 2222, # 14676, 43383, 50007] tokens_id = [9737, 50007] # 圣[gMASK] tokens_id = [14643, 50007] # ▁这款[gMASK] tokens_id = [12014, 50007, 44435, 6703, 33751, 992, 16791, 43359, 5101, 44612] decode_text = tokenizer.DecodeIds(tokens_id) print(decode_text) for idx in tokens_id: print(idx, tokenizer.IdToToken(idx)) # decode_text = tokenizer.DecodeIds(tokens_id) # bad_casse() good_case()