eson's picture
update
751936e
"""
num_text_tokens: 50000
num_tokens: 50009
command_tokens: ['<|endoftext|>', '[SEP]', '[CLS]', '[MASK]', '[UNK]', '<|startofpiece|>', '<|endofpiece|>', '[gMASK]', '[sMASK]']
{'pad': 50000, 'eos': 50000, 'sep': 50001, 'ENC': 50002, 'MASK': 50003, 'unk': 50004, 'sop': 50006, 'eop': 50007, 'gMASK': 50007, 'sMASK': 50008}
padded vocab (size: 50009) with 39 dummy tokens (new size: 50048)
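## Note the distinction
12758 ▁凯
45012 凯
These two tokens carry the same meaning, yet their ids are completely different; this looks like a bug. (▁ is presumably the SentencePiece word-start marker, which would explain the two separate vocabulary entries.)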
## ss
"""
from tokenization import make_tokenizer
# Number of sentinel tokens requested from the tokenizer factory (none here).
add_sentinel_token = 0

# Build the tokenizer under test. The positional arguments are forwarded
# exactly as before; their meaning is defined by make_tokenizer.
# NOTE(review): "50048" presumably is the padded vocabulary size mentioned in
# the module docstring -- confirm against make_tokenizer.
tokenizer = make_tokenizer(
    "ChineseSPTokenizer",
    None,
    "tokenizer.model",
    "50048",
    None,
    add_block_symbols=True,
    cache_dir="cache",
    add_sentinel_token=add_sentinel_token,
    add_task_mask=True,
    add_decoder_mask=False,
    fix_command_token=False,
)

# Sample sentence without a leading space. It is superseded by the next
# assignment and kept only as the alternative input variant.
text = "凯凯凯旋门位于意大利米兰市古城堡旁。1807年为纪念[MASK]而建,门高25米,顶上矗立两武士青铜古兵车铸像。"

# Same sentence with a leading space; this is the value the demos below read.
text = " 凯凯凯旋门位于意大利米兰市古城堡旁。1807年为纪念[MASK]而建,门高25米,顶上矗立两武士青铜古兵车铸像。"
def bad_casse():
    """Run the sample text through the raw text tokenizer only.

    Calls ``tokenize``, ``encode`` and ``decode`` on
    ``tokenizer.text_tokenizer`` directly instead of going through the
    wrapper's ``EncodeAsIds``/``DecodeIds`` used by ``good_case``.

    NOTE(review): presumably this is the "bad" path because the raw text
    tokenizer does not treat command tokens such as ``[MASK]`` specially --
    confirm against the tokenizer implementation.

    Returns:
        A ``(tokens, tokens_id, decode_text)`` tuple: the token pieces, the
        encoded ids, and the text decoded back from those ids. These values
        used to be computed and silently dropped, which made the function
        impossible to inspect outside a debugger.
    """
    raw_tokenizer = tokenizer.text_tokenizer
    tokens = raw_tokenizer.tokenize(text)
    tokens_id = raw_tokenizer.encode(text)
    # Round-trip the ids so the caller can compare against the input text.
    decode_text = raw_tokenizer.decode(tokens_id)
    return tokens, tokens_id, decode_text
def good_case():
    """Decode a hand-picked id sequence with the full tokenizer and dump it.

    Prints the decoded string first, then one ``<id> <token>`` line for every
    id in the sequence. Returns nothing.
    """
    # The sample text is still encoded through the wrapper, as before, but the
    # resulting ids are not used: a fixed sequence is inspected instead.
    _ = tokenizer.EncodeAsIds(text).tokenization

    # Id sequences recorded during earlier experiments, kept for reference:
    # [12758, 44916, 43696, 801, 1435, 5140, 1876, 2669, 50003, 13702, 43359, 44095, 48151, 44321, 44537,
    #  6687, 34043, 44291, 43361, 43362, 26124, 43377, 2366, 50003, 1934, 7004, 44771, 44362, 43795, 5837,
    #  2790, 43359, 43437, 180, 2426, 43388, 14682, 7866, 12845, 43361]
    #
    # [50002, 27318, 43359, 17, 43785, 44326, 277, 14821, 44460, 44773,
    #  43389, 23, 1344, 43383, 43398, 348, 128, 403, 513, 4258,
    #  44460, 44773, 8471, 43689, 43389, 320, 439, 43383, 682, 2222,
    #  14676, 43383, 50007]

    candidate_ids = [
        [9737, 50007],  # annotated by the author as: 圣[gMASK]
        [14643, 50007],  # annotated by the author as: ▁这款[gMASK]
        [12014, 50007, 44435, 6703, 33751, 992, 16791, 43359, 5101, 44612],
    ]
    # Only the last candidate is active; the earlier ones are alternatives
    # that were tried before it.
    sample_ids = candidate_ids[-1]

    print(tokenizer.DecodeIds(sample_ids))
    for token_id in sample_ids:
        print(token_id, tokenizer.IdToToken(token_id))
# Script entry: run the full-tokenizer demo when the file is executed.
# Uncomment the next line to also exercise the raw text-tokenizer path.
# bad_casse()
good_case()