""" | |
num_text_tokens: 50000 | |
num_tokens: 50009 | |
command_tokens: ['<|endoftext|>', '[SEP]', '[CLS]', '[MASK]', '[UNK]', '<|startofpiece|>', '<|endofpiece|>', '[gMASK]', '[sMASK]'] | |
{'pad': 50000, 'eos': 50000, 'sep': 50001, 'ENC': 50002, 'MASK': 50003, 'unk': 50004, 'sop': 50006, 'eop': 50007, 'gMASK': 50007, 'sMASK': 50008} | |
padded vocab (size: 50009) with 39 dummy tokens (new size: 50048) | |
## Note the distinction
12758 ▁凯
45012 凯
These two pieces mean the same thing, yet their ids are completely different; this looks like a bug.
""" | |
from tokenization import make_tokenizer | |
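# Hedged sketch of how the duplicate "凯" pieces noted in the docstring could be
# checked directly. Assumption: "tokenizer.model" is a plain SentencePiece model
# readable by the `sentencepiece` package (not used anywhere else in this script).
def check_duplicate_pieces(model_path="tokenizer.model"):
    import sentencepiece as spm  # assumed to be installed
    sp = spm.SentencePieceProcessor(model_file=model_path)
    # The two pieces differ only by the leading "▁" word-boundary marker, yet they
    # are expected to print two distinct ids (the docstring lists 12758 and 45012).
    for piece in ("▁凯", "凯"):
        print(piece, sp.piece_to_id(piece))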
add_sentinel_token = 0 | |
tokenizer = make_tokenizer("ChineseSPTokenizer", None, "tokenizer.model", "50048",
                           None, add_block_symbols=True, cache_dir="cache",
                           add_sentinel_token=add_sentinel_token, add_task_mask=True,
                           add_decoder_mask=False,
                           fix_command_token=False)
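# Test text containing the [MASK] command token. The second assignment below
# overrides the first, adding a leading space (relevant to the "▁凯" vs "凯"
# pieces noted in the docstring).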
text = "凯凯凯旋门位于意大利米兰市古城堡旁。1807年为纪念[MASK]而建,门高25米,顶上矗立两武士青铜古兵车铸像。" | |
text = " 凯凯凯旋门位于意大利米兰市古城堡旁。1807年为纪念[MASK]而建,门高25米,顶上矗立两武士青铜古兵车铸像。" | |
def bad_case():
    # The underlying text_tokenizer works at the SentencePiece level only, so "[MASK]"
    # appears to be split as plain text rather than mapped to the command-token id 50003.
    tokens = tokenizer.text_tokenizer.tokenize(text)
    tokens_id = tokenizer.text_tokenizer.encode(text)
    decode_text = tokenizer.text_tokenizer.decode(tokens_id)
    print(tokens, tokens_id, decode_text, sep="\n")
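# The top-level tokenizer API (EncodeAsIds / DecodeIds / IdToToken) resolves the
# command tokens listed in the docstring, so [MASK] is encoded to its reserved id
# (50003 in the commented id lists below) instead of being split as plain text.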
def good_case():
    tokens_id = tokenizer.EncodeAsIds(text).tokenization
    # tokens_id = [12758, 44916, 43696, 801, 1435, 5140, 1876, 2669, 50003, 13702, 43359, 44095, 48151, 44321, 44537,
    #              6687, 34043, 44291, 43361, 43362, 26124, 43377, 2366, 50003, 1934, 7004, 44771, 44362, 43795, 5837,
    #              2790, 43359, 43437, 180, 2426, 43388, 14682, 7866, 12845, 43361]
    #
    # tokens_id = [50002, 27318, 43359, 17, 43785, 44326, 277, 14821, 44460, 44773,
    #              43389, 23, 1344, 43383, 43398, 348, 128, 403, 513, 4258,
    #              44460, 44773, 8471, 43689, 43389, 320, 439, 43383, 682, 2222,
    #              14676, 43383, 50007]
    # Each assignment below overrides the previous one; only the last list is decoded.
    tokens_id = [9737, 50007]  # 圣[gMASK]
    tokens_id = [14643, 50007]  # ▁这款[gMASK]
    tokens_id = [12014, 50007, 44435, 6703, 33751, 992, 16791, 43359, 5101, 44612]
    decode_text = tokenizer.DecodeIds(tokens_id)
    print(decode_text)
    for idx in tokens_id:
        print(idx, tokenizer.IdToToken(idx))
    # decode_text = tokenizer.DecodeIds(tokens_id)
# bad_case()
good_case() | |
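# check_duplicate_pieces()  # optional: print the ids of the duplicate "凯" pieces (sketch above)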