""" |
|
|
|
num_text_tokens: 50000 |
|
num_tokens: 50009 |
|
command_tokens: ['<|endoftext|>', '[SEP]', '[CLS]', '[MASK]', '[UNK]', '<|startofpiece|>', '<|endofpiece|>', '[gMASK]', '[sMASK]'] |
|
|
|
{'pad': 50000, 'eos': 50000, 'sep': 50001, 'ENC': 50002, 'MASK': 50003, 'unk': 50004, 'sop': 50006, 'eop': 50007, 'gMASK': 50007, 'sMASK': 50008} |
|
|
|
padded vocab (size: 50009) with 39 dummy tokens (new size: 50048) |
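
The 39 dummy tokens pad the 50009 real tokens up to 50048. A minimal sketch of
the padding arithmetic, assuming Megatron-style alignment to a multiple of 128
(the multiple is inferred from the numbers above, not read from the code):

    num_tokens = 50009
    multiple = 128                                     # assumed alignment
    padded = ((num_tokens + multiple - 1) // multiple) * multiple
    assert padded == 50048 and padded - num_tokens == 39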
|
|
|
## Note the distinction

12758 ▁凯

45012 凯

These two mean the same thing, yet their ids are completely different; that
looks like a bug.
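
For context, the ▁ prefix is SentencePiece's word-boundary marker: ▁凯 is the
piece for 凯 at the start of a word, while the bare 凯 is the word-internal
piece, so the two are separate vocabulary entries by construction. A quick way
to inspect the split with the raw sentencepiece API (an illustration only,
assuming tokenizer.model is a plain SentencePiece model file; out_type= needs
a reasonably recent sentencepiece release):

    import sentencepiece as spm
    sp = spm.SentencePieceProcessor(model_file="tokenizer.model")
    print(sp.encode("凯旋门", out_type=str))     # first piece likely ▁凯
    print(sp.encode("凯凯凯旋门", out_type=str))  # repeated 凯 may surface as the bare piece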
|
|
|
"""
|
|
|
from tokenization import make_tokenizer

# Build the tokenizer; the positional arguments are tokenizer type, training
# corpus (None here, since an existing model file is loaded), model path,
# vocab size, and model type.
add_sentinel_token = 0
tokenizer = make_tokenizer("ChineseSPTokenizer", None, "tokenizer.model", "50048",
                           None, add_block_symbols=True, cache_dir="cache",
                           add_sentinel_token=add_sentinel_token, add_task_mask=True,
                           add_decoder_mask=False,
                           fix_command_token=False)
|
|
|
text = "凯凯凯旋门位于意大利米兰市古城堡旁。1807年为纪念[MASK]而建,门高25米,顶上矗立两武士青铜古兵车铸像。" |
|
text = " 凯凯凯旋门位于意大利米兰市古城堡旁。1807年为纪念[MASK]而建,门高25米,顶上矗立两武士青铜古兵车铸像。" |
|
|
|
|
|
def bad_case():
    # Bad path: text_tokenizer only knows the SentencePiece vocabulary, so
    # command tokens such as [MASK] get split into ordinary text pieces.
    tokens = tokenizer.text_tokenizer.tokenize(text)
    tokens_id = tokenizer.text_tokenizer.encode(text)
    decode_text = tokenizer.text_tokenizer.decode(tokens_id)
    print(tokens, tokens_id, decode_text, sep="\n")

|
def good_case():
    # Good path: EncodeAsIds recognizes command tokens, so [MASK] maps to its
    # command id (50003 in the map above) instead of being split up.
    tokens_id = tokenizer.EncodeAsIds(text).tokenization
    print(tokens_id)
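

# A hypothetical check (not part of the repo): contrast the two paths on the
# bare "[MASK]" string. get_command(...).Id assumes the usual CommandToken
# API in this codebase; adjust if your checkout differs.
def show_mask_handling():
    print("text_tokenizer:", tokenizer.text_tokenizer.encode("[MASK]"))      # split into text pieces
    print("EncodeAsIds:", tokenizer.EncodeAsIds("[MASK]").tokenization)      # single command id expected
    print("MASK command id:", tokenizer.get_command('MASK').Id)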
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Sample id sequences to decode; each assignment overrides the previous one,
# so only the last list is actually used. 50007 is a command id (eop/gMASK in
# the map above).
tokens_id = [9737, 50007]
tokens_id = [14643, 50007]
tokens_id = [12014, 50007, 44435, 6703, 33751, 992, 16791, 43359, 5101, 44612]

decode_text = tokenizer.DecodeIds(tokens_id)
print(decode_text)
# Show each id next to the piece or command token it maps to.
for idx in tokens_id:
    print(idx, tokenizer.IdToToken(idx))


good_case()