"""

num_text_tokens: 50000
num_tokens: 50009
command_tokens: ['<|endoftext|>', '[SEP]', '[CLS]', '[MASK]', '[UNK]', '<|startofpiece|>', '<|endofpiece|>', '[gMASK]', '[sMASK]']

{'pad': 50000, 'eos': 50000, 'sep': 50001, 'ENC': 50002, 'MASK': 50003, 'unk': 50004, 'sop': 50006, 'eop': 50007, 'gMASK': 50007, 'sMASK': 50008}

padded vocab (size: 50009) with 39 dummy tokens (new size: 50048)

## Note the distinction
12758 ▁凯
45012 凯

These two have the same meaning, yet their ids are completely different; is this a bug?

## ss
"""

from tokenization import make_tokenizer

# Build the ChineseSPTokenizer from the local "tokenizer.model", adding the block
# symbols (sop/eop) and the task masks ([gMASK]/[sMASK]) listed in the docstring above.
add_sentinel_token = 0
tokenizer = make_tokenizer("ChineseSPTokenizer", None, "tokenizer.model", "50048",
                           None, add_block_symbols=True, cache_dir="cache",
                           add_sentinel_token=add_sentinel_token, add_task_mask=True,
                           add_decoder_mask=False,
                           fix_command_token=False)

text = "凯凯凯旋门位于意大利米兰市古城堡旁。1807年为纪念[MASK]而建,门高25米,顶上矗立两武士青铜古兵车铸像。"
text = " 凯凯凯旋门位于意大利米兰市古城堡旁。1807年为纪念[MASK]而建,门高25米,顶上矗立两武士青铜古兵车铸像。"


def bad_case():
    # Go through the underlying text tokenizer directly, without the wrapper's
    # command-token handling, and print what it produces.
    tokens = tokenizer.text_tokenizer.tokenize(text)
    tokens_id = tokenizer.text_tokenizer.encode(text)
    decode_text = tokenizer.text_tokenizer.decode(tokens_id)
    print(tokens, tokens_id, decode_text, sep="\n")


def good_case():
    # Go through the full tokenizer wrapper; EncodeAsIds returns a Tokenization
    # object whose .tokenization field holds the ids.
    tokens_id = tokenizer.EncodeAsIds(text).tokenization
    # tokens_id = [12758, 44916, 43696, 801, 1435, 5140, 1876, 2669, 50003, 13702, 43359, 44095, 48151, 44321, 44537,
    #              6687, 34043, 44291, 43361, 43362, 26124, 43377, 2366, 50003, 1934, 7004, 44771, 44362, 43795, 5837,
    #              2790, 43359, 43437, 180, 2426, 43388, 14682, 7866, 12845, 43361]
    #
    # tokens_id = [50002, 27318, 43359,    17, 43785, 44326,   277, 14821, 44460, 44773,
    #      43389,    23,  1344, 43383, 43398,   348,   128,   403,   513,  4258,
    #      44460, 44773,  8471, 43689, 43389,   320,   439, 43383,   682,  2222,
    #      14676, 43383, 50007]
    # Each assignment below overwrites the previous one; only the last list of
    # ids is actually decoded.
    tokens_id = [9737, 50007]  # 圣[gMASK]
    tokens_id = [14643, 50007]  # ▁这款[gMASK]
    tokens_id = [12014, 50007, 44435, 6703, 33751, 992, 16791, 43359, 5101, 44612]

    decode_text = tokenizer.DecodeIds(tokens_id)
    print(decode_text)
    for idx in tokens_id:
        print(idx, tokenizer.IdToToken(idx))


# bad_case()
good_case()
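

# Optional follow-up sketch (reuses only the wrapper calls above): encode a short
# sample with and without a leading space through the full wrapper and print the
# first few ids with their pieces, to see directly whether the leading "凯" comes
# out as id 12758 ("▁凯") or 45012 ("凯").
def compare_leading_space():
    for sample in ["凯凯凯旋门位于意大利米兰市古城堡旁。",
                   " 凯凯凯旋门位于意大利米兰市古城堡旁。"]:
        ids = tokenizer.EncodeAsIds(sample).tokenization
        print(repr(sample[:3]), [(i, tokenizer.IdToToken(i)) for i in ids[:4]])

# compare_leading_space()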