"""
超牛逼,没有OOV
"""

from tokenizers import Tokenizer
from data_sample.oov_base import space_tokens, jd_vocab_tokens, docs

tokenizer = Tokenizer.from_file("20B_tokenizer.json")
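# `Tokenizer.encode` returns an `Encoding`; its `.ids` are the token ids that
# `Tokenizer.decode` converts back to text. 20B_tokenizer.json is presumably a
# byte-level BPE vocabulary (GPT-NeoX style), so any Unicode input should
# round-trip exactly, with no OOV fallback.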

def test_oov():
    """Encode then decode every sample; report any string that fails to round-trip."""
    for line in space_tokens + jd_vocab_tokens + docs:
        tokens = tokenizer.encode(line)
        decoded_line = tokenizer.decode(tokens.ids)
        if line != decoded_line:
            print("original:", line)
            print("decoded:", decoded_line)


if __name__ == "__main__":
    test_oov()
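
# Usage sketch (assumes 20B_tokenizer.json and the data_sample package are in
# the working directory; the script name below is hypothetical):
#
#   python test_oov.py
#
# No output means every sample decoded back to its original text.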