File size: 505 Bytes
751936e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 |
"""
Awesome — no OOV.

Round-trips sample text through the 20B tokenizer to show that every line
decodes back to the original (i.e. there are no out-of-vocabulary losses).
"""
from tokenizers import Tokenizer
from data_sample.oov_base import space_tokens, jd_vocab_tokens, docs
# Load the serialized tokenizer definition from the working directory.
tokenizer = Tokenizer.from_file("20B_tokenizer.json")
def test_oov():
    """Round-trip every sample line through the tokenizer.

    Encodes each line, decodes the resulting ids, and prints the original
    and decoded text for any line that does not survive the round trip
    (a potential OOV / lossy case). Prints nothing when all lines match.
    """
    samples = space_tokens + jd_vocab_tokens + docs
    for text in samples:
        encoding = tokenizer.encode(text)
        roundtripped = tokenizer.decode(encoding.ids)
        if text == roundtripped:
            continue
        print("原句:", text)
        print("解码:", roundtripped)
if __name__ == "__main__":
    # Run the OOV round-trip check when executed as a script.
    # (Removed a stray trailing "|" artifact that made this line a SyntaxError.)
    test_oov()