"""Round-trip OOV check: every sample line should survive encode -> decode.

(Original note: "超牛逼,没有OOV" — roughly "awesome, no OOV".)
"""
from tokenizers import Tokenizer

from data_sample.oov_base import space_tokens, jd_vocab_tokens, docs

# Loaded once at import time; reused by test_oov below.
tokenizer = Tokenizer.from_file("20B_tokenizer.json")


def test_oov():
    """Print every sample that does not survive an encode/decode round trip."""
    for sample in space_tokens + jd_vocab_tokens + docs:
        encoding = tokenizer.encode(sample)
        restored = tokenizer.decode(encoding.ids)
        if sample == restored:
            continue
        print("原句:", sample)
        print("解码:", restored)


if __name__ == "__main__":
    test_oov()