""" | |
超牛逼,没有OOV | |
""" | |
from tokenizers import Tokenizer

from data_sample.oov_base import space_tokens, jd_vocab_tokens, docs
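
# Load the serialized tokenizer. "20B_tokenizer.json" is presumably the
# GPT-NeoX-20B byte-level BPE tokenizer; byte-level fallback is what makes
# OOV impossible here.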
tokenizer = Tokenizer.from_file("20B_tokenizer.json")


def test_oov():
    """Encode then decode every sample line; report any round-trip mismatch."""
    for line in space_tokens + jd_vocab_tokens + docs:
        tokens = tokenizer.encode(line)
        decode_line = tokenizer.decode(tokens.ids)
        if line != decode_line:
            print("original:", line)
            print("decoded:", decode_line)


if __name__ == "__main__":
    test_oov()
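
# Usage sketch, assuming data_sample/oov_base.py and 20B_tokenizer.json sit
# next to this script: run it directly with Python. No output means every
# line survived the encode/decode round trip, i.e. no OOV.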