"""Visual round-trip check of a tokenizer against sample texts.

Every sample imported from ``data_sample.oov_base`` is encoded and decoded
with the tokenizer loaded from ``"tokenizer"``; samples whose decoded text
differs from the input are written to a side-by-side HTML diff report.
"""
import difflib
import webbrowser
from pathlib import Path

from transformers import AutoTokenizer

from data_sample.oov_base import space_tokens, jd_vocab_tokens, docs

# Loaded once at import time from the "tokenizer" name/path.
tokenizer = AutoTokenizer.from_pretrained("tokenizer")


def test_oov():
    """Write ``diff.html`` with every sample that fails to round-trip.

    Each item of ``space_tokens + jd_vocab_tokens + docs`` is passed through
    ``tokenizer.encode`` followed by ``tokenizer.decode``.  Items whose
    decoded text is not equal to the original are collected, rendered as an
    HTML side-by-side diff (original on the left, decoded on the right),
    saved to ``diff.html`` in the current working directory and opened in
    the default web browser.

    The report is written and opened even when no sample differs.
    """
    differ = difflib.HtmlDiff(wrapcolumn=50)
    raw_lines = []
    decode_lines = []
    for line in space_tokens + jd_vocab_tokens + docs:
        # NOTE(review): encode() is called with its defaults; if this
        # tokenizer adds special tokens by default they would show up in the
        # decoded text and flag the line as different -- confirm intended.
        decode_line = tokenizer.decode(tokenizer.encode(line))
        if line != decode_line:
            raw_lines.append(line)
            decode_lines.append(decode_line)

    report_path = Path('diff.html')
    with report_path.open('w', encoding="utf-8") as f_new:
        f_new.write(differ.make_file(raw_lines, decode_lines))

    # webbrowser.open() is only documented to work with URLs; passing a bare
    # relative filename is neither supported nor portable, so hand it an
    # absolute file:// URI instead.
    webbrowser.open(report_path.resolve().as_uri())


if __name__ == "__main__":
    test_oov()