"""Tokenizer round-trip (OOV) check.

Encodes then decodes a set of sample lines with the project tokenizer and
writes an HTML side-by-side diff (``diff.html``) of every line that does not
survive the round trip unchanged.
"""
# Standard library.
import difflib
import webbrowser

# Third-party.
from transformers import AutoTokenizer

# Project-local sample data: token lists and documents used to probe
# out-of-vocabulary / lossy-round-trip behavior.
from data_sample.oov_base import space_tokens, jd_vocab_tokens, docs

# Load the tokenizer from the local "tokenizer" directory.
tokenizer = AutoTokenizer.from_pretrained("tokenizer")
def test_oov():
    """Find lines the tokenizer cannot round-trip and render an HTML diff.

    Each sample line is encoded and decoded with the module-level
    ``tokenizer``; lines whose decoded form differs from the original are
    collected and written as a side-by-side diff to ``diff.html``, which is
    then opened in the default browser.
    """
    # wrapcolumn keeps long lines readable in the side-by-side view.
    differ = difflib.HtmlDiff(wrapcolumn=50)
    raw_lines = []
    decode_lines = []
    for line in space_tokens + jd_vocab_tokens + docs:
        token_ids = tokenizer.encode(line)
        decoded = tokenizer.decode(token_ids)
        # Only mismatched lines are interesting — a perfect round trip
        # means the tokenizer preserved the text exactly.
        if line != decoded:
            raw_lines.append(line)
            decode_lines.append(decoded)
    html = differ.make_file(raw_lines, decode_lines)
    with open('diff.html', 'w', encoding="utf-8") as f_new:
        f_new.write(html)
    webbrowser.open('diff.html')
# Script entry point: run the round-trip check directly.
if __name__ == "__main__":
    test_oov()