# tokenizer-arena/utils/oov_util.py
import os
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
space_tokens = ["空格 ,两个空格 ,三个空格 ,制表符\t,换行符\n"]
# Sample documents for OOV checks, read one per line from test.txt next to this module.
with open(os.path.join(CURRENT_DIR, "test.txt"), "r", encoding="utf-8") as f:
    docs = [line.strip() for line in f]
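

# Illustrative sketch (not part of the original file): one way the globals above
# might be consumed. `find_oov_tokens` and its `tokenizer` parameter are
# assumptions; any Hugging Face-style tokenizer exposing `tokenize()`,
# `convert_tokens_to_ids()`, and `unk_token_id` would fit this pattern.
def find_oov_tokens(tokenizer, texts=None):
    """Return the texts that the tokenizer maps, at least in part, to its unknown id."""
    texts = texts if texts is not None else space_tokens + docs
    oov = []
    for text in texts:
        token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
        if tokenizer.unk_token_id is not None and tokenizer.unk_token_id in token_ids:
            oov.append(text)
    return oov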