# --- Provenance (copied from the hosting page; kept as comments so the file parses) ---
# xu-song's picture
# update
# 751936e
# raw
# history blame
# 827 Bytes
from .tokenization import ChineseSPTokenizer, make_tokenizer
def DecodeIds(self, Ids, type_token=False):
    """Fault-tolerant decode: delegate to the tokenizer's own ``DecodeIds``.

    Intended to be bound onto ``ChineseSPTokenizer`` as ``decode``. Any
    exception raised while decoding is reported via ``print`` and swallowed,
    and an empty string is returned instead, so a bad id sequence never
    crashes the caller.
    """
    try:
        # ``self.DecodeIds`` resolves to the class's original method, not
        # this wrapper (which is installed under the name ``decode``).
        return self.DecodeIds(Ids, type_token=type_token)
    except Exception as err:
        print("WARNING", Ids, err)
        return ""
# Monkeypatch: install the fault-tolerant wrapper above as the ``decode``
# method, so decoding errors yield "" instead of propagating to callers.
ChineseSPTokenizer.decode = DecodeIds
# 0 => request no additional sentinel tokens from make_tokenizer.
add_sentinel_token = 0
# Build the tokenizer from the local SentencePiece model file.
# NOTE(review): the vocab-size argument is passed as the string "50048" —
# confirm make_tokenizer accepts a str here rather than an int.
tokenizer = make_tokenizer("ChineseSPTokenizer", None, "tokenizer.model", "50048",
None, add_block_symbols=True, cache_dir="cache",
add_sentinel_token=add_sentinel_token, add_task_mask=True,
add_decoder_mask=False,
fix_command_token=False)
# Expose the HuggingFace-style ``vocab_size`` attribute expected by
# downstream code, aliased to this tokenizer's ``num_tokens``.
tokenizer.vocab_size = tokenizer.num_tokens
# vocab_size = len(tokenizer.get_vocab())
# vocab_size = tokenizer.vocab_size