"""
Wrap sentencepiece.SentencePieceProcessor so that it conforms to the
tokenizer interface used by transformers.

## reference

## usage
- grok
"""

import sentencepiece as spm
from transformers import PreTrainedTokenizer


class SPTokenizerWrapper(PreTrainedTokenizer):
    """Expose a SentencePiece model through the ``PreTrainedTokenizer`` API.

    ## impl in PreTrainedTokenizer
    - convert_ids_to_tokens
    """

    def __init__(self, vocab_file, **kwargs):
        """Load the SentencePiece model stored at ``vocab_file``.

        Args:
            vocab_file: Path to the serialized SentencePiece model.
            **kwargs: Extra options forwarded unchanged to
                ``PreTrainedTokenizer.__init__``. Omitting them reproduces
                the previous behaviour.
        """
        self.vocab_file = vocab_file
        # The model is loaded before the base initialiser runs so that the
        # vocabulary hooks below are usable if the base class calls them
        # during its own setup.
        self.sp_model = spm.SentencePieceProcessor(self.vocab_file)
        super().__init__(**kwargs)

    @property
    def vocab_size(self):
        """Return the number of pieces in the SentencePiece model."""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Return the vocabulary as a ``{token: id}`` dict.

        The mapping holds every SentencePiece piece plus any tokens
        registered on this tokenizer afterwards. Added tokens are merged in
        so that code which sizes the vocabulary from this dict does not hand
        out an id that is already in use.
        """
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _convert_token_to_id(self, token):
        """Convert a token (str) to an id using the SentencePiece vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Convert an index (int) to a token (str) using the SentencePiece vocab."""
        return self.sp_model.id_to_piece(index)

    # NOTE: convert_ids_to_tokens(ids, skip_special_tokens=False) is not
    # redefined here; the implementation in PreTrainedTokenizer is used.

    def encode(self, *args, **kwargs):
        """Encode input by delegating directly to the SentencePiece model.

        ``add_special_tokens`` and ``allowed_special`` are accepted for call
        compatibility and discarded; every other argument is passed through
        to ``SentencePieceProcessor.Encode`` unchanged.
        """
        kwargs.pop("add_special_tokens", None)
        kwargs.pop("allowed_special", None)
        return self.sp_model.Encode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Decode ids by delegating directly to the SentencePiece model.

        ``skip_special_tokens`` is accepted for call compatibility and
        discarded; every other argument is passed through to
        ``SentencePieceProcessor.Decode`` unchanged.
        """
        kwargs.pop("skip_special_tokens", None)
        return self.sp_model.Decode(*args, **kwargs)