"""Monkey-patch ``sentencepiece.SentencePieceProcessor`` with a Hugging Face style API.

## adapt to transformer tokenizer

https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/tokenization_utils.py#L379

## usage

- grok

## Risk assessment

- May interfere with normal use of sentencepiece.SentencePieceProcessor; for
  example, .vocab_size was originally a method and becomes a property after
  the patch.

## TODO

Use a wrapper instead of a patch. Common tokenizers are usually wrappers
around sentencepiece.
"""

import sentencepiece


@property
def vocab_size(self):
    """Return the vocabulary size, as reported by ``self.get_piece_size()``."""
    return self.get_piece_size()


def get_vocab(self):
    """Return the vocabulary as a ``{token: id}`` dict.

    Built by calling ``self.convert_ids_to_tokens`` for every id in
    ``range(self.vocab_size)``.
    """
    vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
    # vocab.update(self.added_tokens_encoder)
    return vocab


def _tokenize(self, text):
    """Tokenize ``text`` by calling ``self.encode(text, out_type=str)``."""
    return self.encode(text, out_type=str)


def _convert_token_to_id(self, token):
    """Convert a token (str) to an id via ``self.piece_to_id``."""
    return self.piece_to_id(token)


def _convert_id_to_token(self, index):
    """Convert an index (integer) to a token (str) via ``self.IdToPiece``."""
    return self.IdToPiece(index)


def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
    """Convert a single id or a sequence of ids to token(s).

    Adapted from ``transformers.PreTrainedTokenizer.convert_ids_to_tokens``.

    Args:
        ids (`int` or `List[int]`):
            The token id (or token ids) to convert to tokens. Elements of a
            sequence are passed through ``int()`` before lookup.
        skip_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether to drop ids listed in ``self.all_special_ids`` when
            ``ids`` is a sequence. If the object has no ``all_special_ids``
            attribute, nothing is skipped.

    Returns:
        `str` or `List[str]`: The decoded token(s).
    """
    # Reset on every call, so the added-token lookups below always miss and
    # every id is resolved through ``_convert_id_to_token``.
    self._added_tokens_decoder = {}  # add by xs
    added_tokens = self._added_tokens_decoder

    if isinstance(ids, int):
        if ids in added_tokens:
            return added_tokens[ids].content
        return self._convert_id_to_token(ids)

    # Nothing in this module provides ``all_special_ids``; look it up
    # defensively so ``skip_special_tokens=True`` does not raise
    # AttributeError when the attribute is absent.
    if skip_special_tokens:
        special_ids = frozenset(getattr(self, "all_special_ids", ()))
    else:
        special_ids = frozenset()

    tokens = []
    for index in ids:
        index = int(index)
        if index in special_ids:
            continue
        if index in added_tokens:
            tokens.append(added_tokens[index].content)
        else:
            tokens.append(self._convert_id_to_token(index))
    return tokens


def encode(self, *args, **kwargs):
    """Encode via ``self.Encode``, ignoring Hugging Face style keywords.

    ``add_special_tokens`` is accepted for compatibility with hf_tokenizer;
    it and ``allowed_special`` are removed from ``kwargs`` before the
    remaining arguments are forwarded to ``self.Encode``.
    """
    kwargs.pop("add_special_tokens", None)
    kwargs.pop("allowed_special", None)
    return self.Encode(*args, **kwargs)


def decode(self, *args, **kwargs):
    """Decode via ``self.Decode``, ignoring ``skip_special_tokens``.

    The ``skip_special_tokens`` keyword is removed from ``kwargs`` before
    the remaining arguments are forwarded to ``self.Decode``.
    """
    kwargs.pop("skip_special_tokens", None)
    return self.Decode(*args, **kwargs)


# Importing this module installs the attributes below on the class itself.
sentencepiece.SentencePieceProcessor.vocab_size = vocab_size
# sentencepiece.SentencePieceProcessor.get_vocab = get_vocab
sentencepiece.SentencePieceProcessor._convert_id_to_token = _convert_id_to_token
sentencepiece.SentencePieceProcessor.convert_ids_to_tokens = convert_ids_to_tokens
# sentencepiece.SentencePieceProcessor.tokenize = _tokenize
sentencepiece.SentencePieceProcessor.encode = encode
sentencepiece.SentencePieceProcessor.decode = decode