Source code for transformers.models.pegasus.tokenization_pegasus

# coding=utf-8
# Copyright 2020 Google and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, List, Optional

from ...file_utils import add_start_docstrings
from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding
from ..reformer.tokenization_reformer import ReformerTokenizer


SPIECE_UNDERLINE = "▁"

VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {"google/pegasus-xsum": "https://cdn.huggingface.co/google/pegasus-xsum/spiece.model"}
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "google/pegasus-xsum": 512,
}
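
# A minimal loading sketch (assuming network access or a local cache of the
# "google/pegasus-xsum" checkpoint): ``from_pretrained`` resolves ``spiece.model``
# through PRETRAINED_VOCAB_FILES_MAP and caps ``model_max_length`` at the 512
# positions registered in PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES, e.g.:
#
#     tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
#     print(tokenizer.model_max_length)  # 512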


class PegasusTokenizer(ReformerTokenizer):
    r"""
    Construct a Pegasus tokenizer. :class:`~transformers.PegasusTokenizer` is identical to
    :class:`~transformers.ReformerTokenizer` and adds a new
    :meth:`~transformers.PegasusTokenizer.prepare_seq2seq_batch` method.

    Refer to superclass :class:`~transformers.ReformerTokenizer` for usage examples and documentation concerning the
    initialization parameters and other methods.
    """

    offset = 103  # entries 2-104 are only used for pretraining
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, *args, pad_token="<pad>", **kwargs):
        super().__init__(*args, **kwargs, pad_token="<pad>")
        # Don't use reserved words added_token_encoder, added_tokens_decoder because of
        # AssertionError: Non-consecutive added token '1' found. in from_pretrained
        assert len(self.added_tokens_decoder) == 0
        self.encoder: Dict[int, str] = {0: self.pad_token, 1: self.eos_token}
        # entries 2-104 are only used for pretraining and called unk_2, ...unk_104
        self.encoder.update({i: f"unk_{i}" for i in range(2, self.offset + 2)})
        self.decoder: Dict[str, int] = {v: k for k, v in self.encoder.items()}

    def _convert_token_to_id(self, token: str) -> int:
        """Converts a token (str) to an id using the vocab."""
        if token in self.decoder:
            return self.decoder[token]
        elif token in self.added_tokens_decoder:
            return self.added_tokens_decoder[token]
        sp_id = self.sp_model.piece_to_id(token)
        return sp_id + self.offset

    def _convert_id_to_token(self, index: int) -> str:
        """Converts an index (integer) to a token (str) using the vocab."""
        if index in self.encoder:
            return self.encoder[index]
        elif index in self.added_tokens_encoder:
            return self.added_tokens_encoder[index]
        else:
            # assert index > self.offset, f"cannot decode ids between 2 and {self.offset}. Got {index}"
            token = self.sp_model.IdToPiece(index - self.offset)
        return token

    @property
    def vocab_size(self) -> int:
        return len(self.sp_model) + self.offset

    def num_special_tokens_to_add(self, pair=False):
        """Just EOS"""
        return 1

    def _special_token_mask(self, seq):
        all_special_ids = set(self.all_special_ids)  # call it once instead of inside list comp
        all_special_ids.remove(self.unk_token_id)  # <unk> is only sometimes special
        assert all_special_ids == set([0, 1])
        return [1 if x in all_special_ids else 0 for x in seq]

    def get_special_tokens_mask(
        self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """Get list where entries are [1] if a token is [eos] or [pad] else 0."""
        if already_has_special_tokens:
            return self._special_token_mask(token_ids_0)
        elif token_ids_1 is None:
            return self._special_token_mask(token_ids_0) + [1]
        else:
            return self._special_token_mask(token_ids_0 + token_ids_1) + [1]

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A Pegasus sequence has the following format, where ``X`` represents the sequence:

        - single sequence: ``X </s>``
        - pair of sequences: ``A B </s>`` (not intended use)

        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
        separator.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return token_ids_0 + token_ids_1 + [self.eos_token_id]

    @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING)
    def prepare_seq2seq_batch(
        self,
        src_texts: List[str],
        tgt_texts: Optional[List[str]] = None,
        max_length: Optional[int] = None,
        max_target_length: Optional[int] = None,
        return_tensors: str = None,
        truncation=True,
        padding="longest",
        **unused,
    ) -> BatchEncoding:
        if "" in src_texts:
            raise ValueError(f"found empty string in src_texts: {src_texts}")
        tokenizer_kwargs = dict(
            add_special_tokens=True,
            return_tensors=return_tensors,
            max_length=max_length,
            truncation=truncation,
            padding=padding,
        )
        model_inputs: BatchEncoding = self(src_texts, **tokenizer_kwargs)
        if tgt_texts is None:
            return model_inputs
        if max_target_length is not None:
            tokenizer_kwargs["max_length"] = max_target_length
        labels: BatchEncoding = self(tgt_texts, **tokenizer_kwargs)["input_ids"]
        model_inputs["labels"] = labels
        return model_inputs
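

# Illustrative usage sketch. The id space built above is laid out as
# 0 = <pad>, 1 = </s>, 2..104 = reserved pretraining tokens (unk_2 .. unk_104),
# with every SentencePiece id shifted up by ``offset`` (103). The sketch assumes
# the "google/pegasus-xsum" checkpoint can be downloaded (or is cached locally)
# and that PyTorch is installed for ``return_tensors="pt"``.
if __name__ == "__main__":
    tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

    # Special ids come from the hand-built encoder/decoder, not from SentencePiece.
    print(tokenizer.pad_token_id)  # 0
    print(tokenizer.eos_token_id)  # 1

    # Ordinary pieces map to their SentencePiece id plus the offset of 103.
    piece = "▁the"  # unknown pieces simply map to SentencePiece's <unk> id
    sp_id = tokenizer.sp_model.piece_to_id(piece)
    assert tokenizer.convert_tokens_to_ids(piece) == sp_id + tokenizer.offset

    # prepare_seq2seq_batch tokenizes the sources (and optionally the targets),
    # appends </s> to every sequence, and pads to the longest example.
    batch = tokenizer.prepare_seq2seq_batch(
        src_texts=["PEGASUS was pretrained with gap-sentence generation."],
        tgt_texts=["A short summary."],
        return_tensors="pt",
    )
    print(batch["input_ids"].shape, batch["labels"].shape)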