singletongue committed on
Commit
2601e53
1 Parent(s): 380f2db

Update tokenizer

Files changed (3)
  1. README.md +22 -1
  2. entity_vocab.json +2 -2
  3. tokenization_luke_bert_japanese.py +412 -44
README.md CHANGED
@@ -13,4 +13,25 @@ tags:
13
  - Pretrained on Japanese Wikipedia data as of July 1, 2023
14
  - Added support for the `[UNK]` (unknown) entity
15
 
16
- For details, please see the [blog post](https://tech.uzabase.com/entry/2023/09/07/172958).
13
  - Pretrained on Japanese Wikipedia data as of July 1, 2023
14
  - Added support for the `[UNK]` (unknown) entity
15
 
16
+ For details, please see the [blog post](https://tech.uzabase.com/entry/2023/09/07/172958).
17
+
18
+ ## Usage
19
+
20
+ ```python
21
+ from transformers import AutoTokenizer, AutoModel
22
+
23
+ # trust_remote_code=True is required in order to use the custom tokenizer code for this model
24
+ tokenizer = AutoTokenizer.from_pretrained("uzabase/luke-japanese-wordpiece-base", trust_remote_code=True)
25
+
26
+ model = AutoModel.from_pretrained("uzabase/luke-japanese-wordpiece-base")
27
+ ```
28
+
29
+ ## Update history
30
+
31
+ - **2023/11/28:** Made the following updates.
32
+   - Fixed an issue where the tokenizer could not be loaded with transformers v4.34.0 and later.
33
+   - Changed the tokenizer to include `position_ids` in its output.
34
+     - Previously, the `position_ids` [automatically generated](https://github.com/huggingface/transformers/blob/v4.35.2/src/transformers/models/luke/modeling_luke.py#L424) by the LUKE model were used, but these follow the RoBERTa convention and were not correct for this BERT-based model. The tokenizer therefore now includes `position_ids` explicitly in its output, so that the model receives the correct BERT-style values.
35
+   - Removed the `"None:"` prefix from each token in the tokenizer's `entity_vocab` (except special tokens such as `"[PAD]"`).
36
+     - For example, the token that used to be `"None:聖徳太子"` is now `"聖徳太子"`.
37
+ - **2023/09/07:** Released the model.
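
A minimal sketch of what the 2023/11/28 update notes above mean in practice, assuming the public model id from the README, network access to download it, and an illustrative input sentence that is not taken from the repository:

```python
from transformers import AutoTokenizer

# trust_remote_code=True is needed to load the custom tokenizer shipped with the model.
tokenizer = AutoTokenizer.from_pretrained(
    "uzabase/luke-japanese-wordpiece-base", trust_remote_code=True
)

encoding = tokenizer("聖徳太子は飛鳥時代の皇族です。")

# The tokenizer now emits BERT-style position_ids (0, 1, 2, ...) itself,
# instead of relying on the ids the LUKE model would create on its own.
assert "position_ids" in encoding
assert encoding["position_ids"] == list(range(len(encoding["input_ids"])))

# entity_vocab entries no longer carry the "None:" prefix.
assert "聖徳太子" in tokenizer.entity_vocab
assert "None:聖徳太子" not in tokenizer.entity_vocab
```

Emitting `position_ids` from the tokenizer sidesteps the RoBERTa-style position ids that `modeling_luke.py` would otherwise construct, which are offset by the padding index and so do not match what a BERT checkpoint expects.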
entity_vocab.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be6327e7cafc2f2b5f694a594d57113fd2bf6b620c592929202f75683b18b67d
3
- size 23721849
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44b62a4236024bcfbc396e434fb137edecbb106e7f6bc36bc2465016d99d84dd
3
+ size 20763373
tokenization_luke_bert_japanese.py CHANGED
@@ -18,7 +18,7 @@ import collections
18
  import copy
19
  import json
20
  import os
21
- from typing import List, Optional, Tuple
22
 
23
  from transformers.models.bert_japanese.tokenization_bert_japanese import (
24
  BasicTokenizer,
@@ -31,7 +31,9 @@ from transformers.models.bert_japanese.tokenization_bert_japanese import (
31
  load_vocab,
32
  )
33
  from transformers.models.luke import LukeTokenizer
34
- from transformers.tokenization_utils_base import AddedToken
35
  from transformers.utils import logging
36
 
37
 
@@ -53,7 +55,7 @@ class LukeBertJapaneseTokenizer(LukeTokenizer):
53
  vocab_files_names = VOCAB_FILES_NAMES
54
  pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
55
  max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
56
- model_input_names = ["input_ids", "attention_mask"]
57
 
58
  def __init__(
59
  self,
@@ -85,35 +87,6 @@ class LukeBertJapaneseTokenizer(LukeTokenizer):
85
  jumanpp_kwargs=None,
86
  **kwargs,
87
  ):
88
- # We call the grandparent's init, not the parent's.
89
- super(LukeTokenizer, self).__init__(
90
- spm_file=spm_file,
91
- unk_token=unk_token,
92
- sep_token=sep_token,
93
- pad_token=pad_token,
94
- cls_token=cls_token,
95
- mask_token=mask_token,
96
- do_lower_case=do_lower_case,
97
- do_word_tokenize=do_word_tokenize,
98
- do_subword_tokenize=do_subword_tokenize,
99
- word_tokenizer_type=word_tokenizer_type,
100
- subword_tokenizer_type=subword_tokenizer_type,
101
- never_split=never_split,
102
- mecab_kwargs=mecab_kwargs,
103
- sudachi_kwargs=sudachi_kwargs,
104
- jumanpp_kwargs=jumanpp_kwargs,
105
- task=task,
106
- max_entity_length=32,
107
- max_mention_length=30,
108
- entity_token_1="<ent>",
109
- entity_token_2="<ent2>",
110
- entity_unk_token=entity_unk_token,
111
- entity_pad_token=entity_pad_token,
112
- entity_mask_token=entity_mask_token,
113
- entity_mask2_token=entity_mask2_token,
114
- **kwargs,
115
- )
116
-
117
  if subword_tokenizer_type == "sentencepiece":
118
  if not os.path.isfile(spm_file):
119
  raise ValueError(
@@ -161,11 +134,11 @@ class LukeBertJapaneseTokenizer(LukeTokenizer):
161
  self.subword_tokenizer_type = subword_tokenizer_type
162
  if do_subword_tokenize:
163
  if subword_tokenizer_type == "wordpiece":
164
- self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
165
  elif subword_tokenizer_type == "character":
166
- self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token)
167
  elif subword_tokenizer_type == "sentencepiece":
168
- self.subword_tokenizer = SentencepieceTokenizer(vocab=self.spm_file, unk_token=self.unk_token)
169
  else:
170
  raise ValueError(f"Invalid subword_tokenizer_type '{subword_tokenizer_type}' is specified.")
171
 
@@ -212,6 +185,35 @@ class LukeBertJapaneseTokenizer(LukeTokenizer):
212
 
213
  self.max_mention_length = max_mention_length
214
 
215
  @property
216
  # Copied from BertJapaneseTokenizer
217
  def do_lower_case(self):
@@ -298,16 +300,13 @@ class LukeBertJapaneseTokenizer(LukeTokenizer):
298
  """
299
  Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
300
  adding special tokens. A BERT sequence has the following format:
301
-
302
  - single sequence: `[CLS] X [SEP]`
303
  - pair of sequences: `[CLS] A [SEP] B [SEP]`
304
-
305
  Args:
306
  token_ids_0 (`List[int]`):
307
  List of IDs to which the special tokens will be added.
308
  token_ids_1 (`List[int]`, *optional*):
309
  Optional second list of IDs for sequence pairs.
310
-
311
  Returns:
312
  `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
313
  """
@@ -324,7 +323,6 @@ class LukeBertJapaneseTokenizer(LukeTokenizer):
324
  """
325
  Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
326
  special tokens using the tokenizer `prepare_for_model` method.
327
-
328
  Args:
329
  token_ids_0 (`List[int]`):
330
  List of IDs.
@@ -332,7 +330,6 @@ class LukeBertJapaneseTokenizer(LukeTokenizer):
332
  Optional second list of IDs for sequence pairs.
333
  already_has_special_tokens (`bool`, *optional*, defaults to `False`):
334
  Whether or not the token list is already formatted with special tokens for the model.
335
-
336
  Returns:
337
  `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
338
  """
@@ -353,20 +350,16 @@ class LukeBertJapaneseTokenizer(LukeTokenizer):
353
  """
354
  Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
355
  pair mask has the following format:
356
-
357
  ```
358
  0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
359
  | first sequence | second sequence |
360
  ```
361
-
362
  If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
363
-
364
  Args:
365
  token_ids_0 (`List[int]`):
366
  List of IDs.
367
  token_ids_1 (`List[int]`, *optional*):
368
  Optional second list of IDs for sequence pairs.
369
-
370
  Returns:
371
  `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
372
  """
@@ -376,9 +369,384 @@ class LukeBertJapaneseTokenizer(LukeTokenizer):
376
  return len(cls + token_ids_0 + sep) * [0]
377
  return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
378
 
 
379
  def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
380
  return (text, kwargs)
381
382
  def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
383
  if os.path.isdir(save_directory):
384
  if self.subword_tokenizer_type == "sentencepiece":
 
18
  import copy
19
  import json
20
  import os
21
+ from typing import Dict, List, Optional, Tuple, Union
22
 
23
  from transformers.models.bert_japanese.tokenization_bert_japanese import (
24
  BasicTokenizer,
 
31
  load_vocab,
32
  )
33
  from transformers.models.luke import LukeTokenizer
34
+ from transformers.tokenization_utils_base import (
35
+ AddedToken, BatchEncoding, EncodedInput, PaddingStrategy, TensorType, TruncationStrategy
36
+ )
37
  from transformers.utils import logging
38
 
39
 
 
55
  vocab_files_names = VOCAB_FILES_NAMES
56
  pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
57
  max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
58
+ model_input_names = ["input_ids", "attention_mask", "position_ids"]
59
 
60
  def __init__(
61
  self,
 
87
  jumanpp_kwargs=None,
88
  **kwargs,
89
  ):
90
  if subword_tokenizer_type == "sentencepiece":
91
  if not os.path.isfile(spm_file):
92
  raise ValueError(
 
134
  self.subword_tokenizer_type = subword_tokenizer_type
135
  if do_subword_tokenize:
136
  if subword_tokenizer_type == "wordpiece":
137
+ self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
138
  elif subword_tokenizer_type == "character":
139
+ self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=str(unk_token))
140
  elif subword_tokenizer_type == "sentencepiece":
141
+ self.subword_tokenizer = SentencepieceTokenizer(vocab=self.spm_file, unk_token=str(unk_token))
142
  else:
143
  raise ValueError(f"Invalid subword_tokenizer_type '{subword_tokenizer_type}' is specified.")
144
 
 
185
 
186
  self.max_mention_length = max_mention_length
187
 
188
+ # We call the grandparent's init, not the parent's.
189
+ super(LukeTokenizer, self).__init__(
190
+ spm_file=spm_file,
191
+ unk_token=unk_token,
192
+ sep_token=sep_token,
193
+ pad_token=pad_token,
194
+ cls_token=cls_token,
195
+ mask_token=mask_token,
196
+ do_lower_case=do_lower_case,
197
+ do_word_tokenize=do_word_tokenize,
198
+ do_subword_tokenize=do_subword_tokenize,
199
+ word_tokenizer_type=word_tokenizer_type,
200
+ subword_tokenizer_type=subword_tokenizer_type,
201
+ never_split=never_split,
202
+ mecab_kwargs=mecab_kwargs,
203
+ sudachi_kwargs=sudachi_kwargs,
204
+ jumanpp_kwargs=jumanpp_kwargs,
205
+ task=task,
206
+ max_entity_length=32,
207
+ max_mention_length=30,
208
+ entity_token_1="<ent>",
209
+ entity_token_2="<ent2>",
210
+ entity_unk_token=entity_unk_token,
211
+ entity_pad_token=entity_pad_token,
212
+ entity_mask_token=entity_mask_token,
213
+ entity_mask2_token=entity_mask2_token,
214
+ **kwargs,
215
+ )
216
+
217
  @property
218
  # Copied from BertJapaneseTokenizer
219
  def do_lower_case(self):
 
300
  """
301
  Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
302
  adding special tokens. A BERT sequence has the following format:
 
303
  - single sequence: `[CLS] X [SEP]`
304
  - pair of sequences: `[CLS] A [SEP] B [SEP]`
 
305
  Args:
306
  token_ids_0 (`List[int]`):
307
  List of IDs to which the special tokens will be added.
308
  token_ids_1 (`List[int]`, *optional*):
309
  Optional second list of IDs for sequence pairs.
 
310
  Returns:
311
  `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
312
  """
 
323
  """
324
  Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
325
  special tokens using the tokenizer `prepare_for_model` method.
 
326
  Args:
327
  token_ids_0 (`List[int]`):
328
  List of IDs.
 
330
  Optional second list of IDs for sequence pairs.
331
  already_has_special_tokens (`bool`, *optional*, defaults to `False`):
332
  Whether or not the token list is already formatted with special tokens for the model.
 
333
  Returns:
334
  `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
335
  """
 
350
  """
351
  Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
352
  pair mask has the following format:
 
353
  ```
354
  0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
355
  | first sequence | second sequence |
356
  ```
 
357
  If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
 
358
  Args:
359
  token_ids_0 (`List[int]`):
360
  List of IDs.
361
  token_ids_1 (`List[int]`, *optional*):
362
  Optional second list of IDs for sequence pairs.
 
363
  Returns:
364
  `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
365
  """
 
369
  return len(cls + token_ids_0 + sep) * [0]
370
  return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
371
 
372
+ # Copied and modified from LukeTokenizer, removing the `add_prefix_space` process
373
  def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
374
  return (text, kwargs)
375
 
376
+ # Copied and modified from LukeTokenizer, adding `position_ids` to the output
377
+ def prepare_for_model(
378
+ self,
379
+ ids: List[int],
380
+ pair_ids: Optional[List[int]] = None,
381
+ entity_ids: Optional[List[int]] = None,
382
+ pair_entity_ids: Optional[List[int]] = None,
383
+ entity_token_spans: Optional[List[Tuple[int, int]]] = None,
384
+ pair_entity_token_spans: Optional[List[Tuple[int, int]]] = None,
385
+ add_special_tokens: bool = True,
386
+ padding: Union[bool, str, PaddingStrategy] = False,
387
+ truncation: Union[bool, str, TruncationStrategy] = None,
388
+ max_length: Optional[int] = None,
389
+ max_entity_length: Optional[int] = None,
390
+ stride: int = 0,
391
+ pad_to_multiple_of: Optional[int] = None,
392
+ return_tensors: Optional[Union[str, TensorType]] = None,
393
+ return_token_type_ids: Optional[bool] = None,
394
+ return_attention_mask: Optional[bool] = None,
395
+ return_overflowing_tokens: bool = False,
396
+ return_special_tokens_mask: bool = False,
397
+ return_offsets_mapping: bool = False,
398
+ return_length: bool = False,
399
+ verbose: bool = True,
400
+ prepend_batch_axis: bool = False,
401
+ **kwargs,
402
+ ) -> BatchEncoding:
403
+ """
404
+ Prepares a sequence of input id, entity id and entity span, or a pair of sequences of inputs ids, entity ids,
405
+ entity spans so that it can be used by the model. It adds special tokens, truncates sequences if overflowing
406
+ while taking into account the special tokens and manages a moving window (with user defined stride) for
407
+ overflowing tokens. Please Note, for *pair_ids* different than `None` and *truncation_strategy = longest_first*
408
+ or `True`, it is not possible to return overflowing tokens. Such a combination of arguments will raise an
409
+ error.
410
+
411
+ Args:
412
+ ids (`List[int]`):
413
+ Tokenized input ids of the first sequence.
414
+ pair_ids (`List[int]`, *optional*):
415
+ Tokenized input ids of the second sequence.
416
+ entity_ids (`List[int]`, *optional*):
417
+ Entity ids of the first sequence.
418
+ pair_entity_ids (`List[int]`, *optional*):
419
+ Entity ids of the second sequence.
420
+ entity_token_spans (`List[Tuple[int, int]]`, *optional*):
421
+ Entity spans of the first sequence.
422
+ pair_entity_token_spans (`List[Tuple[int, int]]`, *optional*):
423
+ Entity spans of the second sequence.
424
+ max_entity_length (`int`, *optional*):
425
+ The maximum length of the entity sequence.
426
+ """
427
+
428
+ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
429
+ padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
430
+ padding=padding,
431
+ truncation=truncation,
432
+ max_length=max_length,
433
+ pad_to_multiple_of=pad_to_multiple_of,
434
+ verbose=verbose,
435
+ **kwargs,
436
+ )
437
+
438
+ # Compute lengths
439
+ pair = bool(pair_ids is not None)
440
+ len_ids = len(ids)
441
+ len_pair_ids = len(pair_ids) if pair else 0
442
+
443
+ if return_token_type_ids and not add_special_tokens:
444
+ raise ValueError(
445
+ "Asking to return token_type_ids while setting add_special_tokens to False "
446
+ "results in an undefined behavior. Please set add_special_tokens to True or "
447
+ "set return_token_type_ids to None."
448
+ )
449
+ if (
450
+ return_overflowing_tokens
451
+ and truncation_strategy == TruncationStrategy.LONGEST_FIRST
452
+ and pair_ids is not None
453
+ ):
454
+ raise ValueError(
455
+ "Not possible to return overflowing tokens for pair of sequences with the "
456
+ "`longest_first`. Please select another truncation strategy than `longest_first`, "
457
+ "for instance `only_second` or `only_first`."
458
+ )
459
+
460
+ # Load from model defaults
461
+ if return_token_type_ids is None:
462
+ return_token_type_ids = "token_type_ids" in self.model_input_names
463
+ if return_attention_mask is None:
464
+ return_attention_mask = "attention_mask" in self.model_input_names
465
+
466
+ encoded_inputs = {}
467
+
468
+ # Compute the total size of the returned word encodings
469
+ total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)
470
+
471
+ # Truncation: Handle max sequence length and max_entity_length
472
+ overflowing_tokens = []
473
+ if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
474
+ # truncate words up to max_length
475
+ ids, pair_ids, overflowing_tokens = self.truncate_sequences(
476
+ ids,
477
+ pair_ids=pair_ids,
478
+ num_tokens_to_remove=total_len - max_length,
479
+ truncation_strategy=truncation_strategy,
480
+ stride=stride,
481
+ )
482
+
483
+ if return_overflowing_tokens:
484
+ encoded_inputs["overflowing_tokens"] = overflowing_tokens
485
+ encoded_inputs["num_truncated_tokens"] = total_len - max_length
486
+
487
+ # Add special tokens
488
+ if add_special_tokens:
489
+ sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
490
+ token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
491
+ entity_token_offset = 1 # 1 * <s> token
492
+ pair_entity_token_offset = len(ids) + 3 # 1 * <s> token & 2 * <sep> tokens
493
+ else:
494
+ sequence = ids + pair_ids if pair else ids
495
+ token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])
496
+ entity_token_offset = 0
497
+ pair_entity_token_offset = len(ids)
498
+
499
+ # Build output dictionary
500
+ encoded_inputs["input_ids"] = sequence
501
+ encoded_inputs["position_ids"] = list(range(len(sequence)))
502
+ if return_token_type_ids:
503
+ encoded_inputs["token_type_ids"] = token_type_ids
504
+ if return_special_tokens_mask:
505
+ if add_special_tokens:
506
+ encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
507
+ else:
508
+ encoded_inputs["special_tokens_mask"] = [0] * len(sequence)
509
+
510
+ # Set max entity length
511
+ if not max_entity_length:
512
+ max_entity_length = self.max_entity_length
513
+
514
+ if entity_ids is not None:
515
+ total_entity_len = 0
516
+ num_invalid_entities = 0
517
+ valid_entity_ids = [ent_id for ent_id, span in zip(entity_ids, entity_token_spans) if span[1] <= len(ids)]
518
+ valid_entity_token_spans = [span for span in entity_token_spans if span[1] <= len(ids)]
519
+
520
+ total_entity_len += len(valid_entity_ids)
521
+ num_invalid_entities += len(entity_ids) - len(valid_entity_ids)
522
+
523
+ valid_pair_entity_ids, valid_pair_entity_token_spans = None, None
524
+ if pair_entity_ids is not None:
525
+ valid_pair_entity_ids = [
526
+ ent_id
527
+ for ent_id, span in zip(pair_entity_ids, pair_entity_token_spans)
528
+ if span[1] <= len(pair_ids)
529
+ ]
530
+ valid_pair_entity_token_spans = [span for span in pair_entity_token_spans if span[1] <= len(pair_ids)]
531
+ total_entity_len += len(valid_pair_entity_ids)
532
+ num_invalid_entities += len(pair_entity_ids) - len(valid_pair_entity_ids)
533
+
534
+ if num_invalid_entities != 0:
535
+ logger.warning(
536
+ f"{num_invalid_entities} entities are ignored because their entity spans are invalid due to the"
537
+ " truncation of input tokens"
538
+ )
539
+
540
+ if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and total_entity_len > max_entity_length:
541
+ # truncate entities up to max_entity_length
542
+ valid_entity_ids, valid_pair_entity_ids, overflowing_entities = self.truncate_sequences(
543
+ valid_entity_ids,
544
+ pair_ids=valid_pair_entity_ids,
545
+ num_tokens_to_remove=total_entity_len - max_entity_length,
546
+ truncation_strategy=truncation_strategy,
547
+ stride=stride,
548
+ )
549
+ valid_entity_token_spans = valid_entity_token_spans[: len(valid_entity_ids)]
550
+ if valid_pair_entity_token_spans is not None:
551
+ valid_pair_entity_token_spans = valid_pair_entity_token_spans[: len(valid_pair_entity_ids)]
552
+
553
+ if return_overflowing_tokens:
554
+ encoded_inputs["overflowing_entities"] = overflowing_entities
555
+ encoded_inputs["num_truncated_entities"] = total_entity_len - max_entity_length
556
+
557
+ final_entity_ids = valid_entity_ids + valid_pair_entity_ids if valid_pair_entity_ids else valid_entity_ids
558
+ encoded_inputs["entity_ids"] = list(final_entity_ids)
559
+ entity_position_ids = []
560
+ entity_start_positions = []
561
+ entity_end_positions = []
562
+ for token_spans, offset in (
563
+ (valid_entity_token_spans, entity_token_offset),
564
+ (valid_pair_entity_token_spans, pair_entity_token_offset),
565
+ ):
566
+ if token_spans is not None:
567
+ for start, end in token_spans:
568
+ start += offset
569
+ end += offset
570
+ position_ids = list(range(start, end))[: self.max_mention_length]
571
+ position_ids += [-1] * (self.max_mention_length - end + start)
572
+ entity_position_ids.append(position_ids)
573
+ entity_start_positions.append(start)
574
+ entity_end_positions.append(end - 1)
575
+
576
+ encoded_inputs["entity_position_ids"] = entity_position_ids
577
+ if self.task == "entity_span_classification":
578
+ encoded_inputs["entity_start_positions"] = entity_start_positions
579
+ encoded_inputs["entity_end_positions"] = entity_end_positions
580
+
581
+ if return_token_type_ids:
582
+ encoded_inputs["entity_token_type_ids"] = [0] * len(encoded_inputs["entity_ids"])
583
+
584
+ # Check lengths
585
+ self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)
586
+
587
+ # Padding
588
+ if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
589
+ encoded_inputs = self.pad(
590
+ encoded_inputs,
591
+ max_length=max_length,
592
+ max_entity_length=max_entity_length,
593
+ padding=padding_strategy.value,
594
+ pad_to_multiple_of=pad_to_multiple_of,
595
+ return_attention_mask=return_attention_mask,
596
+ )
597
+
598
+ if return_length:
599
+ encoded_inputs["length"] = len(encoded_inputs["input_ids"])
600
+
601
+ batch_outputs = BatchEncoding(
602
+ encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
603
+ )
604
+
605
+ return batch_outputs
606
+
607
+ # Copied and modified from LukeTokenizer, adding the padding of `position_ids`
608
+ def _pad(
609
+ self,
610
+ encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
611
+ max_length: Optional[int] = None,
612
+ max_entity_length: Optional[int] = None,
613
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
614
+ pad_to_multiple_of: Optional[int] = None,
615
+ return_attention_mask: Optional[bool] = None,
616
+ ) -> dict:
617
+ """
618
+ Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
619
+
620
+
621
+ Args:
622
+ encoded_inputs:
623
+ Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
624
+ max_length: maximum length of the returned list and optionally padding length (see below).
625
+ Will truncate by taking into account the special tokens.
626
+ max_entity_length: The maximum length of the entity sequence.
627
+ padding_strategy: PaddingStrategy to use for padding.
628
+
629
+
630
+ - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
631
+ - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
632
+ - PaddingStrategy.DO_NOT_PAD: Do not pad
633
+ The tokenizer padding sides are defined in self.padding_side:
634
+
635
+
636
+ - 'left': pads on the left of the sequences
637
+ - 'right': pads on the right of the sequences
638
+ pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
639
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
640
+ `>= 7.5` (Volta).
641
+ return_attention_mask:
642
+ (optional) Set to False to avoid returning attention mask (default: set to model specifics)
643
+ """
644
+ entities_provided = bool("entity_ids" in encoded_inputs)
645
+
646
+ # Load from model defaults
647
+ if return_attention_mask is None:
648
+ return_attention_mask = "attention_mask" in self.model_input_names
649
+
650
+ if padding_strategy == PaddingStrategy.LONGEST:
651
+ max_length = len(encoded_inputs["input_ids"])
652
+ if entities_provided:
653
+ max_entity_length = len(encoded_inputs["entity_ids"])
654
+
655
+ if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
656
+ max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
657
+
658
+ if (
659
+ entities_provided
660
+ and max_entity_length is not None
661
+ and pad_to_multiple_of is not None
662
+ and (max_entity_length % pad_to_multiple_of != 0)
663
+ ):
664
+ max_entity_length = ((max_entity_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
665
+
666
+ needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and (
667
+ len(encoded_inputs["input_ids"]) != max_length
668
+ or (entities_provided and len(encoded_inputs["entity_ids"]) != max_entity_length)
669
+ )
670
+
671
+ # Initialize attention mask if not present.
672
+ if return_attention_mask and "attention_mask" not in encoded_inputs:
673
+ encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"])
674
+ if entities_provided and return_attention_mask and "entity_attention_mask" not in encoded_inputs:
675
+ encoded_inputs["entity_attention_mask"] = [1] * len(encoded_inputs["entity_ids"])
676
+
677
+ if needs_to_be_padded:
678
+ difference = max_length - len(encoded_inputs["input_ids"])
679
+ if entities_provided:
680
+ entity_difference = max_entity_length - len(encoded_inputs["entity_ids"])
681
+ if self.padding_side == "right":
682
+ if return_attention_mask:
683
+ encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
684
+ if entities_provided:
685
+ encoded_inputs["entity_attention_mask"] = (
686
+ encoded_inputs["entity_attention_mask"] + [0] * entity_difference
687
+ )
688
+ if "token_type_ids" in encoded_inputs:
689
+ encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"] + [0] * difference
690
+ if entities_provided:
691
+ encoded_inputs["entity_token_type_ids"] = (
692
+ encoded_inputs["entity_token_type_ids"] + [0] * entity_difference
693
+ )
694
+ if "special_tokens_mask" in encoded_inputs:
695
+ encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
696
+ encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference
697
+ encoded_inputs["position_ids"] = encoded_inputs["position_ids"] + [0] * difference
698
+ if entities_provided:
699
+ encoded_inputs["entity_ids"] = (
700
+ encoded_inputs["entity_ids"] + [self.entity_pad_token_id] * entity_difference
701
+ )
702
+ encoded_inputs["entity_position_ids"] = (
703
+ encoded_inputs["entity_position_ids"] + [[-1] * self.max_mention_length] * entity_difference
704
+ )
705
+ if self.task == "entity_span_classification":
706
+ encoded_inputs["entity_start_positions"] = (
707
+ encoded_inputs["entity_start_positions"] + [0] * entity_difference
708
+ )
709
+ encoded_inputs["entity_end_positions"] = (
710
+ encoded_inputs["entity_end_positions"] + [0] * entity_difference
711
+ )
712
+
713
+ elif self.padding_side == "left":
714
+ if return_attention_mask:
715
+ encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
716
+ if entities_provided:
717
+ encoded_inputs["entity_attention_mask"] = [0] * entity_difference + encoded_inputs[
718
+ "entity_attention_mask"
719
+ ]
720
+ if "token_type_ids" in encoded_inputs:
721
+ encoded_inputs["token_type_ids"] = [0] * difference + encoded_inputs["token_type_ids"]
722
+ if entities_provided:
723
+ encoded_inputs["entity_token_type_ids"] = [0] * entity_difference + encoded_inputs[
724
+ "entity_token_type_ids"
725
+ ]
726
+ if "special_tokens_mask" in encoded_inputs:
727
+ encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
728
+ encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"]
729
+ encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
730
+ if entities_provided:
731
+ encoded_inputs["entity_ids"] = [self.entity_pad_token_id] * entity_difference + encoded_inputs[
732
+ "entity_ids"
733
+ ]
734
+ encoded_inputs["entity_position_ids"] = [
735
+ [-1] * self.max_mention_length
736
+ ] * entity_difference + encoded_inputs["entity_position_ids"]
737
+ if self.task == "entity_span_classification":
738
+ encoded_inputs["entity_start_positions"] = [0] * entity_difference + encoded_inputs[
739
+ "entity_start_positions"
740
+ ]
741
+ encoded_inputs["entity_end_positions"] = [0] * entity_difference + encoded_inputs[
742
+ "entity_end_positions"
743
+ ]
744
+ else:
745
+ raise ValueError("Invalid padding strategy:" + str(self.padding_side))
746
+
747
+ return encoded_inputs
748
+
749
+ # Copied and modified from BertJapaneseTokenizer and LukeTokenizer
750
  def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
751
  if os.path.isdir(save_directory):
752
  if self.subword_tokenizer_type == "sentencepiece":
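
As a rough end-to-end sketch of the `prepare_for_model` and `_pad` overrides added above, assuming the model id from the README, an illustrative sentence, and that the entity `聖徳太子` is present in the updated entity vocabulary (as the update notes suggest):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "uzabase/luke-japanese-wordpiece-base", trust_remote_code=True
)

text = "聖徳太子は推古天皇の摂政を務めた。"  # illustrative sentence
entity_spans = [(0, 4)]                       # character span of "聖徳太子" in `text`

enc = tokenizer(
    text,
    entities=["聖徳太子"],
    entity_spans=entity_spans,
    padding="max_length",
    max_length=32,
)

# The overridden _pad extends position_ids with 0s in step with the
# [PAD]-padded input_ids, so the word-level features stay aligned.
assert len(enc["input_ids"]) == len(enc["position_ids"]) == 32
assert len(enc["attention_mask"]) == 32
```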