Source code for transformers.models.roformer.tokenization_roformer_fast

# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for RoFormer."""
import json
from typing import List, Optional, Tuple

from tokenizers import normalizers
from tokenizers.pre_tokenizers import BertPreTokenizer, PreTokenizer

from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_roformer import RoFormerTokenizer
from .tokenization_utils import JiebaPreTokenizer


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "junnyu/roformer_chinese_small": "https://huggingface.co/junnyu/roformer_chinese_small/resolve/main/vocab.txt",
        "junnyu/roformer_chinese_base": "https://huggingface.co/junnyu/roformer_chinese_base/resolve/main/vocab.txt",
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"junnyu/roformer_chinese_small": 1536, "junnyu/roformer_chinese_base": 1536}


PRETRAINED_INIT_CONFIGURATION = {
    "junnyu/roformer_chinese_small": {"do_lower_case": True},
    "junnyu/roformer_chinese_base": {"do_lower_case": True},
}


class RoFormerTokenizerFast(PreTrainedTokenizerFast):
    r"""
    Construct a "fast" RoFormer tokenizer (backed by HuggingFace's `tokenizers` library).

    :class:`~transformers.RoFormerTokenizerFast` is almost identical to :class:`~transformers.BertTokenizerFast` and
    runs end-to-end tokenization: punctuation splitting and wordpiece. There are some differences between them when
    tokenizing Chinese.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
    methods. Users should refer to this superclass for more information regarding those methods.

    Example::

        >>> from transformers import RoFormerTokenizerFast
        >>> tokenizer = RoFormerTokenizerFast.from_pretrained('junnyu/roformer_chinese_base')
        >>> tokenizer.tokenize("今天天气非常好。")
        # ['今', '天', '天', '气', '非常', '好', '。']
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    slow_tokenizer_class = RoFormerTokenizer

    def __init__(
        self,
        vocab_file,
        tokenizer_file=None,
        do_lower_case=True,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs
    ):
        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

        # If the loaded normalizer does not match the requested lowercasing / accent-stripping
        # options, rebuild it with the requested options.
        pre_tok_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
        if (
            pre_tok_state.get("lowercase", do_lower_case) != do_lower_case
            or pre_tok_state.get("strip_accents", strip_accents) != strip_accents
        ):
            pre_tok_class = getattr(normalizers, pre_tok_state.pop("type"))
            pre_tok_state["lowercase"] = do_lower_case
            pre_tok_state["strip_accents"] = strip_accents
            self.backend_tokenizer.normalizer = pre_tok_class(**pre_tok_state)

        self.do_lower_case = do_lower_case

    def __getstate__(self):
        # The custom Jieba pre-tokenizer cannot be pickled; swap in a standard BertPreTokenizer
        # before serializing.
        state = self.__dict__.copy()
        state["_tokenizer"].pre_tokenizer = BertPreTokenizer()
        return state

    def __setstate__(self, d):
        # Restore the custom Jieba pre-tokenizer after unpickling.
        self.__dict__ = d
        vocab = self.__dict__["_tokenizer"].get_vocab()
        self.__dict__["_tokenizer"].pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer(vocab))
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A RoFormer sequence has the following format:

        - single sequence: ``[CLS] X [SEP]``
        - pair of sequences: ``[CLS] A [SEP] B [SEP]``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

        if token_ids_1 is not None:
            output += token_ids_1 + [self.sep_token_id]

        return output
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A RoFormer
        sequence pair mask has the following format: ::

            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
            | first sequence    | second sequence |

        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
            sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)

    def save_pretrained(
        self,
        save_directory,
        legacy_format=None,
        filename_prefix=None,
        push_to_hub=False,
        **kwargs,
    ):
        # The custom Jieba pre-tokenizer cannot be serialized into tokenizer.json; replace it with
        # a standard BertPreTokenizer before saving.
        self.backend_tokenizer.pre_tokenizer = BertPreTokenizer()
        return super().save_pretrained(save_directory, legacy_format, filename_prefix, push_to_hub, **kwargs)
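
The helpers above are easiest to follow with a small end-to-end sketch. The snippet below is illustrative only (not part of the module) and assumes the `junnyu/roformer_chinese_base` checkpoint from the class docstring is available; the second example sentence is an arbitrary placeholder. It shows how the special-token helpers combine a sequence pair, and why the tokenizer survives a pickle round-trip through `__getstate__`/`__setstate__`.

    import pickle

    from transformers import RoFormerTokenizerFast

    tokenizer = RoFormerTokenizerFast.from_pretrained("junnyu/roformer_chinese_base")

    # Pair of sequences: [CLS] A [SEP] B [SEP], with token type ids 0...0 1...1.
    ids_a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("今天天气非常好。"))
    ids_b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("我们去公园。"))
    input_ids = tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)
    token_type_ids = tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b)
    assert len(input_ids) == len(token_type_ids)
    assert token_type_ids == [0] * (len(ids_a) + 2) + [1] * (len(ids_b) + 1)

    # Pickling: __getstate__ swaps the unpicklable custom Jieba pre-tokenizer for a
    # BertPreTokenizer, and __setstate__ restores it on load, so the restored tokenizer
    # segments Chinese text exactly as before.
    restored = pickle.loads(pickle.dumps(tokenizer))
    assert restored.tokenize("今天天气非常好。") == tokenizer.tokenize("今天天气非常好。")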