# coding=utf-8
# Copyright 2020 The Google AI Language Team Authors, Allegro.pl, Facebook Inc. and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ...utils import logging
from ..bert.tokenization_bert import BasicTokenizer
from ..xlm.tokenization_xlm import XLMTokenizer


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"allegro/herbert-base-cased": "https://huggingface.co/allegro/herbert-base-cased/resolve/main/vocab.json"
},
"merges_file": {
"allegro/herbert-base-cased": "https://huggingface.co/allegro/herbert-base-cased/resolve/main/merges.txt"
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"allegro/herbert-base-cased": 514}
PRETRAINED_INIT_CONFIGURATION = {}


class HerbertTokenizer(XLMTokenizer):
"""
Construct a BPE tokenizer for HerBERT.
Peculiarities:
- uses BERT's pre-tokenizer: BaseTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of a
punctuation character will be treated separately.
- Such pretokenized input is BPE subtokenized
This tokenizer inherits from :class:`~transformers.XLMTokenizer` which contains most of the methods. Users should
refer to the superclass for more information regarding methods.
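
    Example (an illustrative sketch rather than an exhaustive reference; it assumes the
    ``allegro/herbert-base-cased`` checkpoint can be downloaded, and the exact subword splits depend on the trained
    BPE merges)::

        from transformers import HerbertTokenizer

        tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-base-cased")
        tokens = tokenizer.tokenize("Kot siedzi na macie.")   # pre-tokenize with BasicTokenizer, then apply BPE
        ids = tokenizer.convert_tokens_to_ids(tokens)         # map subword strings to vocabulary ids
        encoded = tokenizer("Kot siedzi na macie.")           # full encoding, wrapped in <s> ... </s>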
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    def __init__(
        self,
        vocab_file,
        merges_file,
        tokenizer_file=None,
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        sep_token="</s>",
        do_lowercase_and_remove_accent=False,
        **kwargs
    ):
        super().__init__(
            vocab_file,
            merges_file,
            tokenizer_file=None,
            cls_token=cls_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
            sep_token=sep_token,
            do_lowercase_and_remove_accent=do_lowercase_and_remove_accent,
            **kwargs,
        )

        # BERT-style pre-tokenizer: splits on whitespace and punctuation, keeps casing and accents,
        # does not add spaces around Chinese characters, and never splits the special tokens.
        self.bert_pre_tokenizer = BasicTokenizer(
            do_lower_case=False,
            never_split=self.all_special_tokens,
            tokenize_chinese_chars=False,
            strip_accents=False,
        )

    def _tokenize(self, text):
        # Stage 1: BERT-style pre-tokenization, splitting on whitespace and punctuation.
        pre_tokens = self.bert_pre_tokenizer.tokenize(text)

        # Stage 2: each pre-token is further segmented into BPE subword units.
        split_tokens = []
        for token in pre_tokens:
            if token:
                split_tokens.extend(self.bpe(token).split(" "))
        return split_tokens
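

# A minimal usage sketch of the two stages implemented in ``_tokenize`` above (illustrative only; it assumes the
# "allegro/herbert-base-cased" checkpoint files are available locally or can be downloaded):
#
#     tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-base-cased")
#     words = tokenizer.bert_pre_tokenizer.tokenize("Zażółć gęślą jaźń.")   # whitespace + punctuation split
#     subwords = [piece for word in words for piece in tokenizer.bpe(word).split(" ")]
#     assert subwords == tokenizer._tokenize("Zażółć gęślą jaźń.")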