# coding=utf-8
# Copyright 2021, The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast tokenization class for BlenderbotSmall.""" | |
from typing import List, Optional | |
from tokenizers import ByteLevelBPETokenizer | |
from ...tokenization_utils_fast import PreTrainedTokenizerFast | |
from ...utils import logging | |
from .tokenization_blenderbot_small import BlenderbotSmallTokenizer | |
logger = logging.get_logger(__name__) | |


VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
    "tokenizer_config_file": "tokenizer_config.json",
}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/vocab.json"
    },
    "merges_file": {
        "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/merges.txt"
    },
    "tokenizer_config_file": {
        "facebook/blenderbot_small-90M": (
            "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/tokenizer_config.json"
        )
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "facebook/blenderbot_small-90M": 512,
}


class BlenderbotSmallTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" BlenderbotSmall tokenizer (backed by HuggingFace's *tokenizers* library).

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    slow_tokenizer_class = BlenderbotSmallTokenizer

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        add_prefix_space=False,
        trim_offsets=True,
        **kwargs,
    ):
        # The fast tokenizer is backed by a byte-level BPE model from the
        # HuggingFace `tokenizers` library, built from the vocab/merges files.
        super().__init__(
            ByteLevelBPETokenizer(
                vocab=vocab_file,
                merges=merges_file,
                add_prefix_space=add_prefix_space,
                trim_offsets=trim_offsets,
            ),
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            **kwargs,
        )
        self.add_prefix_space = add_prefix_space
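
    # Illustrative direct construction from local files (the paths below are
    # hypothetical); in practice `from_pretrained` is the usual entry point:
    #   BlenderbotSmallTokenizerFast(vocab_file="vocab.json", merges_file="merges.txt")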

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Wrap sequences with special tokens: `<bos> A <eos>` for a single
        sequence, `<bos> A <eos> <eos> B <eos>` for a pair.
        """
        output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
        if token_ids_1 is None:
            return output
        return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
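
    # Worked example (with hypothetical token ids): [5, 6] alone becomes
    # [bos, 5, 6, eos]; paired with [7, 8] it becomes
    # [bos, 5, 6, eos, eos, 7, 8, eos].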

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. BlenderbotSmall
        does not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
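
    # Worked example (illustrative): for token_ids_0 of length 3 and
    # token_ids_1 of length 2, the mask has (1 + 3 + 1) + (1 + 2 + 1) = 9
    # entries, all zero, since BlenderbotSmall has no token type embeddings.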

    @property
    # Copied from transformers.models.blenderbot.tokenization_blenderbot.BlenderbotTokenizer.default_chat_template
    def default_chat_template(self):
        """
        A very simple chat template that just adds whitespace between messages.
        """
        return (
            "{% for message in messages %}"
            "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}"
            "{{ message['content'] }}"
            "{% if not loop.last %}{{ ' ' }}{% endif %}"
            "{% endfor %}"
            "{{ eos_token }}"
        )
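

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the library). It assumes
# network access to the "facebook/blenderbot_small-90M" checkpoint referenced
# above; run it as a module so the relative imports resolve, e.g.
#   python -m transformers.models.blenderbot_small.tokenization_blenderbot_small_fast
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    tokenizer = BlenderbotSmallTokenizerFast.from_pretrained("facebook/blenderbot_small-90M")

    # Round-trip a sentence through the byte-level BPE backend.
    ids = tokenizer("sam is a great name. it means 'sun' in vietnamese").input_ids
    print(tokenizer.decode(ids))

    # Render a short conversation with the whitespace-joining template
    # returned by `default_chat_template` above.
    chat = [
        {"role": "user", "content": "hello, how are you?"},
        {"role": "assistant", "content": "i am fine, thank you."},
    ]
    print(tokenizer.apply_chat_template(chat, tokenize=False))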