Spaces:

yizhangliu
/

Grounded-Segment-Anything

Sleeping

Grounded-Segment-Anything / transformers_4_35_0 / models / blenderbot_small / tokenization_blenderbot_small_fast.py

liuyizhang

add transformers_4_35_0

1ce5e18 8 months ago

No virus

4.62 kB

	# coding=utf-8
	# Copyright 2021, The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""Fast tokenization class for BlenderbotSmall."""
	from typing import List, Optional

	from tokenizers import ByteLevelBPETokenizer

	from ...tokenization_utils_fast import PreTrainedTokenizerFast
	from ...utils import logging
	from .tokenization_blenderbot_small import BlenderbotSmallTokenizer


	# Module-level logger, obtained from the package's own `logging` utilities and keyed by this module's name.
	logger = logging.get_logger(__name__)

	# File name under which each tokenizer resource is stored.
	VOCAB_FILES_NAMES = dict(
	    vocab_file="vocab.json",
	    merges_file="merges.txt",
	    tokenizer_config_file="tokenizer_config.json",
	)

	# For every resource kind, map the known checkpoint name to the URL of that resource.
	# All three URLs share one prefix and end in the file name listed in VOCAB_FILES_NAMES,
	# so the table is derived from it (same keys, same order, same values).
	PRETRAINED_VOCAB_FILES_MAP = {
	    resource: {
	        "facebook/blenderbot_small-90M": (
	            "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/" + filename
	        )
	    }
	    for resource, filename in VOCAB_FILES_NAMES.items()
	}

	# Size associated with each checkpoint name; the tokenizer class exposes this table
	# as its `max_model_input_sizes` attribute.
	PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot_small-90M": 512}


	class BlenderbotSmallTokenizerFast(PreTrainedTokenizerFast):
	    """
	    Construct a "fast" BlenderbotSmall tokenizer (backed by HuggingFace's tokenizers library).

	    Args:
	        vocab_file (`str`, *optional*):
	            Path to the vocabulary file.
	        merges_file (`str`, *optional*):
	            Path to the merges file.
	        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
	            The unknown token, passed on to the base class.
	        bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
	            The beginning-of-sequence token, passed on to the base class.
	        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
	            The end-of-sequence token, passed on to the base class.
	        add_prefix_space (`bool`, *optional*, defaults to `False`):
	            Forwarded to the backend `ByteLevelBPETokenizer` and kept on the instance.
	        trim_offsets (`bool`, *optional*, defaults to `True`):
	            Forwarded to the backend `ByteLevelBPETokenizer`.
	        kwargs:
	            Any remaining keyword arguments are passed on to the base class.
	    """

	    vocab_files_names = VOCAB_FILES_NAMES
	    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
	    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
	    slow_tokenizer_class = BlenderbotSmallTokenizer

	    def __init__(
	        self,
	        vocab_file=None,
	        merges_file=None,
	        # The special tokens are the plain `<|endoftext|>` marker. The pipes must not be
	        # backslash-escaped: `\|` is not a Python escape, so the backslash would become
	        # part of the token text.
	        unk_token="<|endoftext|>",
	        bos_token="<|endoftext|>",
	        eos_token="<|endoftext|>",
	        add_prefix_space=False,
	        trim_offsets=True,
	        **kwargs,
	    ):
	        # Build the byte-level BPE backend first, then let the base class wrap it.
	        backend_tokenizer = ByteLevelBPETokenizer(
	            vocab=vocab_file,
	            merges=merges_file,
	            add_prefix_space=add_prefix_space,
	            trim_offsets=trim_offsets,
	        )
	        super().__init__(
	            backend_tokenizer,
	            bos_token=bos_token,
	            eos_token=eos_token,
	            unk_token=unk_token,
	            **kwargs,
	        )
	        self.add_prefix_space = add_prefix_space

	    def build_inputs_with_special_tokens(
	        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
	    ) -> List[int]:
	        """
	        Build model inputs from a sequence or a pair of sequences by adding the special token ids.

	        - single sequence: `bos X eos`
	        - pair of sequences: `bos A eos eos B eos`

	        Args:
	            token_ids_0 (`List[int]`):
	                List of IDs for the first sequence.
	            token_ids_1 (`List[int]`, *optional*):
	                Optional second list of IDs for sequence pairs.

	        Returns:
	            `List[int]`: The IDs with the special token ids inserted.
	        """
	        output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
	        if token_ids_1 is None:
	            return output

	        return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]

	    def create_token_type_ids_from_sequences(
	        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
	    ) -> List[int]:
	        """
	        Create a mask from the two sequences passed to be used in a sequence-pair classification task. BlenderbotSmall
	        does not make use of token type ids, therefore a list of zeros is returned.

	        Args:
	            token_ids_0 (`List[int]`):
	                List of IDs.
	            token_ids_1 (`List[int]`, *optional*):
	                Optional second list of IDs for sequence pairs.

	        Returns:
	            `List[int]`: List of zeros.
	        """
	        # Only the length matters: two special tokens wrap the first sequence, and a pair
	        # adds two more around the second one. This is the same length that
	        # `build_inputs_with_special_tokens` produces, without building temporary lists.
	        total_length = len(token_ids_0) + 2
	        if token_ids_1 is not None:
	            total_length += len(token_ids_1) + 2
	        return [0] * total_length

	    @property
	    # Copied from transformers.models.blenderbot.tokenization_blenderbot.BlenderbotTokenizer.default_chat_template
	    def default_chat_template(self):
	        """
	        A very simple chat template that just adds whitespace between messages.
	        """
	        return (
	            "{% for message in messages %}"
	            "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}"
	            "{{ message['content'] }}"
	            "{% if not loop.last %}{{ ' ' }}{% endif %}"
	            "{% endfor %}"
	            "{{ eos_token }}"
	        )