NeMo_Canary / nemo /collections /common /tokenizers /null_tokenizer.py

Upload folder using huggingface_hub

b386992 verified 3 months ago

2.01 kB

	# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer


	class NullTokenizer(MegatronTokenizer):
	    """Synthetic tokenizer for performance benchmarking and debugging.

	    "Text" handled by this tokenizer is a whitespace-separated string of
	    decimal integers; each integer is used directly as a token id. No real
	    vocabulary is involved, so ``vocab`` and ``inv_vocab`` are not available.

	    The end-of-document id is placed right after the regular ids, i.e. it is
	    equal to the ``vocab_size`` constructor argument, and the ``vocab_size``
	    property therefore reports one more than that argument.

	    Args:
	        vocab_size: vocabulary size for embedding (without the EOD token).
	    """

	    def __init__(self, vocab_size):
	        super().__init__(None, vocab_size=vocab_size)
	        self._vocab_size_without_eod = int(vocab_size)
	        # The EOD token takes the first id past the regular range.
	        self._eod_id = self._vocab_size_without_eod

	    def tokenize(self, text):
	        """Parse a whitespace-separated string of integers into token ids.

	        An empty (or whitespace-only) string yields an empty list, so that
	        ``tokenize(detokenize([]))`` round-trips instead of failing.

	        Args:
	            text: string such as ``"3 14 15"``.

	        Returns:
	            The list of integer ids, in order of appearance.

	        Raises:
	            ValueError: if any piece of ``text`` is not an integer literal.
	        """
	        # ``split()`` without a separator skips empty pieces, unlike
	        # ``split(' ')`` which returns [''] for '' and breaks ``int``.
	        return [int(piece) for piece in text.split()]

	    def detokenize(self, ids):
	        """Render token ids as a single-space-separated string.

	        Args:
	            ids: iterable of token ids.

	        Returns:
	            The ids converted with ``str`` and joined by one space.
	        """
	        return ' '.join(str(token_id) for token_id in ids)

	    def offsets(self, ids: list[int], text: str) -> list[int]:
	        """Compute the start position of each id in its detokenized string.

	        Positions are derived from ``ids`` alone, assuming the layout that
	        ``detokenize`` produces (ids separated by exactly one space).

	        Args:
	            ids: token ids.
	            text: not used by this implementation.

	        Returns:
	            One character offset per id, in the same order as ``ids``.
	        """
	        positions, cursor = [], 0
	        for token_id in ids:
	            positions.append(cursor)
	            # Advance past the id's digits plus the single separating space.
	            cursor += len(str(token_id)) + 1
	        return positions

	    @property
	    def vocab_size(self):
	        """Vocabulary size including the extra end-of-document token."""
	        return self._vocab_size_without_eod + 1

	    @property
	    def vocab(self):
	        """Not available: always raises ``NotImplementedError``."""
	        raise NotImplementedError

	    @property
	    def inv_vocab(self):
	        """Not available: always raises ``NotImplementedError``."""
	        raise NotImplementedError

	    @property
	    def cls(self):
	        """Always -1 (presumably a placeholder: no CLS token is defined here)."""
	        return -1

	    @property
	    def sep(self):
	        """Always -1 (presumably a placeholder: no SEP token is defined here)."""
	        return -1

	    @property
	    def mask(self):
	        """Always -1 (presumably a placeholder: no MASK token is defined here)."""
	        return -1

	    @property
	    def eod(self):
	        """Id of the end-of-document token (the constructor's ``vocab_size``)."""
	        return self._eod_id

	    @property
	    def additional_special_tokens_ids(self):
	        """Always ``None``."""
	        return None