from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer


class NullTokenizer(MegatronTokenizer):
    """
    Synthetic tokenizer for performance benchmarking and debugging.

    Tokens are integer literals, so no vocabulary file or trained tokenizer is needed.

    Args:
        vocab_size: vocabulary size for the embedding table, excluding the appended EOD token
    """

    def __init__(self, vocab_size):
        super().__init__(None, vocab_size=vocab_size)
        self._vocab_size_without_eod = int(vocab_size)
        # The EOD token is assigned the id one past the end of the base vocabulary.
        self._eod_id = self._vocab_size_without_eod

    def tokenize(self, text):
        # The "text" is a space-separated list of integer literals, e.g. "5 42 999".
        return [int(x) for x in text.split(' ')]

    def detokenize(self, ids):
        # Inverse of tokenize: join the ids back into a space-separated string.
        text = [str(x) for x in ids]
        return ' '.join(text)

    def offsets(self, ids: list[int], text: str) -> list[int]:
        # Starting character offset of each token in the detokenized string; each
        # token occupies len(str(id)) characters plus one separating space, so
        # `text` itself is not needed to compute the offsets.
        offsets, start_idx = [], 0
        for id_ in ids:
            offsets.append(start_idx)
            start_idx += 1 + len(str(id_))
        return offsets

    @property
    def vocab_size(self):
        # One larger than the requested size to account for the EOD token.
        return self._vocab_size_without_eod + 1

    @property
    def vocab(self):
        # No explicit token<->id mapping exists for this synthetic tokenizer.
        raise NotImplementedError

    @property
    def inv_vocab(self):
        raise NotImplementedError

    @property
    def cls(self):
        # The CLS, SEP, and MASK special tokens are unused; -1 marks them as absent.
        return -1

    @property
    def sep(self):
        return -1

    @property
    def mask(self):
        return -1

    @property
    def eod(self):
        return self._eod_id

    @property
    def additional_special_tokens_ids(self):
        return None
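

# A minimal usage sketch (illustrative, not part of the Megatron-LM module itself);
# it assumes a Megatron-LM checkout on PYTHONPATH so MegatronTokenizer resolves.
if __name__ == "__main__":
    tok = NullTokenizer(vocab_size=1000)
    ids = tok.tokenize("5 42 999")
    assert ids == [5, 42, 999]
    text = tok.detokenize(ids)
    assert text == "5 42 999"
    # "5" starts at 0, "42" at 2, "999" at 5 within "5 42 999".
    assert tok.offsets(ids, text) == [0, 2, 5]
    # The EOD id sits one past the base vocabulary; vocab_size includes it.
    assert tok.eod == 1000 and tok.vocab_size == 1001
    print("NullTokenizer round-trip OK")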