subhankarg's picture
Upload folder using huggingface_hub
0558aa4 verified
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from abc import ABC, abstractmethod
from collections import OrderedDict
from typing import List
# Names exported by `from <module> import *`: only the abstract tokenizer interface.
__all__ = ['TokenizerSpec']
# Sentinel distinguishing "attribute is absent" from "attribute is set to None".
_MISSING = object()


def _resolve_id(tokenizer, id_names, missing_desc: str):
    """Return the value of the first attribute in ``id_names`` found on ``tokenizer``.

    Each candidate is read exactly once via ``getattr`` with a sentinel default,
    so a property-backed id is not evaluated twice (as a ``hasattr`` check
    followed by a second attribute access would do), and an attribute whose
    value is ``None`` is still returned as-is.

    Args:
        tokenizer: object to look the attributes up on.
        id_names: attribute names to try, in priority order.
        missing_desc: text appended to the error message listing the names
            that were expected.

    Raises:
        AttributeError: if none of ``id_names`` is available on ``tokenizer``.
    """
    for attr_name in id_names:
        value = getattr(tokenizer, attr_name, _MISSING)
        if value is not _MISSING:
            return value
    raise AttributeError(f"{type(tokenizer).__name__} has no attribute {missing_desc}")


class TokenizerSpec(ABC):
    """
    Inherit this class to implement a new tokenizer.

    Subclasses must implement the six abstract conversion methods. The
    ``cls``/``sep``/``pad``/``eod``/``bos``/``eos``/``mask`` properties are
    aliases that forward to the corresponding ``*_id`` attribute when the
    subclass provides one and raise ``AttributeError`` otherwise.
    """

    @abstractmethod
    def text_to_tokens(self, text):
        """Converts text into a list of tokens."""

    @abstractmethod
    def tokens_to_text(self, tokens):
        """Converts a list of tokens back into text."""

    @abstractmethod
    def tokens_to_ids(self, tokens):
        """Converts a list of tokens to their corresponding IDs."""

    @abstractmethod
    def ids_to_tokens(self, ids):
        """Converts a list of token IDs back to tokens."""

    @abstractmethod
    def text_to_ids(self, text):
        """Converts text directly to token IDs."""

    @abstractmethod
    def ids_to_text(self, ids):
        """Converts token IDs back to text."""

    def add_special_tokens(self, special_tokens: List[str]):
        """Adds special tokens (eos, pad, cls...) to vocab.

        Optional hook: the base implementation always raises
        ``NotImplementedError``.
        """
        raise NotImplementedError("To be implemented")

    def apply_chat_template(self, *args, **kwargs):
        """Applies chat template and tokenizes results.

        Optional hook: the base implementation always raises
        ``NotImplementedError``.
        """
        raise NotImplementedError("To be implemented")

    @property
    def name(self):
        """Name of the concrete tokenizer class."""
        return type(self).__name__

    @property
    def unique_identifiers(self):
        """Property required for use with megatron-core datasets.

        Returns an ``OrderedDict`` with a single ``"class"`` entry holding the
        concrete class's ``<module>.<qualname>`` path.
        """
        concrete = type(self)
        return OrderedDict({"class": f"{concrete.__module__}.{concrete.__qualname__}"})

    @property
    def cls(self):
        """Property alias to match MegatronTokenizer; returns cls_id if available."""
        return _resolve_id(self, ('cls_id',), "'cls' or 'cls_id'")

    @property
    def sep(self):
        """Property alias to match MegatronTokenizer; returns sep_id if available."""
        return _resolve_id(self, ('sep_id',), "'sep' or 'sep_id'")

    @property
    def pad(self):
        """Property alias to match MegatronTokenizer; returns pad_id if available."""
        return _resolve_id(self, ('pad_id',), "'pad' or 'pad_id'")

    @property
    def eod(self):
        """Property alias to match MegatronTokenizer; returns eod_id if available.

        Defaults to the end-of-sentence id (``eos_id``) if end-of-document is
        not defined.
        """
        return _resolve_id(self, ('eod_id', 'eos_id'), "'eod', 'eod_id', 'eos', or 'eos_id'")

    @property
    def bos(self):
        """Property alias to match MegatronTokenizer; returns bos_id if available."""
        return _resolve_id(self, ('bos_id',), "'bos' or 'bos_id'")

    @property
    def eos(self):
        """Property alias to match MegatronTokenizer; returns eos_id if available."""
        return _resolve_id(self, ('eos_id',), "'eos' or 'eos_id'")

    @property
    def mask(self):
        """Property alias to match MegatronTokenizer; returns mask_id if available."""
        return _resolve_id(self, ('mask_id',), "'mask' or 'mask_id'")