from typing import Dict | |
from transformers import PreTrainedTokenizer, AddedToken | |
class CustomTokenizer(PreTrainedTokenizer):
    """Minimal whitespace tokenizer built on ``PreTrainedTokenizer``.

    The class splits text on whitespace and reports an empty vocabulary.
    It prints a trace message on construction and on every ``tokenize``
    call.
    """

    def __init__(self, **kwargs):
        """Initialize the base tokenizer, then print a trace message.

        Args:
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        super().__init__(**kwargs)
        print("Initializing CustomTokenizer")

    def tokenize(self, text, **kwargs):
        """Split ``text`` on whitespace and return the resulting pieces.

        Args:
            text: The string to split; it must provide ``str.split``.
            **kwargs: Accepted and ignored. The base-class ``tokenize``
                takes extra keyword arguments, and callers (including the
                base class's own encoding path) may forward them, so
                this override has to tolerate them instead of raising
                ``TypeError``.

        Returns:
            The list produced by ``text.split()``; empty for an empty or
            whitespace-only string.
        """
        print("Tokenizing text", text)
        return text.split()

    def get_vocab(self) -> Dict[str, int]:
        """Return the token-to-id mapping, which is always empty here.

        NOTE(review): with no vocabulary, token-to-id conversion cannot
        produce meaningful ids -- presumably this is a placeholder; confirm
        before using the tokenizer for encoding.
        """
        return {}