# custom-model/custom_tokenizer.py
from typing import Dict, List

from transformers import AddedToken, PreTrainedTokenizer


class CustomTokenizer(PreTrainedTokenizer):
    """Toy tokenizer that splits on whitespace and exposes an empty vocabulary."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        print("Initializing CustomTokenizer")

    def tokenize(self, text: str, **kwargs) -> List[str]:
        # Naive whitespace split; real tokenizers usually override _tokenize instead.
        print("Tokenizing text", text)
        return text.split()

    def get_vocab(self) -> Dict[str, int]:
        # No fixed vocabulary for this minimal example.
        return {}
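

# Minimal usage sketch (not part of the original file): instantiate the class and call
# tokenize() directly. Assumes a recent `transformers` release; full encoding would also
# require id-conversion methods (e.g. _convert_token_to_id), which are not implemented here.
if __name__ == "__main__":
    tokenizer = CustomTokenizer()
    print(tokenizer.tokenize("hello custom tokenizer"))  # ['hello', 'custom', 'tokenizer']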