# custom-model/custom_tokenizer.py
from typing import Dict, List

from transformers import AddedToken, PreTrainedTokenizer


class CustomTokenizer(PreTrainedTokenizer):
    """Toy tokenizer that splits on whitespace and exposes an empty vocabulary."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        print("Initializing CustomTokenizer")

    def tokenize(self, text: str, **kwargs) -> List[str]:
        # Naive whitespace split; real tokenizers usually override _tokenize instead.
        print("Tokenizing text", text)
        return text.split()

    def get_vocab(self) -> Dict[str, int]:
        # No fixed vocabulary for this minimal example.
        return {}
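

# Minimal usage sketch (not part of the original file): instantiate the class and call
# tokenize() directly. Assumes a recent `transformers` release; full encoding would also
# require id-conversion methods (e.g. _convert_token_to_id), which are not implemented here.
if __name__ == "__main__":
    tokenizer = CustomTokenizer()
    print(tokenizer.tokenize("hello custom tokenizer"))  # ['hello', 'custom', 'tokenizer']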