from transformers import DebertaV2Tokenizer


class DebertaV2JumanppTokenizer(DebertaV2Tokenizer):
    """DeBERTa-v2 tokenizer that segments text with Juman++ first.

    The input text is split into morphemes by ``JumanppTokenizer`` and the
    morpheme surfaces are re-joined with single spaces before the text is
    handed back to the parent ``DebertaV2Tokenizer`` pipeline.
    """

    def __init__(self, *args, **kwargs):
        """Initialize the parent tokenizer and create the Juman++ wrapper.

        All positional and keyword arguments are forwarded unchanged to
        ``DebertaV2Tokenizer.__init__``.
        """
        super().__init__(*args, **kwargs)
        # Resolved at call time, so it is fine that JumanppTokenizer is
        # defined further down in this module.
        self.juman_tokenizer = JumanppTokenizer()

    def prepare_for_tokenization(
        self, text: str, is_split_into_words: bool = False, **kwargs
    ) -> tuple[str, dict]:
        """Segment ``text`` with Juman++ and optionally prepend a space.

        Args:
            text: The raw text to prepare.
            is_split_into_words: When true, a leading space is prepended to
                the segmented text.
            **kwargs: Extra keyword arguments. ``add_prefix_space`` (default
                ``False``) is consumed here; when true it also causes a
                leading space to be prepended.

        Returns:
            A ``(text, kwargs)`` tuple: the space-separated morpheme string
            and the remaining keyword arguments with ``add_prefix_space``
            removed.
        """
        text = self.juman_tokenizer.tokenize(text)

        add_prefix_space = kwargs.pop("add_prefix_space", False)
        if is_split_into_words or add_prefix_space:
            text = " " + text
        return (text, kwargs)


class JumanppTokenizer:
    """Thin wrapper around ``rhoknp.Jumanpp`` for whitespace segmentation."""

    def __init__(self):
        """Create the underlying ``rhoknp.Jumanpp`` analyzer.

        Raises:
            ImportError: If the ``rhoknp`` package cannot be imported.
        """
        # Imported lazily so that merely importing this module does not
        # require rhoknp to be installed.
        try:
            import rhoknp
        except ImportError as err:
            # Chain explicitly so the original import failure stays visible
            # as the cause of this more descriptive error.
            raise ImportError(
                "You need to install rhoknp to use JumanppTokenizer. "
                "See https://github.com/ku-nlp/rhoknp for installation."
            ) from err
        self.juman = rhoknp.Jumanpp()

    def tokenize(self, text: str) -> str:
        """Return ``text`` as morpheme surfaces joined by single spaces.

        Args:
            text: The sentence to analyze with Juman++.

        Returns:
            The surface form (``surf``) of every morpheme produced by
            ``apply_to_sentence``, joined with ``" "``.
        """
        sentence = self.juman.apply_to_sentence(text)
        return " ".join(morpheme.surf for morpheme in sentence.morphemes)