# deberta-v2-base-japanese / tokenization_deberta_v2_jumanpp.py
# Author: tealgreen0503 — "feat: add custom normal tokenizer" (commit ac868f8)
from transformers import DebertaV2Tokenizer
class DebertaV2JumanppTokenizer(DebertaV2Tokenizer):
    """DebertaV2Tokenizer that pre-segments Japanese text with Juman++.

    Raw input is split into whitespace-separated morphemes by a
    ``JumanppTokenizer`` before the regular DeBERTa-v2 (sentencepiece)
    tokenization runs.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Morphological analyzer used to pre-segment every input string.
        self.juman_tokenizer = JumanppTokenizer()

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs) -> tuple[str, dict]:
        """Segment ``text`` with Juman++ before the base tokenization.

        Pops ``add_prefix_space`` from ``kwargs``; when it is truthy (or
        ``is_split_into_words`` is), a leading space is prepended to the
        segmented text. Returns the text together with the remaining kwargs.
        """
        segmented = self.juman_tokenizer.tokenize(text)
        wants_prefix_space = kwargs.pop("add_prefix_space", False)
        if is_split_into_words or wants_prefix_space:
            segmented = " " + segmented
        return segmented, kwargs
class JumanppTokenizer:
    """Thin wrapper around rhoknp's Juman++ morphological analyzer."""

    def __init__(self):
        """Create the Juman++ backend.

        Raises:
            ImportError: if the optional ``rhoknp`` dependency is not installed.
        """
        try:
            import rhoknp
        except ImportError as error:
            # Fix: the message previously named a nonexistent class
            # ("JumanppPreTokenizer"); also chain the original error so the
            # real import failure stays visible in the traceback.
            raise ImportError(
                "You need to install rhoknp to use JumanppTokenizer. "
                "See https://github.com/ku-nlp/rhoknp for installation."
            ) from error
        self.juman = rhoknp.Jumanpp()

    def tokenize(self, text: str) -> str:
        """Return ``text`` as whitespace-joined morpheme surface forms."""
        sentence = self.juman.apply_to_sentence(text)
        return " ".join(morpheme.surf for morpheme in sentence.morphemes)