deberta-v2-base-japanese-with-auto-jumanpp / tokenization_deberta_v2_jumanpp.py
nobu-g's picture
Fix handling of long text. (#3)
06e4a39
from transformers import DebertaV2Tokenizer
class DebertaV2JumanppTokenizer(DebertaV2Tokenizer):
    """DeBERTa-v2 tokenizer that word-segments its input with Juman++ first.

    Each text handed to ``prepare_for_tokenization`` is run through a
    ``JumanppTokenizer`` so that it reaches the inherited tokenizer as
    space-separated morphemes.
    """

    def __init__(self, *args, **kwargs):
        """Build the base tokenizer, then attach the Juman++ segmenter.

        All positional and keyword arguments are forwarded unchanged to
        ``DebertaV2Tokenizer.__init__``.
        """
        super().__init__(*args, **kwargs)
        self.juman_tokenizer = JumanppTokenizer()

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs) -> tuple[str, dict]:
        """Segment *text* with Juman++ and optionally prepend a space.

        Args:
            text: Raw input text.
            is_split_into_words: When true, a leading space is added to the
                segmented text.
            **kwargs: Extra options. ``add_prefix_space`` is consumed here
                (default ``False``); a true value also adds the leading space.

        Returns:
            A ``(text, kwargs)`` tuple holding the segmented text and the
            keyword arguments that were not consumed.
        """
        segmented = self.juman_tokenizer.tokenize(text)
        wants_prefix_space = kwargs.pop("add_prefix_space", False)
        prepared = " " + segmented if (is_split_into_words or wants_prefix_space) else segmented
        return (prepared, kwargs)
class JumanppTokenizer:
    """Word segmenter that splits text into morphemes with Juman++ via rhoknp.

    ``rhoknp`` is imported inside ``__init__`` rather than at module level, so
    importing this module alone does not require the package.
    """

    def __init__(self):
        """Import rhoknp and create the Juman++ processor.

        Raises:
            ImportError: If the ``rhoknp`` package cannot be imported.
        """
        try:
            import rhoknp
        except ImportError as err:
            # Chain explicitly so the underlying import failure stays attached
            # as the cause of the friendlier installation hint.
            raise ImportError(
                "You need to install rhoknp to use JumanppTokenizer. "
                "See https://github.com/ku-nlp/rhoknp for installation."
            ) from err
        # Keep the module itself: tokenize() needs rhoknp.Document for its
        # fallback path.
        self.rhoknp = rhoknp
        self.jumanpp = rhoknp.Jumanpp()

    def tokenize(self, text: str) -> str:
        """Return *text* with its morphemes joined by single spaces.

        The text is first analyzed as one sentence. If that raises
        ``RuntimeError``, the text is wrapped in a ``rhoknp.Document`` and
        analyzed as a document instead.

        Args:
            text: Raw input text.

        Returns:
            The surface forms of the resulting morphemes, space-separated.
        """
        try:
            morphemes = self.jumanpp.apply_to_sentence(text).morphemes
        except RuntimeError:
            doc = self.rhoknp.Document.from_raw_text(text)
            morphemes = self.jumanpp.apply_to_document(doc).morphemes
        return " ".join(morpheme.surf for morpheme in morphemes)