File size: 3,028 Bytes
ff444a8 299710f ff444a8 299710f ff444a8 299710f 71e8994 ff444a8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import copy
from typing import List
from tokenizers import NormalizedString, PreTokenizedString, normalizers, pre_tokenizers
from transformers import DebertaV2TokenizerFast
class DebertaV2JumanppTokenizerFast(DebertaV2TokenizerFast):
    """DeBERTa-v2 fast tokenizer that runs Juman++ segmentation first.

    On construction the backend tokenizer's normalizer and pre-tokenizer are
    each wrapped in a ``Sequence`` whose first stage is the Juman++-specific
    step and whose second stage is the tokenizer's original component.

    Attributes set by ``__init__``:
        juman_normalizer: character replacements applied before Juman++ runs.
        juman_pre_tokenizer: custom pre-tokenizer backed by ``JumanppPreTokenizer``.
        default_normalizer: deep copy of the backend's original normalizer.
        default_pre_tokenizer: deep copy of the backend's original pre-tokenizer.
    """

    def __init__(self, *args, **kwargs):
        """Build the base tokenizer, then install the Juman++ stages.

        All positional and keyword arguments are forwarded unchanged to
        ``DebertaV2TokenizerFast.__init__``.
        """
        super().__init__(*args, **kwargs)

        self.juman_normalizer = normalizers.Sequence(
            [
                # cf. https://github.com/ku-nlp/rhoknp/blob/v1.3.0/src/rhoknp/units/sentence.py#L36
                normalizers.Replace("\r", ""),
                normalizers.Replace("\n", ""),
                # cf. https://github.com/ku-nlp/jumanpp/blob/v2.0.0-rc3/src/jumandic/shared/juman_format.cc#L44-L61
                # Juman++ escapes these ASCII characters in its output, so the
                # input is rewritten the same way up front. The targets are
                # written as escapes so they cannot be silently folded back to
                # ASCII (which would turn each replacement into a no-op).
                normalizers.Replace("\t", "\\t"),
                normalizers.Replace(" ", "\u3000"),  # IDEOGRAPHIC SPACE
                normalizers.Replace('"', "\u201d"),  # RIGHT DOUBLE QUOTATION MARK
                normalizers.Replace("<", "\uff1c"),  # FULLWIDTH LESS-THAN SIGN
                normalizers.Replace(">", "\uff1e"),  # FULLWIDTH GREATER-THAN SIGN
            ]
        )
        self.juman_pre_tokenizer = pre_tokenizers.PreTokenizer.custom(JumanppPreTokenizer())

        # Keep pristine copies of the stock components so they can be put
        # back temporarily while the tokenizer is being saved.
        self.default_normalizer = copy.deepcopy(self.backend_tokenizer.normalizer)
        self.default_pre_tokenizer = copy.deepcopy(self.backend_tokenizer.pre_tokenizer)

        self._install_juman_pipeline()

    def _install_juman_pipeline(self):
        """Chain the Juman++ stages in front of the stock normalizer/pre-tokenizer."""
        self.backend_tokenizer.normalizer = normalizers.Sequence(
            [self.juman_normalizer, self.default_normalizer]
        )
        self.backend_tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [self.juman_pre_tokenizer, self.default_pre_tokenizer]
        )

    def save_pretrained(self, *args, **kwargs):
        """Save the tokenizer with its stock normalizer and pre-tokenizer.

        The Juman++ stages are removed for the duration of the save and
        reinstalled afterwards, even when saving raises.
        NOTE(review): presumably required because a custom Python
        pre-tokenizer cannot be serialized by ``tokenizers`` -- confirm.

        Returns:
            Whatever ``DebertaV2TokenizerFast.save_pretrained`` returns.
        """
        self.backend_tokenizer.normalizer = self.default_normalizer
        self.backend_tokenizer.pre_tokenizer = self.default_pre_tokenizer
        try:
            return super().save_pretrained(*args, **kwargs)
        finally:
            self._install_juman_pipeline()
class JumanppPreTokenizer:
    """Custom pre-tokenizer that splits text into Juman++ morphemes via rhoknp."""

    def __init__(self):
        """Load rhoknp and create the Juman++ wrapper.

        Raises:
            ImportError: if the ``rhoknp`` package is not installed.
        """
        # Imported here rather than at module level, so importing this module
        # does not itself require rhoknp.
        try:
            import rhoknp
        except ImportError as err:
            raise ImportError(
                "You need to install rhoknp to use JumanppPreTokenizer. "
                "See https://github.com/ku-nlp/rhoknp for installation."
            ) from err
        # The import above only binds a local name; keep the module on the
        # instance so jumanpp_split can reach rhoknp.Document later.
        self.rhoknp = rhoknp
        self.jumanpp = rhoknp.Jumanpp()

    def pre_tokenize(self, pretok: PreTokenizedString):
        """Split ``pretok`` in place, using ``jumanpp_split`` as the callback."""
        pretok.split(self.jumanpp_split)

    def jumanpp_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
        """Cut ``normalized_string`` at the morpheme boundaries found by Juman++.

        Args:
            i: index supplied by the ``split`` callback protocol; unused here.
            normalized_string: the piece of text to segment.

        Returns:
            One slice of ``normalized_string`` per morpheme span, in order.
        """
        text = str(normalized_string)
        spans = [morpheme.span for morpheme in self.jumanpp.apply_to_sentence(text).morphemes]
        if not spans:
            # Sentence-level analysis produced no morphemes; retry with the
            # text wrapped as a document.
            # NOTE(review): spans are used as offsets into the whole input --
            # confirm `span` is still input-relative if rhoknp splits the
            # document into several sentences.
            doc = self.rhoknp.Document.from_raw_text(text)
            spans = [morpheme.span for morpheme in self.jumanpp.apply_to_document(doc).morphemes]
        return [normalized_string[start:end] for start, end in spans]
|