tealgreen0503 committed
Commit dc384f8
1 Parent(s): 8336809

feat: add custom fast tokenizer

tokenization_deberta_v2_jumanpp_fast.py ADDED
@@ -0,0 +1,64 @@
+ import copy
+
+ from tokenizers import NormalizedString, PreTokenizedString, normalizers, pre_tokenizers
+ from transformers import DebertaV2TokenizerFast
+
+
+ class DebertaV2JumanppTokenizerFast(DebertaV2TokenizerFast):
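+     # Subclass of the stock DeBERTa-v2 fast tokenizer that runs Juman++
+     # morpheme segmentation before SentencePiece subword splitting.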
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
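+         # Mirror the input normalization performed by rhoknp and Juman++ so the
+         # text handed to the pre-tokenizer matches what Juman++ expects.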
+         self.juman_normalizer = normalizers.Sequence(
+             [
+                 # cf. https://github.com/ku-nlp/rhoknp/blob/v1.3.0/src/rhoknp/units/sentence.py#L36
+                 normalizers.Replace("\r", ""),
+                 normalizers.Replace("\n", ""),
+                 # cf. https://github.com/ku-nlp/jumanpp/blob/v2.0.0-rc3/src/jumandic/shared/juman_format.cc#L44-L61
+                 normalizers.Replace("\t", "\\t"),
+                 normalizers.Replace(" ", "　"),
+                 normalizers.Replace('"', "”"),
+                 normalizers.Replace("<", "＜"),
+                 normalizers.Replace(">", "＞"),
+             ]
+         )
+         self.juman_pre_tokenizer = pre_tokenizers.PreTokenizer.custom(JumanppPreTokenizer())
+
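+         # Keep copies of the serializable default components: custom Python
+         # components cannot be written to tokenizer.json, so save_pretrained()
+         # below temporarily restores these defaults.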
+         self.default_normalizer = copy.deepcopy(self.backend_tokenizer.normalizer)
+         self.default_pre_tokenizer = copy.deepcopy(self.backend_tokenizer.pre_tokenizer)
+
+         self.backend_tokenizer.normalizer = normalizers.Sequence(
+             [self.juman_normalizer, self.backend_tokenizer.normalizer]
+         )
+         self.backend_tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+             [self.juman_pre_tokenizer, self.backend_tokenizer.pre_tokenizer]
+         )
+
+     def save_pretrained(self, *args, **kwargs):
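+         # Swap the serializable defaults back in while saving, then re-attach
+         # the Juman++ components so the live tokenizer keeps working.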
+         self.backend_tokenizer.normalizer = self.default_normalizer
+         self.backend_tokenizer.pre_tokenizer = self.default_pre_tokenizer
+         super().save_pretrained(*args, **kwargs)
+
+         self.backend_tokenizer.normalizer = normalizers.Sequence(
+             [self.juman_normalizer, self.backend_tokenizer.normalizer]
+         )
+         self.backend_tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+             [self.juman_pre_tokenizer, self.backend_tokenizer.pre_tokenizer]
+         )
+
+
+ class JumanppPreTokenizer:
+     def __init__(self):
+         try:
+             import rhoknp
+         except ImportError:
+             raise ImportError(
+                 "You need to install rhoknp to use JumanppPreTokenizer. "
+                 "See https://github.com/ku-nlp/rhoknp for installation."
+             )
+         self.juman = rhoknp.Jumanpp()
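+
+     # Entry point called by the tokenizers library: split() applies
+     # jumanpp_split to every piece of the PreTokenizedString.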
+     def pre_tokenize(self, pretok: PreTokenizedString):
+         pretok.split(self.jumanpp_split)
+
+     def jumanpp_split(self, i: int, normalized_string: NormalizedString) -> list[NormalizedString]:
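+         # Segment the piece with Juman++ and slice the NormalizedString at the
+         # resulting morpheme boundaries (character offsets into the piece).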
+         offsets = [morpheme.span for morpheme in self.juman.apply_to_sentence(str(normalized_string)).morphemes]
+         return [normalized_string[offset[0]:offset[1]] for offset in offsets]
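
A minimal usage sketch (assumptions, not part of this commit: the repo's tokenizer_config.json registers this class via auto_map, "<org>/<model>" is a placeholder repo id, and both rhoknp and the jumanpp binary are installed):

    from transformers import AutoTokenizer

    # trust_remote_code=True lets transformers import the custom tokenizer class
    # shipped in the model repository instead of the built-in DebertaV2TokenizerFast.
    tokenizer = AutoTokenizer.from_pretrained("<org>/<model>", trust_remote_code=True)

    # Juman++ pre-segments the text into morphemes; SentencePiece then splits
    # each morpheme into subwords.
    print(tokenizer.tokenize("外国人参政権"))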