tealgreen0503 commited on
Commit
ac868f8
1 Parent(s): dc384f8

feat: add custom normal tokenizer

Browse files
Files changed (1) hide show
  1. tokenization_deberta_v2_jumanpp.py +30 -0
tokenization_deberta_v2_jumanpp.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import DebertaV2Tokenizer
2
+
3
+
4
class DebertaV2JumanppTokenizer(DebertaV2Tokenizer):
    """DebertaV2 tokenizer that pre-segments Japanese text with Juman++.

    Every input string is run through Juman++ morphological analysis
    (space-joined surface forms) before the regular DebertaV2
    SentencePiece tokenization takes over.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Wrapper around the rhoknp Juman++ binding, defined below in this file.
        self.juman_tokenizer = JumanppTokenizer()

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs) -> tuple[str, dict]:
        """Segment *text* with Juman++, then apply the prefix-space rule.

        Returns the (possibly space-prefixed) segmented text together with
        the remaining keyword arguments, mirroring the parent-class contract.
        """
        segmented = self.juman_tokenizer.tokenize(text)
        # Pop unconditionally so the key never leaks through to later stages.
        wants_prefix_space = kwargs.pop("add_prefix_space", False) or is_split_into_words
        if wants_prefix_space:
            segmented = " " + segmented
        return (segmented, kwargs)
16
+
17
+
18
class JumanppTokenizer:
    """Thin wrapper around the rhoknp Juman++ morphological analyzer."""

    def __init__(self):
        # Import lazily so the dependency is only required when this
        # tokenizer is actually instantiated.
        try:
            import rhoknp
        except ImportError:
            raise ImportError(
                "You need to install rhoknp to use JumanppPreTokenizer. "
                "See https://github.com/ku-nlp/rhoknp for installation."
            )
        self.juman = rhoknp.Jumanpp()

    def tokenize(self, text: str) -> str:
        """Return *text* as its Juman++ morpheme surface forms, space-joined."""
        sentence = self.juman.apply_to_sentence(text)
        surfaces = (morpheme.surf for morpheme in sentence.morphemes)
        return " ".join(surfaces)