add-custom-tokenizer #3
by tealgreen0503 - opened

Files changed:
- README.md  +6 -4
- tokenization_deberta_v2_jumanpp.py  +30 -0
- tokenization_deberta_v2_jumanpp_fast.py  +64 -0
- tokenizer_config.json  +8 -2
README.md
CHANGED
@@ -14,7 +14,7 @@ metrics:
 - accuracy
 mask_token: "[MASK]"
 widget:
-- text: "
+- text: "京都大学で自然言語処理を[MASK]する。"
 ---

 # Model Card for Japanese DeBERTa V2 base
@@ -29,10 +29,10 @@ You can use this model for masked language modeling as follows:

 ```python
 from transformers import AutoTokenizer, AutoModelForMaskedLM
-tokenizer = AutoTokenizer.from_pretrained('ku-nlp/deberta-v2-base-japanese')
+tokenizer = AutoTokenizer.from_pretrained('ku-nlp/deberta-v2-base-japanese', trust_remote_code=True)
 model = AutoModelForMaskedLM.from_pretrained('ku-nlp/deberta-v2-base-japanese')

-sentence = '
+sentence = '京都大学で自然言語処理を[MASK]する。'
 encoding = tokenizer(sentence, return_tensors='pt')
 ...
 ```
@@ -41,7 +41,9 @@ You can also fine-tune this model on downstream tasks.

 ## Tokenization

-The input text should be segmented into words by [Juman++](https://github.com/ku-nlp/jumanpp) in advance. [Juman++ 2.0.0-rc3](https://github.com/ku-nlp/jumanpp/releases/tag/v2.0.0-rc3) was used for pre-training. Each word is tokenized into subwords by [sentencepiece](https://github.com/google/sentencepiece).
+~~The input text should be segmented into words by [Juman++](https://github.com/ku-nlp/jumanpp) in advance. [Juman++ 2.0.0-rc3](https://github.com/ku-nlp/jumanpp/releases/tag/v2.0.0-rc3) was used for pre-training. Each word is tokenized into subwords by [sentencepiece](https://github.com/google/sentencepiece).~~
+
+UPDATE: The input text is internally segmented by [Juman++](https://github.com/ku-nlp/jumanpp) within `DebertaV2JumanppTokenizer(Fast)`, so there's no need to segment it in advance. To use `DebertaV2JumanppTokenizer(Fast)`, you need to install [Juman++ 2.0.0-rc3](https://github.com/ku-nlp/jumanpp/releases/tag/v2.0.0-rc3) and [rhoknp](https://github.com/ku-nlp/rhoknp).

 ## Training data

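As a quick check of the updated README usage, the sketch below walks the masked-language-modeling example end to end. It is a minimal sketch, assuming this PR is merged into `ku-nlp/deberta-v2-base-japanese` and that Juman++ 2.0.0-rc3 and rhoknp are installed; the inspection lines at the end are illustrative additions, not part of the model card.

```python
# Minimal sketch of the updated README usage; assumes the PR is merged and
# Juman++ 2.0.0-rc3 plus rhoknp are installed on the machine.
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# trust_remote_code=True lets AutoTokenizer load DebertaV2JumanppTokenizer(Fast)
# from the repository's tokenization_deberta_v2_jumanpp*.py files.
tokenizer = AutoTokenizer.from_pretrained("ku-nlp/deberta-v2-base-japanese", trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained("ku-nlp/deberta-v2-base-japanese")

# Raw text goes in directly; Juman++ segmentation happens inside the tokenizer.
sentence = "京都大学で自然言語処理を[MASK]する。"
encoding = tokenizer(sentence, return_tensors="pt")

# Inspect how the sentence was split into subwords.
print(tokenizer.convert_ids_to_tokens(encoding["input_ids"][0]))

with torch.no_grad():
    output = model(**encoding)
print(output.logits.shape)  # (1, sequence_length, vocab_size)
```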
tokenization_deberta_v2_jumanpp.py
ADDED
@@ -0,0 +1,30 @@
+from transformers import DebertaV2Tokenizer
+
+
+class DebertaV2JumanppTokenizer(DebertaV2Tokenizer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.juman_tokenizer = JumanppTokenizer()
+
+    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs) -> tuple[str, dict]:
+        text = self.juman_tokenizer.tokenize(text)
+
+        add_prefix_space = kwargs.pop("add_prefix_space", False)
+        if is_split_into_words or add_prefix_space:
+            text = " " + text
+        return (text, kwargs)
+
+
+class JumanppTokenizer:
+    def __init__(self):
+        try:
+            import rhoknp
+        except ImportError:
+            raise ImportError(
+                "You need to install rhoknp to use JumanppPreTokenizer. "
+                "See https://github.com/ku-nlp/rhoknp for installation."
+            )
+        self.juman = rhoknp.Jumanpp()
+
+    def tokenize(self, text: str) -> str:
+        return " ".join([morpheme.surf for morpheme in self.juman.apply_to_sentence(text).morphemes])
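The slow tokenizer's word-segmentation step can be exercised in isolation for review. The snippet below is a hypothetical check, assuming Juman++ 2.0.0-rc3 and rhoknp are installed and the added file is importable from the working directory; the printed segmentation is illustrative. Note that `prepare_for_tokenization` is invoked by `PreTrainedTokenizer.tokenize`, so the same segmentation runs automatically when the tokenizer is called on raw text.

```python
# Hypothetical check of the Juman++ word-segmentation step in isolation;
# assumes Juman++ 2.0.0-rc3 and rhoknp are installed and that the added
# module is on the Python path.
from tokenization_deberta_v2_jumanpp import JumanppTokenizer

juman = JumanppTokenizer()

# JumanppTokenizer.tokenize joins Juman++ morpheme surfaces with spaces, i.e. the
# whitespace-delimited form that DebertaV2Tokenizer's sentencepiece model expects.
segmented = juman.tokenize("京都大学で自然言語処理をする。")
print(segmented)  # e.g. "京都 大学 で 自然 言語 処理 を する 。" (illustrative output)
```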
tokenization_deberta_v2_jumanpp_fast.py
ADDED
@@ -0,0 +1,64 @@
+import copy
+
+from tokenizers import NormalizedString, PreTokenizedString, normalizers, pre_tokenizers
+from transformers import DebertaV2TokenizerFast
+
+
+class DebertaV2JumanppTokenizerFast(DebertaV2TokenizerFast):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.juman_normalizer = normalizers.Sequence(
+            [
+                # cf. https://github.com/ku-nlp/rhoknp/blob/v1.3.0/src/rhoknp/units/sentence.py#L36
+                normalizers.Replace("\r", ""),
+                normalizers.Replace("\n", ""),
+                # cf. https://github.com/ku-nlp/jumanpp/blob/v2.0.0-rc3/src/jumandic/shared/juman_format.cc#L44-L61
+                normalizers.Replace("\t", "\\t"),
+                normalizers.Replace(" ", "　"),
+                normalizers.Replace('"', "”"),
+                normalizers.Replace("<", "＜"),
+                normalizers.Replace(">", "＞"),
+            ]
+        )
+        self.juman_pre_tokenizer = pre_tokenizers.PreTokenizer.custom(JumanppPreTokenizer())
+
+        self.default_normalizer = copy.deepcopy(self.backend_tokenizer.normalizer)
+        self.default_pre_tokenizer = copy.deepcopy(self.backend_tokenizer.pre_tokenizer)
+
+        self.backend_tokenizer.normalizer = normalizers.Sequence(
+            [self.juman_normalizer, self.backend_tokenizer.normalizer]
+        )
+        self.backend_tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [self.juman_pre_tokenizer, self.backend_tokenizer.pre_tokenizer]
+        )
+
+    def save_pretrained(self, *args, **kwargs):
+        self.backend_tokenizer.normalizer = self.default_normalizer
+        self.backend_tokenizer.pre_tokenizer = self.default_pre_tokenizer
+        super().save_pretrained(*args, **kwargs)
+
+        self.backend_tokenizer.normalizer = normalizers.Sequence(
+            [self.juman_normalizer, self.backend_tokenizer.normalizer]
+        )
+        self.backend_tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [self.juman_pre_tokenizer, self.backend_tokenizer.pre_tokenizer]
+        )
+
+
+class JumanppPreTokenizer:
+    def __init__(self):
+        try:
+            import rhoknp
+        except ImportError:
+            raise ImportError(
+                "You need to install rhoknp to use JumanppPreTokenizer. "
+                "See https://github.com/ku-nlp/rhoknp for installation."
+            )
+        self.juman = rhoknp.Jumanpp()
+
+    def pre_tokenize(self, pretok: PreTokenizedString):
+        pretok.split(self.jumanpp_split)
+
+    def jumanpp_split(self, i: int, normalized_string: NormalizedString) -> list[NormalizedString]:
+        offsets = [morpheme.span for morpheme in self.juman.apply_to_sentence(str(normalized_string)).morphemes]
+        return [normalized_string[offset[0]:offset[1]] for offset in offsets]
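The `save_pretrained` override exists because a `pre_tokenizers.PreTokenizer.custom(...)` wrapper around a Python object cannot be serialized into `tokenizer.json`; the default (serializable) normalizer and pre-tokenizer are swapped back in for the save and the Juman++ components re-attached afterwards. A rough round-trip sketch, assuming Juman++ 2.0.0-rc3 and rhoknp are installed and this file is importable (the save path is illustrative):

```python
# Rough round-trip sketch; assumes the added module is importable and
# Juman++ 2.0.0-rc3 plus rhoknp are installed. The save path is illustrative.
from tokenization_deberta_v2_jumanpp_fast import DebertaV2JumanppTokenizerFast

tokenizer = DebertaV2JumanppTokenizerFast.from_pretrained("ku-nlp/deberta-v2-base-japanese")

# The custom pre-tokenizer runs Juman++ before the sentencepiece pre-tokenizer,
# so raw text can be passed directly.
encoding = tokenizer("京都大学で自然言語処理をする。")
print(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))

# Saving works because save_pretrained temporarily restores the default
# normalizer/pre-tokenizer before serializing, then re-attaches the Juman++ ones.
tokenizer.save_pretrained("./deberta-v2-base-japanese-local")
```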
tokenizer_config.json
CHANGED
@@ -10,6 +10,12 @@
   "sp_model_kwargs": {},
   "special_tokens_map_file": null,
   "split_by_punct": false,
-  "tokenizer_class": "
-  "unk_token": "[UNK]"
+  "tokenizer_class": "DebertaV2JumanppTokenizer",
+  "unk_token": "[UNK]",
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_deberta_v2_jumanpp.DebertaV2JumanppTokenizer",
+      "tokenization_deberta_v2_jumanpp_fast.DebertaV2JumanppTokenizerFast"
+    ]
+  }
 }
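For context on the `auto_map` entry: the two-element `AutoTokenizer` list maps to the slow and fast classes in that order, and with `trust_remote_code=True` the hub loader imports them from the two added files. A small sketch, assuming the PR is merged and Juman++ 2.0.0-rc3 plus rhoknp are installed:

```python
# Sketch of how the auto_map entry is resolved; assumes the PR is merged and
# Juman++ 2.0.0-rc3 plus rhoknp are installed.
from transformers import AutoTokenizer

# AutoTokenizer picks the fast class by default and the slow one with use_fast=False.
fast_tokenizer = AutoTokenizer.from_pretrained("ku-nlp/deberta-v2-base-japanese", trust_remote_code=True)
slow_tokenizer = AutoTokenizer.from_pretrained(
    "ku-nlp/deberta-v2-base-japanese", trust_remote_code=True, use_fast=False
)

print(type(fast_tokenizer).__name__)  # DebertaV2JumanppTokenizerFast
print(type(slow_tokenizer).__name__)  # DebertaV2JumanppTokenizer
```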