Update README.md, tokenization_deberta_v2_jumanpp.py, tokenization_deberta_v2_jumanpp_fast.py
README.md
CHANGED
````diff
@@ -29,8 +29,8 @@ You can use this model for masked language modeling as follows:
 
 ```python
 from transformers import AutoTokenizer, AutoModelForMaskedLM
-tokenizer = AutoTokenizer.from_pretrained('ku-nlp/deberta-v2-base-japanese', trust_remote_code=True)
-model = AutoModelForMaskedLM.from_pretrained('ku-nlp/deberta-v2-base-japanese')
+tokenizer = AutoTokenizer.from_pretrained('ku-nlp/deberta-v2-base-japanese-with-auto-jumanpp', trust_remote_code=True)
+model = AutoModelForMaskedLM.from_pretrained('ku-nlp/deberta-v2-base-japanese-with-auto-jumanpp')
 
 sentence = '京都大学で自然言語処理を[MASK]する。'
 encoding = tokenizer(sentence, return_tensors='pt')
@@ -41,9 +41,8 @@ You can also fine-tune this model on downstream tasks.
 
 ## Tokenization
 
-
-
-UPDATE: The input text is internally segmented by [Juman++](https://github.com/ku-nlp/jumanpp) within `DebertaV2JumanppTokenizer(Fast)`, so there's no need to segment it in advance. To use `DebertaV2JumanppTokenizer(Fast)`, you need to install [Juman++ 2.0.0-rc3](https://github.com/ku-nlp/jumanpp/releases/tag/v2.0.0-rc3) and [rhoknp](https://github.com/ku-nlp/rhoknp).
+The input text is internally segmented by [Juman++](https://github.com/ku-nlp/jumanpp) within `DebertaV2JumanppTokenizer` or `DebertaV2JumanppTokenizerFast`, so there's no need to segment it in advance.
+To use `DebertaV2JumanppTokenizer` or `DebertaV2JumanppTokenizerFast`, you need to install [Juman++ 2.0.0-rc3](https://github.com/ku-nlp/jumanpp/releases/tag/v2.0.0-rc3) and [rhoknp](https://github.com/ku-nlp/rhoknp).
 
 ## Training data
````
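Since the README now states that segmentation happens inside the tokenizer, a quick way to see it is to tokenize raw, unsegmented Japanese. A minimal sketch (Juman++ 2.0.0-rc3 and rhoknp must be installed; the subword output depends on the model's sentencepiece vocabulary, so no exact tokens are shown):

```python
from transformers import AutoTokenizer

# trust_remote_code=True loads the Juman++-aware tokenizer shipped in this repo.
tokenizer = AutoTokenizer.from_pretrained(
    'ku-nlp/deberta-v2-base-japanese-with-auto-jumanpp', trust_remote_code=True
)

# Raw, unsegmented text: Juman++ segmentation happens inside the tokenizer.
print(tokenizer.tokenize('京都大学で自然言語処理を研究する。'))
```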
tokenization_deberta_v2_jumanpp.py
CHANGED
```diff
@@ -24,7 +24,7 @@ class JumanppTokenizer:
             "You need to install rhoknp to use JumanppPreTokenizer. "
             "See https://github.com/ku-nlp/rhoknp for installation."
         )
-        self.
+        self.jumanpp = rhoknp.Jumanpp()
 
     def tokenize(self, text: str) -> str:
-        return " ".join([morpheme.surf for morpheme in self.
+        return " ".join([morpheme.surf for morpheme in self.jumanpp.apply_to_sentence(text).morphemes])
```
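What `tokenize` does after this change can be reproduced with rhoknp alone. A minimal sketch, assuming the `jumanpp` binary (2.0.0-rc3) is on `PATH`; the segmentation shown in the comment is illustrative:

```python
import rhoknp

# rhoknp drives the locally installed Juman++ binary.
jumanpp = rhoknp.Jumanpp()

# Same recipe as JumanppTokenizer.tokenize: morpheme surfaces joined by spaces.
sentence = jumanpp.apply_to_sentence('京都大学で自然言語処理を研究する。')
print(' '.join(morpheme.surf for morpheme in sentence.morphemes))
# e.g. 京都 大学 で 自然 言語 処理 を 研究 する 。
```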
tokenization_deberta_v2_jumanpp_fast.py
CHANGED
```diff
@@ -1,4 +1,5 @@
 import copy
+from typing import List
 
 from tokenizers import NormalizedString, PreTokenizedString, normalizers, pre_tokenizers
 from transformers import DebertaV2TokenizerFast
@@ -54,11 +55,11 @@ class JumanppPreTokenizer:
             "You need to install rhoknp to use JumanppPreTokenizer. "
             "See https://github.com/ku-nlp/rhoknp for installation."
         )
-        self.
+        self.jumanpp = rhoknp.Jumanpp()
 
     def pre_tokenize(self, pretok: PreTokenizedString):
         pretok.split(self.jumanpp_split)
 
-    def jumanpp_split(self, i: int, normalized_string: NormalizedString) ->
-        offsets = [morpheme.span for morpheme in self.
+    def jumanpp_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
+        offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_sentence(str(normalized_string)).morphemes]
         return [normalized_string[offset[0]:offset[1]] for offset in offsets]
```
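The added `from typing import List` suggests the return annotation moved from the built-in generic to `typing.List`, keeping the file importable on Python versions older than 3.9. The hook itself follows the `tokenizers` custom pre-tokenizer protocol: `pre_tokenize` hands each input to `jumanpp_split`, which returns `NormalizedString` slices for the morpheme spans. The same protocol can be exercised without Juman++; a minimal sketch with a hypothetical stand-in splitter (the class name and the comma rule are illustrative, not part of this repo):

```python
from typing import List

from tokenizers import NormalizedString, PreTokenizedString, pre_tokenizers


class CommaPreTokenizer:
    """Hypothetical stand-in for JumanppPreTokenizer: same interface,
    but splits on the ideographic comma instead of calling Juman++."""

    def pre_tokenize(self, pretok: PreTokenizedString):
        pretok.split(self.comma_split)

    def comma_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
        return normalized_string.split('、', behavior='removed')


pre_tokenizer = pre_tokenizers.PreTokenizer.custom(CommaPreTokenizer())
print(pre_tokenizer.pre_tokenize_str('京都、大阪、奈良'))
# e.g. [('京都', (0, 2)), ('大阪', (3, 5)), ('奈良', (6, 8))]
```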