README.md CHANGED
@@ -14,7 +14,7 @@ metrics:
  - accuracy
  mask_token: "[MASK]"
  widget:
- - text: "京都 大学 で 自然 言語 処理 を [MASK] する 。"
+ - text: "京都大学で自然言語処理を[MASK]する。"
  ---
 
  # Model Card for Japanese DeBERTa V2 base
@@ -29,10 +29,10 @@ You can use this model for masked language modeling as follows:
 
  ```python
  from transformers import AutoTokenizer, AutoModelForMaskedLM
- tokenizer = AutoTokenizer.from_pretrained('ku-nlp/deberta-v2-base-japanese')
+ tokenizer = AutoTokenizer.from_pretrained('ku-nlp/deberta-v2-base-japanese', trust_remote_code=True)
  model = AutoModelForMaskedLM.from_pretrained('ku-nlp/deberta-v2-base-japanese')
 
- sentence = '京都 大学 で 自然 言語 処理 を [MASK] する 。' # input should be segmented into words by Juman++ in advance
+ sentence = '京都大学で自然言語処理を[MASK]する。'
  encoding = tokenizer(sentence, return_tensors='pt')
  ...
  ```
@@ -41,7 +41,9 @@ You can also fine-tune this model on downstream tasks.
 
  ## Tokenization
 
- The input text should be segmented into words by [Juman++](https://github.com/ku-nlp/jumanpp) in advance. [Juman++ 2.0.0-rc3](https://github.com/ku-nlp/jumanpp/releases/tag/v2.0.0-rc3) was used for pre-training. Each word is tokenized into subwords by [sentencepiece](https://github.com/google/sentencepiece).
+ ~~The input text should be segmented into words by [Juman++](https://github.com/ku-nlp/jumanpp) in advance. [Juman++ 2.0.0-rc3](https://github.com/ku-nlp/jumanpp/releases/tag/v2.0.0-rc3) was used for pre-training. Each word is tokenized into subwords by [sentencepiece](https://github.com/google/sentencepiece).~~
+
+ UPDATE: The input text is internally segmented by [Juman++](https://github.com/ku-nlp/jumanpp) within `DebertaV2JumanppTokenizer(Fast)`, so there's no need to segment it in advance. To use `DebertaV2JumanppTokenizer(Fast)`, you need to install [Juman++ 2.0.0-rc3](https://github.com/ku-nlp/jumanpp/releases/tag/v2.0.0-rc3) and [rhoknp](https://github.com/ku-nlp/rhoknp).
 
  ## Training data
 
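To show what the `...` in the updated README snippet elides, here is a minimal, non-authoritative sketch of filling the `[MASK]` token from raw, unsegmented text. It assumes Juman++ 2.0.0-rc3 and rhoknp are installed and that the checkpoint is loaded with `trust_remote_code=True` as above; the decoded subword is illustrative, not guaranteed.

```python
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained('ku-nlp/deberta-v2-base-japanese', trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained('ku-nlp/deberta-v2-base-japanese')

# Raw text: Juman++ word segmentation now happens inside the tokenizer.
sentence = '京都大学で自然言語処理を[MASK]する。'
encoding = tokenizer(sentence, return_tensors='pt')

with torch.no_grad():
    logits = model(**encoding).logits

# Find the [MASK] position and decode the highest-scoring subword for it.
mask_positions = (encoding['input_ids'][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
predicted_ids = logits[0, mask_positions].argmax(dim=-1)
print(tokenizer.decode(predicted_ids))
```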
 
tokenization_deberta_v2_jumanpp.py ADDED
@@ -0,0 +1,30 @@
+ from transformers import DebertaV2Tokenizer
+ 
+ 
+ class DebertaV2JumanppTokenizer(DebertaV2Tokenizer):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.juman_tokenizer = JumanppTokenizer()
+ 
+     def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs) -> tuple[str, dict]:
+         text = self.juman_tokenizer.tokenize(text)
+ 
+         add_prefix_space = kwargs.pop("add_prefix_space", False)
+         if is_split_into_words or add_prefix_space:
+             text = " " + text
+         return (text, kwargs)
+ 
+ 
+ class JumanppTokenizer:
+     def __init__(self):
+         try:
+             import rhoknp
+         except ImportError:
+             raise ImportError(
+                 "You need to install rhoknp to use JumanppTokenizer. "
+                 "See https://github.com/ku-nlp/rhoknp for installation."
+             )
+         self.juman = rhoknp.Jumanpp()
+ 
+     def tokenize(self, text: str) -> str:
+         return " ".join([morpheme.surf for morpheme in self.juman.apply_to_sentence(text).morphemes])
tokenization_deberta_v2_jumanpp_fast.py ADDED
@@ -0,0 +1,64 @@
+ import copy
+ 
+ from tokenizers import NormalizedString, PreTokenizedString, normalizers, pre_tokenizers
+ from transformers import DebertaV2TokenizerFast
+ 
+ 
+ class DebertaV2JumanppTokenizerFast(DebertaV2TokenizerFast):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.juman_normalizer = normalizers.Sequence(
+             [
+                 # cf. https://github.com/ku-nlp/rhoknp/blob/v1.3.0/src/rhoknp/units/sentence.py#L36
+                 normalizers.Replace("\r", ""),
+                 normalizers.Replace("\n", ""),
+                 # cf. https://github.com/ku-nlp/jumanpp/blob/v2.0.0-rc3/src/jumandic/shared/juman_format.cc#L44-L61
+                 normalizers.Replace("\t", "\\t"),
+                 normalizers.Replace(" ", "　"),
+                 normalizers.Replace('"', "”"),
+                 normalizers.Replace("<", "＜"),
+                 normalizers.Replace(">", "＞"),
+             ]
+         )
+         self.juman_pre_tokenizer = pre_tokenizers.PreTokenizer.custom(JumanppPreTokenizer())
+ 
+         self.default_normalizer = copy.deepcopy(self.backend_tokenizer.normalizer)
+         self.default_pre_tokenizer = copy.deepcopy(self.backend_tokenizer.pre_tokenizer)
+ 
+         self.backend_tokenizer.normalizer = normalizers.Sequence(
+             [self.juman_normalizer, self.backend_tokenizer.normalizer]
+         )
+         self.backend_tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+             [self.juman_pre_tokenizer, self.backend_tokenizer.pre_tokenizer]
+         )
+ 
+     def save_pretrained(self, *args, **kwargs):
+         self.backend_tokenizer.normalizer = self.default_normalizer
+         self.backend_tokenizer.pre_tokenizer = self.default_pre_tokenizer
+         super().save_pretrained(*args, **kwargs)
+ 
+         self.backend_tokenizer.normalizer = normalizers.Sequence(
+             [self.juman_normalizer, self.backend_tokenizer.normalizer]
+         )
+         self.backend_tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+             [self.juman_pre_tokenizer, self.backend_tokenizer.pre_tokenizer]
+         )
+ 
+ 
+ class JumanppPreTokenizer:
+     def __init__(self):
+         try:
+             import rhoknp
+         except ImportError:
+             raise ImportError(
+                 "You need to install rhoknp to use JumanppPreTokenizer. "
+                 "See https://github.com/ku-nlp/rhoknp for installation."
+             )
+         self.juman = rhoknp.Jumanpp()
+ 
+     def pre_tokenize(self, pretok: PreTokenizedString):
+         pretok.split(self.jumanpp_split)
+ 
+     def jumanpp_split(self, i: int, normalized_string: NormalizedString) -> list[NormalizedString]:
+         offsets = [morpheme.span for morpheme in self.juman.apply_to_sentence(str(normalized_string)).morphemes]
+         return [normalized_string[offset[0]:offset[1]] for offset in offsets]
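Two points in the fast tokenizer above are easy to miss: the `juman_normalizer` reproduces Juman++'s own escaping (tabs, spaces, quotes, and angle brackets are mapped to the characters its output format expects), and `save_pretrained` temporarily restores the stock normalizer and pre-tokenizer, presumably because a `pre_tokenizers.PreTokenizer.custom(...)` component cannot be serialized into `tokenizer.json`. A hedged usage sketch under those assumptions (`local_dir` is an illustrative path, and Juman++ 2.0.0-rc3 plus rhoknp are assumed installed):

```python
from transformers import AutoTokenizer

# use_fast=True (the default) resolves to DebertaV2JumanppTokenizerFast via auto_map.
tokenizer = AutoTokenizer.from_pretrained('ku-nlp/deberta-v2-base-japanese', trust_remote_code=True)

# Saving writes the default (non-custom) normalizer/pre-tokenizer to disk; the
# override then re-attaches the Juman++ components to the in-memory tokenizer.
tokenizer.save_pretrained('local_dir')
print(tokenizer.tokenize('京都大学で自然言語処理をする。'))  # still segmented via Juman++
```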
tokenizer_config.json CHANGED
@@ -10,6 +10,12 @@
  "sp_model_kwargs": {},
  "special_tokens_map_file": null,
  "split_by_punct": false,
- "tokenizer_class": "DebertaV2Tokenizer",
- "unk_token": "[UNK]"
+ "tokenizer_class": "DebertaV2JumanppTokenizer",
+ "unk_token": "[UNK]",
+ "auto_map": {
+   "AutoTokenizer": [
+     "tokenization_deberta_v2_jumanpp.DebertaV2JumanppTokenizer",
+     "tokenization_deberta_v2_jumanpp_fast.DebertaV2JumanppTokenizerFast"
+   ]
+ }
  }
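The `auto_map` entry follows the usual `transformers` convention: the first element of the `AutoTokenizer` pair is the slow class and the second is the fast class, both loaded from the repository's own files when `trust_remote_code=True`. A brief, hedged sketch of how the two classes are selected (again assuming Juman++ 2.0.0-rc3 and rhoknp are installed):

```python
from transformers import AutoTokenizer

repo = 'ku-nlp/deberta-v2-base-japanese'

# First auto_map entry: tokenization_deberta_v2_jumanpp.DebertaV2JumanppTokenizer
slow = AutoTokenizer.from_pretrained(repo, trust_remote_code=True, use_fast=False)

# Second auto_map entry: tokenization_deberta_v2_jumanpp_fast.DebertaV2JumanppTokenizerFast
fast = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)

text = '京都大学で自然言語処理をする。'
print(type(slow).__name__, slow.tokenize(text))
print(type(fast).__name__, fast.tokenize(text))
```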