liwii committed
Commit 02b287e
Parent: bcbdf6d

Update distilbert_japanese_tokenizer.py


As [discussed in the community](https://huggingface.co/line-corporation/line-distilbert-base-japanese/discussions/3), the current tokenizer code does not work with `transformers>=4.34` because of the [tokenizer refactoring](https://github.com/huggingface/transformers/pull/23909) introduced in that version.

With that refactoring, `PreTrainedTokenizer.__init__()` started to access `get_vocab()`, so `self.subword_tokenizer_type` needs to be initialized before `super().__init__()` is called in `DistilBertJapaneseTokenizer.__init__()`. For the same reason, the subword tokenizers are now constructed with `str(unk_token)` instead of `self.unk_token`, since `self.unk_token` only becomes available once `super().__init__()` has run.
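Below is a minimal, self-contained sketch of the failure mode; the class bodies are hypothetical stand-ins, not the actual `transformers` source:

```python
# Hypothetical, simplified sketch of the init-order issue (not the real
# transformers classes).

class PreTrainedTokenizer:
    def __init__(self, **kwargs):
        # As of transformers>=4.34, __init__ calls get_vocab()
        # (e.g. while registering special/added tokens).
        self.get_vocab()


class DistilBertJapaneseTokenizer(PreTrainedTokenizer):
    def __init__(self, subword_tokenizer_type="sentencepiece", **kwargs):
        # Old order: super().__init__() runs first, so get_vocab() is
        # invoked before self.subword_tokenizer_type is set.
        super().__init__(**kwargs)
        self.subword_tokenizer_type = subword_tokenizer_type

    def get_vocab(self):
        # Depends on an attribute that __init__ sets *after* super().__init__().
        if self.subword_tokenizer_type == "sentencepiece":
            return {}
        return {}


try:
    DistilBertJapaneseTokenizer()
except AttributeError as e:
    # 'DistilBertJapaneseTokenizer' object has no attribute 'subword_tokenizer_type'
    print(e)
```

The fix is to set up all attributes first and call `super().__init__()` last, which is exactly what the diff below does.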

This issue is already fixed in `transformers` itself by [2da8853](https://github.com/huggingface/transformers/commit/2da8853775b61cde0894dee17c6c713aba711688); this PR basically follows that change.

I confirmed that the updated tokenizer works with [my repository](https://huggingface.co/liwii/line-distilbert-base-japanese-fork), forked from line-corporation/line-distilbert-base-japanese.
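A minimal sketch of such a check (the exact environment is an assumption: `transformers>=4.34` plus the tokenizer's extra dependencies, e.g. `sentencepiece` and `fugashi`, installed):

```python
# Quick verification that the custom tokenizer loads and tokenizes
# under transformers>=4.34.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "liwii/line-distilbert-base-japanese-fork",
    trust_remote_code=True,  # load the custom tokenizer code from the repo
)
print(tokenizer.tokenize("こんにちは、世界!"))  # should run without errors
```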

Files changed (1):
  distilbert_japanese_tokenizer.py (+22, -22)
distilbert_japanese_tokenizer.py CHANGED

@@ -170,25 +170,6 @@ class DistilBertJapaneseTokenizer(PreTrainedTokenizer):
         jumanpp_kwargs=None,
         **kwargs
     ):
-        super().__init__(
-            spm_file=spm_file,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            do_lower_case=do_lower_case,
-            do_word_tokenize=do_word_tokenize,
-            do_subword_tokenize=do_subword_tokenize,
-            word_tokenizer_type=word_tokenizer_type,
-            subword_tokenizer_type=subword_tokenizer_type,
-            never_split=never_split,
-            mecab_kwargs=mecab_kwargs,
-            sudachi_kwargs=sudachi_kwargs,
-            jumanpp_kwargs=jumanpp_kwargs,
-            **kwargs,
-        )
-
         if subword_tokenizer_type == "sentencepiece":
             if not os.path.isfile(spm_file):
                 raise ValueError(
@@ -236,14 +217,33 @@ class DistilBertJapaneseTokenizer(PreTrainedTokenizer):
         self.subword_tokenizer_type = subword_tokenizer_type
         if do_subword_tokenize:
             if subword_tokenizer_type == "wordpiece":
-                self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+                self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
             elif subword_tokenizer_type == "character":
-                self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+                self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=str(unk_token))
             elif subword_tokenizer_type == "sentencepiece":
-                self.subword_tokenizer = SentencepieceTokenizer(vocab=self.spm_file, unk_token=self.unk_token)
+                self.subword_tokenizer = SentencepieceTokenizer(vocab=self.spm_file, unk_token=str(unk_token))
             else:
                 raise ValueError(f"Invalid subword_tokenizer_type '{subword_tokenizer_type}' is specified.")
 
+        super().__init__(
+            spm_file=spm_file,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            do_lower_case=do_lower_case,
+            do_word_tokenize=do_word_tokenize,
+            do_subword_tokenize=do_subword_tokenize,
+            word_tokenizer_type=word_tokenizer_type,
+            subword_tokenizer_type=subword_tokenizer_type,
+            never_split=never_split,
+            mecab_kwargs=mecab_kwargs,
+            sudachi_kwargs=sudachi_kwargs,
+            jumanpp_kwargs=jumanpp_kwargs,
+            **kwargs,
+        )
+
     @property
     def do_lower_case(self):
         return self.lower_case