ryo0634 committed on
Commit
9dd709e
1 Parent(s): 0afd48f

Upload tokenizer

distilbert_japanese_tokenizer.py ADDED
@@ -0,0 +1,835 @@
# coding=utf-8

# Copyright 2023 LINE Corporation.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Almost copied from [transformers.BertJapaneseTokenizer](https://github.com/huggingface/transformers/blob/v4.26.1/src/transformers/models/bert_japanese/tokenization_bert_japanese.py#)
# This code is distributed under the Apache License 2.0.

"""Tokenization classes."""


import collections
import copy
import os
import unicodedata
from typing import Any, Dict, List, Optional, Tuple

from transformers.tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from transformers.utils import is_sentencepiece_available, logging

try:
    import sentencepiece as spm
except ModuleNotFoundError as error:
    raise error.__class__(
        "The sentencepiece library is not installed. "
        "See https://github.com/google/sentencepiece for installation."
    )


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "spm_file": "spiece.model"}

SPIECE_UNDERLINE = "▁"

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "cl-tohoku/bert-base-japanese": "https://huggingface.co/cl-tohoku/bert-base-japanese/resolve/main/vocab.txt",
        "cl-tohoku/bert-base-japanese-whole-word-masking": (
            "https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/resolve/main/vocab.txt"
        ),
        "cl-tohoku/bert-base-japanese-char": (
            "https://huggingface.co/cl-tohoku/bert-base-japanese-char/resolve/main/vocab.txt"
        ),
        "cl-tohoku/bert-base-japanese-char-whole-word-masking": (
            "https://huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking/resolve/main/vocab.txt"
        ),
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "cl-tohoku/bert-base-japanese": 512,
    "cl-tohoku/bert-base-japanese-whole-word-masking": 512,
    "cl-tohoku/bert-base-japanese-char": 512,
    "cl-tohoku/bert-base-japanese-char-whole-word-masking": 512,
}

PRETRAINED_INIT_CONFIGURATION = {
    "cl-tohoku/bert-base-japanese": {
        "do_lower_case": False,
        "word_tokenizer_type": "mecab",
        "subword_tokenizer_type": "wordpiece",
    },
    "cl-tohoku/bert-base-japanese-whole-word-masking": {
        "do_lower_case": False,
        "word_tokenizer_type": "mecab",
        "subword_tokenizer_type": "wordpiece",
    },
    "cl-tohoku/bert-base-japanese-char": {
        "do_lower_case": False,
        "word_tokenizer_type": "mecab",
        "subword_tokenizer_type": "character",
    },
    "cl-tohoku/bert-base-japanese-char-whole-word-masking": {
        "do_lower_case": False,
        "word_tokenizer_type": "mecab",
        "subword_tokenizer_type": "character",
    },
}


# Copied from transformers.models.bert.tokenization_bert.load_vocab
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab


# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens

class DistilBertJapaneseTokenizer(PreTrainedTokenizer):
    r"""
    Construct a BERT tokenizer for Japanese text.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer
    to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to a one-wordpiece-per-line vocabulary file.
        spm_file (`str`, *optional*):
            Path to [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm or .model
            extension) that contains the vocabulary.
        do_lower_case (`bool`, *optional*, defaults to `False`):
            Whether to lower case the input. Only has an effect when do_word_tokenize=True.
        do_word_tokenize (`bool`, *optional*, defaults to `True`):
            Whether to do word tokenization.
        do_subword_tokenize (`bool`, *optional*, defaults to `True`):
            Whether to do subword tokenization.
        word_tokenizer_type (`str`, *optional*, defaults to `"basic"`):
            Type of word tokenizer. Choose from ["basic", "mecab", "sudachi", "jumanpp"].
        subword_tokenizer_type (`str`, *optional*, defaults to `"wordpiece"`):
            Type of subword tokenizer. Choose from ["wordpiece", "character", "sentencepiece"].
        mecab_kwargs (`dict`, *optional*):
            Dictionary passed to the `MecabTokenizer` constructor.
        sudachi_kwargs (`dict`, *optional*):
            Dictionary passed to the `SudachiTokenizer` constructor.
        jumanpp_kwargs (`dict`, *optional*):
            Dictionary passed to the `JumanppTokenizer` constructor.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        spm_file=None,
        do_lower_case=False,
        do_word_tokenize=True,
        do_subword_tokenize=True,
        word_tokenizer_type="basic",
        subword_tokenizer_type="wordpiece",
        never_split=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        mecab_kwargs=None,
        sudachi_kwargs=None,
        jumanpp_kwargs=None,
        **kwargs
    ):
        super().__init__(
            spm_file=spm_file,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            do_lower_case=do_lower_case,
            do_word_tokenize=do_word_tokenize,
            do_subword_tokenize=do_subword_tokenize,
            word_tokenizer_type=word_tokenizer_type,
            subword_tokenizer_type=subword_tokenizer_type,
            never_split=never_split,
            mecab_kwargs=mecab_kwargs,
            sudachi_kwargs=sudachi_kwargs,
            jumanpp_kwargs=jumanpp_kwargs,
            **kwargs,
        )

        if subword_tokenizer_type == "sentencepiece":
            if not os.path.isfile(spm_file):
                raise ValueError(
                    f"Can't find a vocabulary file at path '{spm_file}'. To load the vocabulary from a Google"
                    " pretrained model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
                )
            self.spm_file = spm_file
        else:
            if not os.path.isfile(vocab_file):
                raise ValueError(
                    f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google"
                    " pretrained model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
                )
            self.vocab = load_vocab(vocab_file)
            self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])

        self.do_word_tokenize = do_word_tokenize
        self.word_tokenizer_type = word_tokenizer_type
        self.lower_case = do_lower_case
        self.never_split = never_split
        self.mecab_kwargs = copy.deepcopy(mecab_kwargs)
        self.sudachi_kwargs = copy.deepcopy(sudachi_kwargs)
        self.jumanpp_kwargs = copy.deepcopy(jumanpp_kwargs)
        if do_word_tokenize:
            if word_tokenizer_type == "basic":
                self.word_tokenizer = BasicTokenizer(
                    do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=False
                )
            elif word_tokenizer_type == "mecab":
                self.word_tokenizer = MecabTokenizer(
                    do_lower_case=do_lower_case, never_split=never_split, **(mecab_kwargs or {})
                )
            elif word_tokenizer_type == "sudachi":
                self.word_tokenizer = SudachiTokenizer(
                    do_lower_case=do_lower_case, never_split=never_split, **(sudachi_kwargs or {})
                )
            elif word_tokenizer_type == "jumanpp":
                self.word_tokenizer = JumanppTokenizer(
                    do_lower_case=do_lower_case, never_split=never_split, **(jumanpp_kwargs or {})
                )
            else:
                raise ValueError(f"Invalid word_tokenizer_type '{word_tokenizer_type}' is specified.")

        self.do_subword_tokenize = do_subword_tokenize
        self.subword_tokenizer_type = subword_tokenizer_type
        if do_subword_tokenize:
            if subword_tokenizer_type == "wordpiece":
                self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
            elif subword_tokenizer_type == "character":
                self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token)
            elif subword_tokenizer_type == "sentencepiece":
                self.subword_tokenizer = SentencepieceTokenizer(vocab=self.spm_file, unk_token=self.unk_token)
            else:
                raise ValueError(f"Invalid subword_tokenizer_type '{subword_tokenizer_type}' is specified.")

    @property
    def do_lower_case(self):
        return self.lower_case

    def __getstate__(self):
        state = dict(self.__dict__)
        if self.word_tokenizer_type in ["mecab", "sudachi", "jumanpp"]:
            del state["word_tokenizer"]
        return state

    def __setstate__(self, state):
        self.__dict__ = state
        if self.word_tokenizer_type == "mecab":
            self.word_tokenizer = MecabTokenizer(
                do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.mecab_kwargs or {})
            )
        elif self.word_tokenizer_type == "sudachi":
            self.word_tokenizer = SudachiTokenizer(
                do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.sudachi_kwargs or {})
            )
        elif self.word_tokenizer_type == "jumanpp":
            self.word_tokenizer = JumanppTokenizer(
                do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.jumanpp_kwargs or {})
            )

    def _tokenize(self, text):
        if self.do_word_tokenize:
            tokens = self.word_tokenizer.tokenize(text, never_split=self.all_special_tokens)
        else:
            tokens = [text]

        if self.do_subword_tokenize:
            split_tokens = [sub_token for token in tokens for sub_token in self.subword_tokenizer.tokenize(token)]
        else:
            split_tokens = tokens

        return split_tokens

    @property
    def vocab_size(self):
        if self.subword_tokenizer_type == "sentencepiece":
            return len(self.subword_tokenizer.sp_model)
        return len(self.vocab)

    def get_vocab(self):
        if self.subword_tokenizer_type == "sentencepiece":
            vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
            vocab.update(self.added_tokens_encoder)
            return vocab
        return dict(self.vocab, **self.added_tokens_encoder)

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        if self.subword_tokenizer_type == "sentencepiece":
            return self.subword_tokenizer.sp_model.PieceToId(token)
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        if self.subword_tokenizer_type == "sentencepiece":
            return self.subword_tokenizer.sp_model.IdToPiece(index)
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings) into a single string."""
        if self.subword_tokenizer_type == "sentencepiece":
            return self.subword_tokenizer.sp_model.decode(tokens)
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.create_token_type_ids_from_sequences
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
        pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if os.path.isdir(save_directory):
            if self.subword_tokenizer_type == "sentencepiece":
                vocab_file = os.path.join(
                    save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["spm_file"]
                )
            else:
                vocab_file = os.path.join(
                    save_directory,
                    (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"],
                )
        else:
            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory

        if self.subword_tokenizer_type == "sentencepiece":
            with open(vocab_file, "wb") as writer:
                content_spiece_model = self.subword_tokenizer.sp_model.serialized_model_proto()
                writer.write(content_spiece_model)
        else:
            with open(vocab_file, "w", encoding="utf-8") as writer:
                index = 0
                for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                    if index != token_index:
                        logger.warning(
                            f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                            " Please check that the vocabulary is not corrupted!"
                        )
                        index = token_index
                    writer.write(token + "\n")
                    index += 1
        return (vocab_file,)


class MecabTokenizer:
    """Runs basic tokenization with MeCab morphological parser."""

    def __init__(
        self,
        do_lower_case=False,
        never_split=None,
        normalize_text=True,
        mecab_dic: Optional[str] = "unidic_lite",
        mecab_option: Optional[str] = None,
    ):
        """
        Constructs a MecabTokenizer.

        Args:
            **do_lower_case**: (*optional*) boolean (default False)
                Whether to lowercase the input.
            **never_split**: (*optional*) list of str
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
            **normalize_text**: (*optional*) boolean (default True)
                Whether to apply unicode normalization to text before tokenization.
            **mecab_dic**: (*optional*) string (default "unidic_lite")
                Name of dictionary to be used for MeCab initialization. If you are using a system-installed dictionary,
                set this option to `None` and modify *mecab_option*.
            **mecab_option**: (*optional*) string
                String passed to MeCab constructor.
        """
        self.do_lower_case = do_lower_case
        self.never_split = never_split if never_split is not None else []
        self.normalize_text = normalize_text

        try:
            import fugashi
        except ModuleNotFoundError as error:
            raise error.__class__(
                "You need to install fugashi to use MecabTokenizer. "
                "See https://pypi.org/project/fugashi/ for installation."
            )

        mecab_option = mecab_option or ""

        if mecab_dic is not None:
            if mecab_dic == "unidic_lite":
                try:
                    import unidic_lite
                except ModuleNotFoundError as error:
                    raise error.__class__(
                        "The unidic_lite dictionary is not installed. "
                        "See https://github.com/polm/unidic-lite for installation."
                    )

                dic_dir = unidic_lite.DICDIR
            else:
                raise ValueError("Invalid mecab_dic is specified.")

            mecabrc = os.path.join(dic_dir, "mecabrc")
            mecab_option = f'-d "{dic_dir}" -r "{mecabrc}" ' + mecab_option

        self.mecab = fugashi.GenericTagger(mecab_option)

    def tokenize(self, text, never_split=None, **kwargs):
        """Tokenizes a piece of text."""
        if self.normalize_text:
            text = unicodedata.normalize("NFKC", text)

        never_split = self.never_split + (never_split if never_split is not None else [])
        tokens = []

        for word in self.mecab(text):
            token = word.surface

            if self.do_lower_case and token not in never_split:
                token = token.lower()

            tokens.append(token)

        return tokens


class CharacterTokenizer:
    """Runs Character tokenization."""

    def __init__(self, vocab, unk_token, normalize_text=True):
        """
        Constructs a CharacterTokenizer.

        Args:
            **vocab**:
                Vocabulary object.
            **unk_token**: str
                A special symbol for out-of-vocabulary token.
            **normalize_text**: (`optional`) boolean (default True)
                Whether to apply unicode normalization to text before tokenization.
        """
        self.vocab = vocab
        self.unk_token = unk_token
        self.normalize_text = normalize_text

    def tokenize(self, text):
        """
        Tokenizes a piece of text into characters.

        For example, `input = "apple"` will return as output `["a", "p", "p", "l", "e"]`.

        Args:
            text: A single token or whitespace separated tokens.
                This should have already been passed through *BasicTokenizer*.

        Returns:
            A list of characters.
        """
        if self.normalize_text:
            text = unicodedata.normalize("NFKC", text)

        output_tokens = []
        for char in text:
            if char not in self.vocab:
                output_tokens.append(self.unk_token)
                continue

            output_tokens.append(char)

        return output_tokens


# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
class BasicTokenizer(object):
    """
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
    """

    def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
        if never_split is None:
            never_split = []
        self.do_lower_case = do_lower_case
        self.never_split = set(never_split)
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents

    def tokenize(self, text, never_split=None):
        """
        Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
        WordPieceTokenizer.

        Args:
            never_split (`List[str]`, *optional*)
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
        """
        # union() returns a new set by concatenating the two sets.
        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
        text = self._clean_text(text)

        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia.).
        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)
        orig_tokens = whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            if token not in never_split:
                if self.do_lower_case:
                    token = token.lower()
                    if self.strip_accents is not False:
                        token = self._run_strip_accents(token)
                elif self.strip_accents:
                    token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token, never_split))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        if never_split is not None and text in never_split:
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like all of the other languages.
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)  #
            or (cp >= 0x20000 and cp <= 0x2A6DF)  #
            or (cp >= 0x2A700 and cp <= 0x2B73F)  #
            or (cp >= 0x2B740 and cp <= 0x2B81F)  #
            or (cp >= 0x2B820 and cp <= 0x2CEAF)  #
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)  #
        ):  #
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)


# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """
        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
        tokenization using the given vocabulary.

        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through *BasicTokenizer*.

        Returns:
            A list of wordpiece tokens.
        """

        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens


class SentencepieceTokenizer(object):
    """
    Runs sentencepiece tokenization. Based on transformers.models.albert.tokenization_albert.AlbertTokenizer.
    """

    def __init__(
        self,
        vocab,
        unk_token,
        do_lower_case=False,
        remove_space=True,
        keep_accents=True,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
    ):
        self.vocab = vocab
        self.unk_token = unk_token
        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab)

    def preprocess_text(self, inputs):
        if self.remove_space:
            outputs = " ".join(inputs.strip().split())
        else:
            outputs = inputs
        outputs = outputs.replace("``", '"').replace("''", '"')

        if not self.keep_accents:
            outputs = unicodedata.normalize("NFKD", outputs)
            outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
        if self.do_lower_case:
            outputs = outputs.lower()

        return outputs

    def tokenize(self, text):
        """
        Tokenizes text by sentencepiece. Based on [SentencePiece](https://github.com/google/sentencepiece).
        Tokenization needs the given vocabulary.

        Args:
            text: A string to be tokenized.

        Returns:
            A list of sentencepiece tokens.
        """
        text = self.preprocess_text(text)
        pieces = self.sp_model.encode(text, out_type=str)
        new_pieces = []
        for piece in pieces:
            if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
                    if len(cur_pieces[0]) == 1:
                        cur_pieces = cur_pieces[1:]
                    else:
                        cur_pieces[0] = cur_pieces[0][1:]
                cur_pieces.append(piece[-1])
                new_pieces.extend(cur_pieces)
            else:
                new_pieces.append(piece)

        return new_pieces
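
The tokenizer above chains a word-level tokenizer (MeCab via fugashi) with a subword tokenizer (SentencePiece, as configured in tokenizer_config.json below). A minimal sketch of that two-stage pipeline using only the classes defined in this file, assuming fugashi, unidic-lite, and sentencepiece are installed and that the repository's spiece.model is available locally; the sample sentence is illustrative:

    from distilbert_japanese_tokenizer import MecabTokenizer, SentencepieceTokenizer

    word_tokenizer = MecabTokenizer(mecab_dic="unidic_lite")
    subword_tokenizer = SentencepieceTokenizer(vocab="spiece.model", unk_token="<unk>")

    words = word_tokenizer.tokenize("日本語のテキストを処理します。")   # MeCab word segmentation
    pieces = [p for w in words for p in subword_tokenizer.tokenize(w)]  # SentencePiece split per word
    print(pieces)

This mirrors what DistilBertJapaneseTokenizer._tokenize does internally when word_tokenizer_type="mecab" and subword_tokenizer_type="sentencepiece".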
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
{
  "bos_token": "[CLS]",
  "cls_token": "[CLS]",
  "eos_token": "[SEP]",
  "mask_token": {
    "content": "[MASK]",
    "lstrip": true,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "<pad>",
  "sep_token": "[SEP]",
  "unk_token": "<unk>"
}
spiece.model ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bcfafc8c0662d9c8f39621a64c74260f2ad120310c8dd24886de2dddaf599b4e
size 439391
tokenizer_config.json ADDED
@@ -0,0 +1,40 @@
{
  "auto_map": {
    "AutoTokenizer": [
      "distilbert_japanese_tokenizer.DistilBertJapaneseTokenizer",
      null
    ]
  },
  "bos_token": "[CLS]",
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_lower_case": true,
  "do_subword_tokenize": true,
  "do_word_tokenize": true,
  "eos_token": "[SEP]",
  "jumanpp_kwargs": null,
  "keep_accents": true,
  "mask_token": {
    "__type": "AddedToken",
    "content": "[MASK]",
    "lstrip": true,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "mecab_kwargs": {
    "mecab_dic": "unidic_lite"
  },
  "model_max_length": 1000000000000000019884624838656,
  "never_split": null,
  "pad_token": "<pad>",
  "remove_space": true,
  "sep_token": "[SEP]",
  "subword_tokenizer_type": "sentencepiece",
  "sudachi_kwargs": null,
  "tokenize_chinese_chars": false,
  "tokenizer_class": "DistilBertJapaneseTokenizer",
  "unk_token": "<unk>",
  "verbose": false,
  "word_tokenizer_type": "mecab"
}
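
Once these files are pushed to a Hub repository, the auto_map entry above lets AutoTokenizer resolve the custom class from the uploaded module. A minimal loading sketch, where "<namespace>/<repo-name>" is a placeholder repository id not confirmed by this commit; trust_remote_code=True is required so that distilbert_japanese_tokenizer.DistilBertJapaneseTokenizer is imported from the repository:

    from transformers import AutoTokenizer

    # placeholder repo id; replace with the actual repository this commit belongs to
    tokenizer = AutoTokenizer.from_pretrained("<namespace>/<repo-name>", trust_remote_code=True)
    print(tokenizer.tokenize("こんにちは、世界。"))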