Commit 4117b7f (parent: f01bdac) by singletongue

Add model files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ entity_vocab.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "<ent2>": 32769,
+   "<ent>": 32768
+ }
config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "_name_or_path": "cl-tohoku/bert-base-japanese-v3",
+   "architectures": [
+     "LukeForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bert_model_name": "cl-tohoku/bert-base-japanese-v3",
+   "bos_token_id": null,
+   "classifier_dropout": null,
+   "cls_entity_prediction": false,
+   "entity_emb_size": 256,
+   "entity_vocab_size": 591699,
+   "eos_token_id": null,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "luke",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.30.2",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "use_entity_aware_attention": true,
+   "vocab_size": 32770
+ }
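Taken together, this configuration describes a LUKE-style, entity-aware masked language model whose word encoder is initialized from `cl-tohoku/bert-base-japanese-v3` (12 layers, hidden size 768) and whose entity embedding table holds 591,699 entries of size 256. A minimal inspection sketch follows; the path is a placeholder for a local clone of this repository or its Hub repo id:

```python
from transformers import AutoConfig

# Placeholder: point at a local clone of this repository or its Hub repo id.
config = AutoConfig.from_pretrained("path/to/this-repository")

print(config.model_type)                  # "luke"
print(config.vocab_size)                  # 32770 word pieces (32768 + <ent> and <ent2>)
print(config.entity_vocab_size)           # 591699 entries in entity_vocab.json
print(config.entity_emb_size)             # 256-dimensional entity embeddings
print(config.use_entity_aware_attention)  # True
```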
entity_vocab.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:be6327e7cafc2f2b5f694a594d57113fd2bf6b620c592929202f75683b18b67d
+ size 23721849
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:12fc608cd4f1662905c6e025fea20ca90f8494fa93a5c1f7c825ed41220ef2e7
+ size 1143901513
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "additional_special_tokens": [
+     {
+       "content": "<ent>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<ent2>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false
+     }
+   ],
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tokenization_luke_bert_japanese.py ADDED
@@ -0,0 +1,420 @@
+ # coding=utf-8
+ # Copyright Studio-Ouisa and The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Tokenization classes for LUKE."""
+
+ import collections
+ import copy
+ import json
+ import os
+ from typing import List, Optional, Tuple
+
+ from transformers.models.bert_japanese.tokenization_bert_japanese import (
+     BasicTokenizer,
+     CharacterTokenizer,
+     JumanppTokenizer,
+     MecabTokenizer,
+     SentencepieceTokenizer,
+     SudachiTokenizer,
+     WordpieceTokenizer,
+     load_vocab,
+ )
+ from transformers.models.luke import LukeTokenizer
+ from transformers.tokenization_utils_base import AddedToken
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+ EntitySpan = Tuple[int, int]
+ EntitySpanInput = List[EntitySpan]
+ Entity = str
+ EntityInput = List[Entity]
+
+ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "entity_vocab_file": "entity_vocab.json"}
+
+ PRETRAINED_VOCAB_FILES_MAP = {"vocab_file": {}, "entity_vocab_file": {}}
+
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
+
+
+ class LukeBertJapaneseTokenizer(LukeTokenizer):
+     vocab_files_names = VOCAB_FILES_NAMES
+     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+     model_input_names = ["input_ids", "attention_mask"]
+
+     def __init__(
+         self,
+         vocab_file,
+         entity_vocab_file,
+         spm_file=None,
+         task=None,
+         max_entity_length=32,
+         max_mention_length=30,
+         entity_token_1="<ent>",
+         entity_token_2="<ent2>",
+         entity_unk_token="[UNK]",
+         entity_pad_token="[PAD]",
+         entity_mask_token="[MASK]",
+         entity_mask2_token="[MASK2]",
+         do_lower_case=False,
+         do_word_tokenize=True,
+         do_subword_tokenize=True,
+         word_tokenizer_type="basic",
+         subword_tokenizer_type="wordpiece",
+         never_split=None,
+         unk_token="[UNK]",
+         sep_token="[SEP]",
+         pad_token="[PAD]",
+         cls_token="[CLS]",
+         mask_token="[MASK]",
+         mecab_kwargs=None,
+         sudachi_kwargs=None,
+         jumanpp_kwargs=None,
+         **kwargs,
+     ):
+         # We call the grandparent's init, not the parent's.
+         super(LukeTokenizer, self).__init__(
+             spm_file=spm_file,
+             unk_token=unk_token,
+             sep_token=sep_token,
+             pad_token=pad_token,
+             cls_token=cls_token,
+             mask_token=mask_token,
+             do_lower_case=do_lower_case,
+             do_word_tokenize=do_word_tokenize,
+             do_subword_tokenize=do_subword_tokenize,
+             word_tokenizer_type=word_tokenizer_type,
+             subword_tokenizer_type=subword_tokenizer_type,
+             never_split=never_split,
+             mecab_kwargs=mecab_kwargs,
+             sudachi_kwargs=sudachi_kwargs,
+             jumanpp_kwargs=jumanpp_kwargs,
+             task=task,
+             max_entity_length=32,
+             max_mention_length=30,
+             entity_token_1="<ent>",
+             entity_token_2="<ent2>",
+             entity_unk_token=entity_unk_token,
+             entity_pad_token=entity_pad_token,
+             entity_mask_token=entity_mask_token,
+             entity_mask2_token=entity_mask2_token,
+             **kwargs,
+         )
+
+         if subword_tokenizer_type == "sentencepiece":
+             if not os.path.isfile(spm_file):
+                 raise ValueError(
+                     f"Can't find a vocabulary file at path '{spm_file}'. To load the vocabulary from a Google"
+                     " pretrained model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+                 )
+             self.spm_file = spm_file
+         else:
+             if not os.path.isfile(vocab_file):
+                 raise ValueError(
+                     f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google"
+                     " pretrained model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+                 )
+             self.vocab = load_vocab(vocab_file)
+             self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
+
+         self.do_word_tokenize = do_word_tokenize
+         self.word_tokenizer_type = word_tokenizer_type
+         self.lower_case = do_lower_case
+         self.never_split = never_split
+         self.mecab_kwargs = copy.deepcopy(mecab_kwargs)
+         self.sudachi_kwargs = copy.deepcopy(sudachi_kwargs)
+         self.jumanpp_kwargs = copy.deepcopy(jumanpp_kwargs)
+         if do_word_tokenize:
+             if word_tokenizer_type == "basic":
+                 self.word_tokenizer = BasicTokenizer(
+                     do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=False
+                 )
+             elif word_tokenizer_type == "mecab":
+                 self.word_tokenizer = MecabTokenizer(
+                     do_lower_case=do_lower_case, never_split=never_split, **(mecab_kwargs or {})
+                 )
+             elif word_tokenizer_type == "sudachi":
+                 self.word_tokenizer = SudachiTokenizer(
+                     do_lower_case=do_lower_case, never_split=never_split, **(sudachi_kwargs or {})
+                 )
+             elif word_tokenizer_type == "jumanpp":
+                 self.word_tokenizer = JumanppTokenizer(
+                     do_lower_case=do_lower_case, never_split=never_split, **(jumanpp_kwargs or {})
+                 )
+             else:
+                 raise ValueError(f"Invalid word_tokenizer_type '{word_tokenizer_type}' is specified.")
+
+         self.do_subword_tokenize = do_subword_tokenize
+         self.subword_tokenizer_type = subword_tokenizer_type
+         if do_subword_tokenize:
+             if subword_tokenizer_type == "wordpiece":
+                 self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+             elif subword_tokenizer_type == "character":
+                 self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+             elif subword_tokenizer_type == "sentencepiece":
+                 self.subword_tokenizer = SentencepieceTokenizer(vocab=self.spm_file, unk_token=self.unk_token)
+             else:
+                 raise ValueError(f"Invalid subword_tokenizer_type '{subword_tokenizer_type}' is specified.")
+
+         # we add 2 special tokens for downstream tasks
+         # for more information about lstrip and rstrip, see https://github.com/huggingface/transformers/pull/2778
+         entity_token_1 = (
+             AddedToken(entity_token_1, lstrip=False, rstrip=False)
+             if isinstance(entity_token_1, str)
+             else entity_token_1
+         )
+         entity_token_2 = (
+             AddedToken(entity_token_2, lstrip=False, rstrip=False)
+             if isinstance(entity_token_2, str)
+             else entity_token_2
+         )
+         kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
+         kwargs["additional_special_tokens"] += [entity_token_1, entity_token_2]
+
+         with open(entity_vocab_file, encoding="utf-8") as entity_vocab_handle:
+             self.entity_vocab = json.load(entity_vocab_handle)
+         for entity_special_token in [entity_unk_token, entity_pad_token, entity_mask_token, entity_mask2_token]:
+             if entity_special_token not in self.entity_vocab:
+                 raise ValueError(
+                     f"Specified entity special token ``{entity_special_token}`` is not found in entity_vocab. "
+                     f"Probably an incorrect entity vocab file is loaded: {entity_vocab_file}."
+                 )
+         self.entity_unk_token_id = self.entity_vocab[entity_unk_token]
+         self.entity_pad_token_id = self.entity_vocab[entity_pad_token]
+         self.entity_mask_token_id = self.entity_vocab[entity_mask_token]
+         self.entity_mask2_token_id = self.entity_vocab[entity_mask2_token]
+
+         self.task = task
+         if task is None or task == "entity_span_classification":
+             self.max_entity_length = max_entity_length
+         elif task == "entity_classification":
+             self.max_entity_length = 1
+         elif task == "entity_pair_classification":
+             self.max_entity_length = 2
+         else:
+             raise ValueError(
+                 f"Task {task} not supported. Select task from ['entity_classification', 'entity_pair_classification',"
+                 " 'entity_span_classification'] only."
+             )
+
+         self.max_mention_length = max_mention_length
+
+     @property
+     # Copied from BertJapaneseTokenizer
+     def do_lower_case(self):
+         return self.lower_case
+
+     # Copied from BertJapaneseTokenizer
+     def __getstate__(self):
+         state = dict(self.__dict__)
+         if self.word_tokenizer_type in ["mecab", "sudachi", "jumanpp"]:
+             del state["word_tokenizer"]
+         return state
+
+     # Copied from BertJapaneseTokenizer
+     def __setstate__(self, state):
+         self.__dict__ = state
+         if self.word_tokenizer_type == "mecab":
+             self.word_tokenizer = MecabTokenizer(
+                 do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.mecab_kwargs or {})
+             )
+         elif self.word_tokenizer_type == "sudachi":
+             self.word_tokenizer = SudachiTokenizer(
+                 do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.sudachi_kwargs or {})
+             )
+         elif self.word_tokenizer_type == "jumanpp":
+             self.word_tokenizer = JumanppTokenizer(
+                 do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.jumanpp_kwargs or {})
+             )
+
+     # Copied from BertJapaneseTokenizer
+     def _tokenize(self, text):
+         if self.do_word_tokenize:
+             tokens = self.word_tokenizer.tokenize(text, never_split=self.all_special_tokens)
+         else:
+             tokens = [text]
+
+         if self.do_subword_tokenize:
+             split_tokens = [sub_token for token in tokens for sub_token in self.subword_tokenizer.tokenize(token)]
+         else:
+             split_tokens = tokens
+
+         return split_tokens
+
+     @property
+     # Copied from BertJapaneseTokenizer
+     def vocab_size(self):
+         if self.subword_tokenizer_type == "sentencepiece":
+             return len(self.subword_tokenizer.sp_model)
+         return len(self.vocab)
+
+     # Copied from BertJapaneseTokenizer
+     def get_vocab(self):
+         if self.subword_tokenizer_type == "sentencepiece":
+             vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+             vocab.update(self.added_tokens_encoder)
+             return vocab
+         return dict(self.vocab, **self.added_tokens_encoder)
+
+     # Copied from BertJapaneseTokenizer
+     def _convert_token_to_id(self, token):
+         """Converts a token (str) in an id using the vocab."""
+         if self.subword_tokenizer_type == "sentencepiece":
+             return self.subword_tokenizer.sp_model.PieceToId(token)
+         return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+     # Copied from BertJapaneseTokenizer
+     def _convert_id_to_token(self, index):
+         """Converts an index (integer) in a token (str) using the vocab."""
+         if self.subword_tokenizer_type == "sentencepiece":
+             return self.subword_tokenizer.sp_model.IdToPiece(index)
+         return self.ids_to_tokens.get(index, self.unk_token)
+
+     # Copied from BertJapaneseTokenizer
+     def convert_tokens_to_string(self, tokens):
+         """Converts a sequence of tokens (string) in a single string."""
+         if self.subword_tokenizer_type == "sentencepiece":
+             return self.subword_tokenizer.sp_model.decode(tokens)
+         out_string = " ".join(tokens).replace(" ##", "").strip()
+         return out_string
+
+     # Copied from BertJapaneseTokenizer
+     def build_inputs_with_special_tokens(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+         adding special tokens. A BERT sequence has the following format:
+
+         - single sequence: `[CLS] X [SEP]`
+         - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs to which the special tokens will be added.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+         """
+         if token_ids_1 is None:
+             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+         cls = [self.cls_token_id]
+         sep = [self.sep_token_id]
+         return cls + token_ids_0 + sep + token_ids_1 + sep
+
+     # Copied from BertJapaneseTokenizer
+     def get_special_tokens_mask(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+     ) -> List[int]:
+         """
+         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+         special tokens using the tokenizer `prepare_for_model` method.
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+             already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                 Whether or not the token list is already formatted with special tokens for the model.
+
+         Returns:
+             `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+         """
+
+         if already_has_special_tokens:
+             return super().get_special_tokens_mask(
+                 token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+             )
+
+         if token_ids_1 is not None:
+             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+         return [1] + ([0] * len(token_ids_0)) + [1]
+
+     # Copied from BertJapaneseTokenizer
+     def create_token_type_ids_from_sequences(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
+         pair mask has the following format:
+
+         ```
+         0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+         | first sequence | second sequence |
+         ```
+
+         If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+         """
+         sep = [self.sep_token_id]
+         cls = [self.cls_token_id]
+         if token_ids_1 is None:
+             return len(cls + token_ids_0 + sep) * [0]
+         return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+     def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+         return (text, kwargs)
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         if os.path.isdir(save_directory):
+             if self.subword_tokenizer_type == "sentencepiece":
+                 vocab_file = os.path.join(
+                     save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["spm_file"]
+                 )
+             else:
+                 vocab_file = os.path.join(
+                     save_directory,
+                     (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"],
+                 )
+         else:
+             vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
+
+         if self.subword_tokenizer_type == "sentencepiece":
+             with open(vocab_file, "wb") as writer:
+                 content_spiece_model = self.subword_tokenizer.sp_model.serialized_model_proto()
+                 writer.write(content_spiece_model)
+         else:
+             with open(vocab_file, "w", encoding="utf-8") as writer:
+                 index = 0
+                 for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+                     if index != token_index:
+                         logger.warning(
+                             f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+                             " Please check that the vocabulary is not corrupted!"
+                         )
+                         index = token_index
+                     writer.write(token + "\n")
+                     index += 1
+
+         entity_vocab_file = os.path.join(
+             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["entity_vocab_file"]
+         )
+
+         with open(entity_vocab_file, "w", encoding="utf-8") as f:
+             f.write(json.dumps(self.entity_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+         return vocab_file, entity_vocab_file
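The tokenizer above couples BertJapaneseTokenizer's word segmentation (MeCab in this repository's configuration) and WordPiece subword splitting with LukeTokenizer's entity-span handling. A minimal usage sketch, assuming the repository is available at the placeholder path and that `fugashi` and `unidic-lite` are installed for MeCab:

```python
from transformers import AutoTokenizer

# trust_remote_code=True lets AutoTokenizer import LukeBertJapaneseTokenizer
# from tokenization_luke_bert_japanese.py in this repository.
tokenizer = AutoTokenizer.from_pretrained("path/to/this-repository", trust_remote_code=True)

text = "東京は日本の首都です。"
# Character-level (start, end) span of the mention "東京" in `text`.
encoding = tokenizer(text, entity_spans=[(0, 2)], return_tensors="pt")

print(encoding["input_ids"])  # word-piece ids: MeCab word split, then WordPiece
# One entity slot per span; when no explicit entities are given, LukeTokenizer
# fills the slot with the entity-level [MASK] id from entity_vocab.json.
print(encoding["entity_ids"])
```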
tokenizer_config.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_luke_bert_japanese.LukeBertJapaneseTokenizer",
+       null
+     ]
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_lower_case": false,
+   "do_subword_tokenize": true,
+   "do_word_tokenize": true,
+   "entity_mask2_token": "[MASK2]",
+   "entity_mask_token": "[MASK]",
+   "entity_pad_token": "[PAD]",
+   "entity_token_1": "<ent>",
+   "entity_token_2": "<ent2>",
+   "entity_unk_token": "[UNK]",
+   "jumanpp_kwargs": null,
+   "mask_token": "[MASK]",
+   "max_entity_length": 32,
+   "max_mention_length": 30,
+   "mecab_kwargs": {
+     "mecab_dic": "unidic_lite"
+   },
+   "model_max_length": 512,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "spm_file": null,
+   "subword_tokenizer_type": "wordpiece",
+   "sudachi_kwargs": null,
+   "task": null,
+   "tokenizer_class": "LukeBertJapaneseTokenizer",
+   "unk_token": "[UNK]",
+   "word_tokenizer_type": "mecab"
+ }
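The `auto_map` entry is what ties the custom tokenizer class to the `AutoTokenizer` API, while `config.json` already routes `AutoModelForMaskedLM` to `LukeForMaskedLM`. A hedged loading sketch, with the repository path again a placeholder for a local clone or the Hub repo id:

```python
from transformers import AutoModelForMaskedLM, AutoTokenizer

repo = "path/to/this-repository"  # placeholder: local clone or Hub repo id

# trust_remote_code=True is required so AutoTokenizer follows "auto_map" and loads
# LukeBertJapaneseTokenizer from tokenization_luke_bert_japanese.py.
tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(repo)  # LukeForMaskedLM per config.json

print(type(tokenizer).__name__)                              # LukeBertJapaneseTokenizer
print(tokenizer.convert_tokens_to_ids(["<ent>", "<ent2>"]))  # [32768, 32769], as in added_tokens.json
```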
vocab.txt ADDED
The diff for this file is too large to render. See raw diff