p-s committed on
Commit
0419fec
·
1 Parent(s): 78e84bd

Initial release

Browse files
README.md ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - ja
4
+ license: mit
5
+ tags:
6
+ - bart
7
+ - pytorch
8
+ datasets:
9
+ - wikipedia
10
+ ---
11
+ # bart-base-japanese
12
+
13
+ This model was converted from the original [Japanese BART Pretrained model](https://nlp.ist.i.kyoto-u.ac.jp/?BART%E6%97%A5%E6%9C%AC%E8%AA%9EPretrained%E3%83%A2%E3%83%87%E3%83%AB) released by Kyoto University.
14
+
15
+ Both the encoder and decoder outputs are identical to the original Fairseq model.
16
+
17
+ ### How to use the model
18
+
19
+ The input text should be tokenized by [BartJapaneseTokenizer](https://huggingface.co/Formzu/bart-base-japanese/blob/main/tokenization_bart_japanese.py).
20
+
21
+ Tokenizer requirements:
22
+ * [Juman++](https://github.com/ku-nlp/jumanpp)
23
+ * [zenhan](https://pypi.org/project/zenhan/)
24
+ * [pyknp](https://pypi.org/project/pyknp/)
25
+ * [sentencepiece](https://pypi.org/project/sentencepiece/)
26
+
27
+ #### Simple FillMaskPipeline
28
+ ```python
29
+ from transformers import AutoModelForSeq2SeqLM, pipeline
30
+ from tokenization_bart_japanese import BartJapaneseTokenizer
31
+
32
+ model_name = "Formzu/bart-base-japanese"
33
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
34
+ tokenizer = BartJapaneseTokenizer.from_pretrained(model_name)
35
+
36
+ masked_text = "天気が<mask>から散歩しましょう。"
37
+
38
+ fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
39
+ out = fill_mask(masked_text)
40
+ print(out)
41
+ # [{'score': 0.19255658984184265, 'token': 1718, 'token_str': 'よく', 'sequence': '天気 が よく から 散歩 し ましょう 。'},
42
+ # {'score': 0.14426815509796143, 'token': 5478, 'token_str': '良く', 'sequence': '天気 が 良く から 散歩 し ましょう 。'},
43
+ # {'score': 0.05554169788956642, 'token': 6561, 'token_str': '悪い', 'sequence': '天気 が 悪い から 散歩 し ましょう 。'},
44
+ # {'score': 0.05524599179625511, 'token': 3553, 'token_str': '良い', 'sequence': '天気 が 良い から 散歩 し ましょう 。'},
45
+ # {'score': 0.03720080852508545, 'token': 1370, 'token_str': '良', 'sequence': '天気 が 良 から 散歩 し ましょう 。'}]
46
+ ```
47
+ #### Text Generation
48
+ ```python
49
+ from transformers import AutoModelForSeq2SeqLM
50
+ from tokenization_bart_japanese import BartJapaneseTokenizer
51
+ import torch
52
+
53
+ device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
54
+
55
+ model_name = "Formzu/bart-base-japanese"
56
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
57
+ tokenizer = BartJapaneseTokenizer.from_pretrained(model_name)
58
+
59
+ masked_text = "天気が<mask>から散歩しましょう。"
60
+
61
+ inp = tokenizer(masked_text, return_tensors='pt').to(device)
62
+
63
+ out = model.generate(**inp, num_beams=1, min_length=0, max_length=20, early_stopping=True, no_repeat_ngram_size=2)
64
+ res = "".join(tokenizer.decode(out.squeeze(0).tolist(), skip_special_tokens=True).split(" "))
65
+ print(res)
66
+ # 天気がよくなってから散歩しましょう。天気のよく合っているところにいる
67
+ ```
68
+
69
+ ### Framework versions
70
+
71
+ - Transformers 4.21.2
72
+ - PyTorch 1.12.1+cu116
73
+ - Tokenizers 0.12.1
config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "bart-base-japanese",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "add_final_layer_norm": true,
6
+ "architectures": [
7
+ "MBartForConditionalGeneration"
8
+ ],
9
+ "attention_dropout": 0.0,
10
+ "bos_token_id": 0,
11
+ "classifier_dropout": 0.0,
12
+ "d_model": 768,
13
+ "decoder_attention_heads": 12,
14
+ "decoder_ffn_dim": 3072,
15
+ "decoder_layerdrop": 0.0,
16
+ "decoder_layers": 6,
17
+ "dropout": 0.0,
18
+ "encoder_attention_heads": 12,
19
+ "encoder_ffn_dim": 3072,
20
+ "encoder_layerdrop": 0.0,
21
+ "encoder_layers": 6,
22
+ "eos_token_id": 2,
23
+ "forced_eos_token_id": 2,
24
+ "id2label": {
25
+ "0": "LABEL_0",
26
+ "1": "LABEL_1",
27
+ "2": "LABEL_2"
28
+ },
29
+ "init_std": 0.02,
30
+ "is_encoder_decoder": true,
31
+ "label2id": {
32
+ "LABEL_0": 0,
33
+ "LABEL_1": 1,
34
+ "LABEL_2": 2
35
+ },
36
+ "max_position_embeddings": 1024,
37
+ "model_type": "mbart",
38
+ "num_hidden_layers": 6,
39
+ "pad_token_id": 1,
40
+ "scale_embedding": false,
41
+ "torch_dtype": "float32",
42
+ "transformers_version": "4.21.2",
43
+ "use_cache": true,
44
+ "vocab_size": 32002
45
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74367555daf7ab2ed66635f49f84bde097db57d11cb0c6c293410645a0f3f34f
3
+ size 501801969
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff9226612d029bfade0621f401cb605740dc0a8ca88400e89ffdce26702ee266
3
+ size 588767
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenization_bart_japanese.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ from contextlib import contextmanager
18
+ from shutil import copyfile
19
+ from typing import Any, Dict, List, Optional, Tuple
20
+
21
+ import sentencepiece as spm
22
+
23
+ from transformers import AddedToken, PreTrainedTokenizer
24
+ from transformers import logging
25
+
26
+
27
+ logger = logging.get_logger(__name__)
28
+
29
# Marker character that prefixes word-initial pieces in the SentencePiece vocabulary.
SPIECE_UNDERLINE = "▁"

# Maps the tokenizer's constructor argument name to the vocabulary file name on disk.
VOCAB_FILES_NAMES = dict(vocab_file="sentencepiece.bpe.model")

# Hub checkpoints this tokenizer ships with; every table below is keyed by these ids.
_CHECKPOINTS = ("Formzu/bart-base-japanese", "Formzu/bart-large-japanese")

# Download location of the SentencePiece model for each checkpoint.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        checkpoint: f"https://huggingface.co/{checkpoint}/resolve/main/sentencepiece.bpe.model"
        for checkpoint in _CHECKPOINTS
    }
}

# Number of positional embeddings (maximum input length) for each checkpoint.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {checkpoint: 1024 for checkpoint in _CHECKPOINTS}
49
+
50
+
51
class BartJapaneseTokenizer(PreTrainedTokenizer):
    """
    Construct a BART tokenizer for Japanese text.

    Adapted from [`RobertaTokenizer`], [`XLNetTokenizer`] and [`MBartTokenizer`]. Based on
    [SentencePiece](https://github.com/google/sentencepiece).

    Text is first passed through `zenhan.h2z`, then segmented into morphemes by Juman++ (via `pyknp`), and the
    space-joined morphemes are finally split into sub-word pieces by the SentencePiece model.

    The tokenization method is `<bos> <tokens> <eos>`.

    Examples:

    ```python
    >>> from tokenization_bart_japanese import BartJapaneseTokenizer

    >>> tokenizer = BartJapaneseTokenizer.from_pretrained("Formzu/bart-base-japanese")
    >>> example_japanese_phrase = "今日は晴れています。"
    >>> expected_label = "天気"
    >>> inputs = tokenizer(example_japanese_phrase, return_tensors="pt")
    >>> labels = tokenizer(expected_label, return_tensors="pt")
    >>> inputs["labels"] = labels["input_ids"]
    ```"""

    vocab_files_names = VOCAB_FILES_NAMES
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    model_input_names = ["input_ids", "attention_mask"]

    # Class-level defaults; `set_special_tokens` rebinds both on every instance.
    prefix_tokens: List[int] = []
    suffix_tokens: List[int] = []

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        tokenizer_file=None,
        src_lang=None,
        tgt_lang=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        additional_special_tokens=None,
        **kwargs
    ):
        """
        Load the SentencePiece model and set up the Juman++ / zenhan pre-processing.

        Args:
            vocab_file (`str`):
                Path to the SentencePiece model file.
            bos_token, eos_token, sep_token, cls_token, unk_token, pad_token (`str`):
                Special tokens, forwarded to the base class.
            mask_token (`str` or `AddedToken`):
                Mask token. A plain string is wrapped in an `AddedToken` with `lstrip=True`.
            tokenizer_file:
                Accepted for API compatibility; `None` is always forwarded to the base class.
            src_lang, tgt_lang:
                Forwarded to the base class unchanged; not otherwise used by this class.
            sp_model_kwargs (`dict`, *optional*):
                Keyword arguments for the `SentencePieceProcessor` constructor.
            additional_special_tokens (*optional*):
                Forwarded to the base class.

        Raises:
            ModuleNotFoundError: If `zenhan` or `pyknp` is not installed.
        """
        # The mask token behaves like a normal word, i.e. it includes the space before it
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            tokenizer_file=None,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            additional_special_tokens=additional_special_tokens,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file

        # Optional dependencies are imported lazily so that a missing package produces an actionable message.
        # The original exception class is kept (callers may catch it) and the cause is chained.
        try:
            from zenhan import h2z
        except ModuleNotFoundError as error:
            raise error.__class__(
                "You need to install zenhan to use BartJapaneseTokenizer. "
                "See https://pypi.org/project/zenhan/ for installation."
            ) from error
        try:
            from pyknp import Juman
        except ModuleNotFoundError as error:
            raise error.__class__(
                "You need to install pyknp to use BartJapaneseTokenizer. "
                "See https://pypi.org/project/pyknp/ for installation."
            ) from error

        self.h2z = h2z
        self.jumanpp = Juman()

        # Original fairseq vocab and spm vocab must be "aligned":
        # Vocab    |    0    |    1    |   2    |    3    |   4    |   5    |   6    |   7    |   8    |   9
        # -------- | ------- | ------- | ------ | ------- | ------ | ------ | ------ | ------ | ------ | ------
        # fairseq  | '<s>'   | '<pad>' | '</s>' | '<unk>' | '▁の'  | '▁、'  | '▁。'  | '▁に'  | '▁は'  | '▁を'
        # spm      | '<unk>' | '<s>'   | '</s>' | '▁の'   | '▁、'  | '▁。'  | '▁に'  | '▁は'  | '▁を'  | '▁と'

        # Mimic fairseq token-to-id alignment for the first 4 tokens
        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

        # The first "real" token "▁の" has position 4 in the original fairseq vocab and position 3 in the spm vocab
        self.fairseq_offset = 1

        self.sp_model_size = len(self.sp_model)

        # "<mask>" takes the first id after the (offset-shifted) SentencePiece vocabulary.
        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + self.fairseq_offset
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

        self.set_special_tokens()

    def __getstate__(self):
        """Return picklable state: the SentencePiece processor is replaced by its serialized proto."""
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        # NOTE(review): `jumanpp` is left in the state as-is; confirm that the pyknp `Juman` object can be pickled
        # (in particular after it has been used), e.g. for multi-process dataset mapping.
        return state

    def __setstate__(self, d):
        """Restore state and rebuild the SentencePiece processor from the serialized proto."""
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)

    @property
    def vocab_size(self):
        """Size of the SentencePiece vocabulary plus the fairseq offset plus the mask token."""
        return len(self.sp_model) + self.fairseq_offset + 1  # Plus 1 for the mask token

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        prefix_ones = [1] * len(self.prefix_tokens)
        suffix_ones = [1] * len(self.suffix_tokens)
        if token_ids_1 is None:
            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A Japanese BART sequence has the following format, where `X` represents the sequence:

        - `input_ids` (for encoder) `[bos] X [eos]`
        - `decoder_input_ids`: (for decoder) `[bos] X [eos]`

        Pairs of sequences are not the expected use case, but they will be handled without a separator.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. Japanese BART
        does not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.

        """

        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    def get_vocab(self):
        """Return the token-to-id mapping for all ids below `vocab_size`, updated with the added tokens."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        """
        Split `text` into SentencePiece pieces.

        The text is passed through `h2z`, analysed by Juman++, and the surface forms (`midasi`) of the resulting
        morphemes are joined with single spaces before being encoded by the SentencePiece model.
        """
        full_width = self.h2z(text)
        analysis = self.jumanpp.analysis(full_width)
        segmented = " ".join(mrph.midasi for mrph in analysis.mrph_list())
        return self.sp_model.encode(segmented, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        spm_id = self.sp_model.PieceToId(token)

        # Need to return unknown token if the SP model returned 0
        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.sp_model.IdToPiece(index - self.fairseq_offset)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the SentencePiece model into `save_directory`.

        The existing vocabulary file is copied when it is still on disk (and is not already the target file);
        otherwise the model is re-serialized from the in-memory processor.

        Args:
            save_directory (`str`):
                Existing directory to write into.
            filename_prefix (`str`, *optional*):
                Prefix prepended (followed by `-`) to the vocabulary file name.

        Returns:
            `Tuple[str]`: One-element tuple with the path of the vocabulary file. If `save_directory` is not a
            directory, an error is logged and `None` is returned instead.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def set_special_tokens(self) -> None:
        """Set prefix=[bos], suffix=[eos]."""
        self.prefix_tokens = [self.bos_token_id]
        self.suffix_tokens = [self.eos_token_id]
        # Register every special token with the base class as a special added token.
        self.add_tokens(self.all_special_tokens_extended, special_tokens=True)
tokenizer_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": null,
3
+ "bos_token": "<s>",
4
+ "cls_token": "<s>",
5
+ "eos_token": "</s>",
6
+ "mask_token": {
7
+ "__type": "AddedToken",
8
+ "content": "<mask>",
9
+ "lstrip": true,
10
+ "normalized": true,
11
+ "rstrip": false,
12
+ "single_word": false
13
+ },
14
+ "pad_token": "<pad>",
15
+ "sep_token": "</s>",
16
+ "sp_model_kwargs": {},
17
+ "src_lang": null,
18
+ "tgt_lang": null,
19
+ "tokenizer_class": "BartJapaneseTokenizer",
20
+ "tokenizer_file": null,
21
+ "unk_token": "<unk>"
22
+ }