gabrielandrade2 committed
Commit 13dbcb6
Parent: c837b79

Add tokenizer configuration files

Files changed (4)
  1. config.json +1 -1
  2. special_tokens_map.json +7 -0
  3. tokenizer_config.json +24 -0
  4. vocab.txt +0 -0
config.json CHANGED
@@ -183,7 +183,7 @@
   "num_hidden_layers": 12,
   "pad_token_id": 0,
   "position_embedding_type": "absolute",
-  "tokenizer_class": "BertJapaneseTokenizer",
+  "tokenizer_class": "NER_medNLP.NER_tokenizer_BIO",
   "type_vocab_size": 2,
   "vocab_size": 32000
 }
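
Note: the new tokenizer_class value names a custom class (NER_tokenizer_BIO, defined in the repo's NER_medNLP module) rather than a tokenizer built into the transformers library, so loading it through AutoTokenizer requires opting into remote code. A minimal sketch, with the repo id below as a placeholder and assuming NER_medNLP.py ships alongside the checkpoint:

from transformers import AutoTokenizer

# Placeholder repo id; substitute the actual model repository.
# trust_remote_code=True is needed because NER_tokenizer_BIO lives in
# the repo's own NER_medNLP.py, not inside the transformers library.
tokenizer = AutoTokenizer.from_pretrained(
    "gabrielandrade2/<model-repo>",  # hypothetical placeholder
    trust_remote_code=True,
)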
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}
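
These are the five standard BERT special tokens. Once the tokenizer is loaded they surface as attributes, so a quick sanity check looks like this (assuming the tokenizer object from the loading sketch above):

# Assumes `tokenizer` from the loading sketch above.
assert tokenizer.cls_token == "[CLS]"
assert tokenizer.sep_token == "[SEP]"
assert tokenizer.pad_token == "[PAD]"
assert tokenizer.unk_token == "[UNK]"
assert tokenizer.mask_token == "[MASK]"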
tokenizer_config.json ADDED
@@ -0,0 +1,24 @@
+{
+  "cls_token": "[CLS]",
+  "do_lower_case": false,
+  "do_subword_tokenize": true,
+  "do_word_tokenize": true,
+  "jumanpp_kwargs": null,
+  "mask_token": "[MASK]",
+  "mecab_kwargs": null,
+  "model_max_length": 512,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "special_tokens_map_file": null,
+  "subword_tokenizer_type": "wordpiece",
+  "sudachi_kwargs": null,
+  "tokenizer_class": "NER_tokenizer_BIO",
+  "unk_token": "[UNK]",
+  "word_tokenizer_type": "mecab",
+  "auto_map": [
+    "NER_medNLP.NER_tokenizer_BIO",
+    null
+  ],
+  "num_entity_type": "40"
+}
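
The word_tokenizer_type and subword_tokenizer_type entries mirror BertJapaneseTokenizer's MeCab-plus-WordPiece pipeline, while auto_map tells AutoTokenizer to resolve the class from NER_medNLP.NER_tokenizer_BIO (the null second entry means no fast-tokenizer variant is provided). The num_entity_type value of "40" presumably feeds the BIO tagging scheme the class name implies; under the usual BIO convention that expands to 2 * 40 + 1 = 81 token-level labels. A small sketch of that arithmetic, assuming the standard scheme rather than anything confirmed in the repo code:

# Assumption: standard BIO tagging, as the class name NER_tokenizer_BIO
# suggests; the actual repo code may count labels differently.
num_entity_type = 40
num_labels = 2 * num_entity_type + 1  # B- and I- per type, plus O -> 81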
vocab.txt ADDED
The diff for this file is too large to render. See raw diff