danfu09 committed on
Commit 3fabfd0
1 parent: cb4b136

Allow loading via AutoTokenizer (#3)


- Add bert-base-cased tokenizer with model_max_length: 8192 (fdd629a0b4b0d3543fd2cfe70db482ffdcfe95a7)
- Remove AutoTokenizer from the config (7d7932ee225d71aadc1ba6d16b2fcadba10db703)
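With the tokenizer files committed, the tokenizer now loads directly through AutoTokenizer with no custom code involved. A minimal sketch of the intended usage (the repo id below is a placeholder, not this model's actual Hub path):

```python
from transformers import AutoTokenizer

# Placeholder repo id -- substitute this model's actual Hub path.
tokenizer = AutoTokenizer.from_pretrained("user/model")
print(tokenizer.model_max_length)  # 8192, read from tokenizer_config.json
```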

config.json CHANGED
@@ -9,7 +9,6 @@
   "auto_map": {
     "AutoConfig": "configuration_bert.BertConfig",
     "AutoModelForSequenceClassification": "bert_layers.BertForTextEncoding",
-    "AutoTokenizer": "bert-base-uncased"
   },
   "classifier_dropout": null,
   "gradient_checkpointing": false,
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}
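After loading, these special tokens surface as attributes on the tokenizer (same placeholder repo id):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("user/model")  # placeholder repo id
print(tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token,
      tokenizer.unk_token, tokenizer.mask_token)
# [CLS] [SEP] [PAD] [UNK] [MASK]
```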
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
+{
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": false,
+  "mask_token": "[MASK]",
+  "model_max_length": 8192,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}
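Two settings here do the heavy lifting: model_max_length: 8192 raises the usual 512-token ceiling, and do_lower_case: false makes the tokenizer cased, consistent with the bert-base-cased source. A quick check, again with the placeholder repo id:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("user/model")  # placeholder repo id

# Long inputs are only truncated at 8192 tokens, not the usual 512.
ids = tokenizer("word " * 3000, truncation=True)["input_ids"]
assert 512 < len(ids) <= 8192

# Cased vocabulary: capitalization changes the tokenization.
assert tokenizer.tokenize("Hello") != tokenizer.tokenize("hello")
```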
vocab.txt ADDED
The diff for this file is too large to render. See raw diff
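Since the first sub-commit copies the tokenizer from bert-base-cased, vocab.txt should be identical to that model's WordPiece vocabulary; a sanity check under that assumption:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("user/model")  # placeholder repo id
reference = AutoTokenizer.from_pretrained("bert-base-cased")

# Assumption: the vocabularies match entry-for-entry (~29k WordPiece tokens).
assert tokenizer.get_vocab() == reference.get_vocab()
```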