Commit ff8ab2f by Wietse de Vries
Parent: 9dbd6c0

add missing char tokens to vocab (with embeddings close to [UNK])

Files changed (6):
  1. config.json +7 -2
  2. pytorch_model.bin +2 -2
  3. tf_model.h5 +2 -2
  4. tokenizer.json +0 -0
  5. tokenizer_config.json +9 -2
  6. vocab.txt +74 -1
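The commit message describes the fix: append the character tokens that were missing from the WordPiece vocabulary and initialize their embeddings close to the [UNK] embedding, so the model starts out treating the new characters roughly the way it treated unknown text. The script used for this commit is not included; below is a minimal sketch with the Transformers library, assuming the repo files sit in the current directory, an illustrative subset of the 73 added tokens, and an arbitrary noise scale.

import torch
from transformers import BertForMaskedLM, BertTokenizer

# Illustrative subset of the 73 tokens appended to vocab.txt in this commit.
new_tokens = ["##Q", "##X", "Ç", "Ó", "à"]

# Append the tokens to the WordPiece vocab, one per line. The leading "\n" is
# needed because the old vocab.txt had no trailing newline (see the -/+ ##ör
# pair in the vocab.txt diff below).
with open("vocab.txt", "a", encoding="utf-8") as f:
    f.write("\n" + "\n".join(new_tokens) + "\n")

tokenizer = BertTokenizer.from_pretrained(".")  # picks up the extended vocab.txt
model = BertForMaskedLM.from_pretrained(".")    # checkpoint still has 30000 rows

# Grow the (weight-tied) input/output embedding matrix to the new vocab size.
model.resize_token_embeddings(len(tokenizer))

# Initialize each new row close to the [UNK] embedding: copy that row and add
# a small amount of noise (the 1e-3 scale is an assumption) so the new rows
# are not exactly identical to each other.
with torch.no_grad():
    embeddings = model.get_input_embeddings().weight
    unk = embeddings[tokenizer.unk_token_id].clone()
    for token in new_tokens:
        token_id = tokenizer.convert_tokens_to_ids(token)
        embeddings[token_id] = unk + 1e-3 * torch.randn_like(unk)

model.save_pretrained(".")
tokenizer.save_pretrained(".")

Copying the [UNK] row rather than using random initialization keeps downstream behavior essentially unchanged at first, while making the new characters distinguishable and trainable.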
config.json CHANGED
@@ -1,8 +1,10 @@
 {
+  "_name_or_path": "bert-base-dutch-cased",
   "architectures": [
     "BertForMaskedLM"
   ],
   "attention_probs_dropout_prob": 0.1,
+  "gradient_checkpointing": false,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
   "hidden_size": 768,
@@ -14,6 +16,9 @@
   "num_attention_heads": 12,
   "num_hidden_layers": 12,
   "pad_token_id": 3,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.5.1",
   "type_vocab_size": 2,
-  "vocab_size": 30000
-}
+  "use_cache": true,
+  "vocab_size": 30073
+}
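The vocab_size bump from 30000 to 30073 matches the 73 lines appended to vocab.txt below; the other new keys are ones transformers 4.x writes out when a config is re-saved. A quick consistency check, assuming the repo id GroNLP/bert-base-dutch-cased on the Hub:

from transformers import BertConfig, BertTokenizer

config = BertConfig.from_pretrained("GroNLP/bert-base-dutch-cased")
tokenizer = BertTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")
assert config.vocab_size == len(tokenizer) == 30073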
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0da32020b1799e175f53caddbc6bf250af61d67d5a8c77f4708af6ebe1d03600
-size 438869143
+oid sha256:5ffe408c7eea0ffee4c257c6028f8c98146967e3ac3db51dba8e2bc8a4abddf5
+size 436761702
tf_model.h5 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:686ae10663084e39852406c4abf97edce8dbd6173b79a9c1b68265deb3970f11
-size 532853952
+oid sha256:88cc47b929d21ed816d6ad8d5abea5c06ccae04a5f04f2d6b07da7d212aa18e1
+size 530923844
tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
tokenizer_config.json CHANGED
@@ -1,4 +1,11 @@
 {
   "do_lower_case": false,
-  "max_len": 512
-}
+  "unk_token": "[UNK]",
+  "sep_token": "[SEP]",
+  "pad_token": "[PAD]",
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "tokenize_chinese_chars": true,
+  "strip_accents": null,
+  "model_max_length": 512
+}
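Besides replacing the deprecated max_len key with model_max_length, the new config pins the special tokens explicitly instead of relying on the class defaults. What the tokenizer actually loads can be inspected directly (same assumed repo id as above):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")
print(tokenizer.model_max_length)    # 512
print(tokenizer.special_tokens_map)  # {'unk_token': '[UNK]', 'sep_token': '[SEP]', ...}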
vocab.txt CHANGED
@@ -29997,4 +29997,77 @@ zóó
 ##óók
 ##öl
 ##ön
-##ör
+##ör
+##Q
+##X
+##Ç
+##Ó
+##Ô
+##Ú
+##Û
+##Ü
+##à
+##á
+##â
+##ä
+##ê
+##ì
+##í
+##î
+##ñ
+##ò
+##ô
+##ù
+##ú
+##û
+##ü
+Q
+X
+a
+c
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+x
+y
+Ç
+Ó
+Ô
+Ú
+Û
+Ü
+à
+á
+â
+ä
+è
+é
+ê
+ë
+ì
+í
+î
+ï
+ñ
+ò
+ó
+ô
+ö
+ù
+ú
+û
+ü
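The 73 additions are 23 "##" continuation pieces and 50 word-initial characters. With them in the vocabulary, a word containing one of these characters no longer collapses to a single [UNK]; for example (the exact subword split depends on the rest of the vocabulary):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")
print(tokenizer.tokenize("Çelik"))  # e.g. ['Ç', '##eli', '##k'] instead of ['[UNK]']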