Commit ff8ab2f by Wietse de Vries
Parent: 9dbd6c0

add missing char tokens to vocab (with embeddings close to [UNK])

Files changed (6):
  1. config.json +7 -2
  2. pytorch_model.bin +2 -2
  3. tf_model.h5 +2 -2
  4. tokenizer.json +0 -0
  5. tokenizer_config.json +9 -2
  6. vocab.txt +74 -1
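The commit message describes the fix: append the character tokens that were missing from the WordPiece vocabulary and initialize their embeddings close to the [UNK] embedding, so the model starts out treating the new characters roughly the way it treated unknown text. The script used for this commit is not included; below is a minimal sketch with the Transformers library, assuming the repo files sit in the current directory, an illustrative subset of the 73 added tokens, and an arbitrary noise scale.

import torch
from transformers import BertForMaskedLM, BertTokenizer

# Illustrative subset of the 73 tokens appended to vocab.txt in this commit.
new_tokens = ["##Q", "##X", "Ç", "Ó", "à"]

# Append the tokens to the WordPiece vocab, one per line. The leading "\n" is
# needed because the old vocab.txt had no trailing newline (see the -/+ ##ör
# pair in the vocab.txt diff below).
with open("vocab.txt", "a", encoding="utf-8") as f:
    f.write("\n" + "\n".join(new_tokens) + "\n")

tokenizer = BertTokenizer.from_pretrained(".")  # picks up the extended vocab.txt
model = BertForMaskedLM.from_pretrained(".")    # checkpoint still has 30000 rows

# Grow the (weight-tied) input/output embedding matrix to the new vocab size.
model.resize_token_embeddings(len(tokenizer))

# Initialize each new row close to the [UNK] embedding: copy that row and add
# a small amount of noise (the 1e-3 scale is an assumption) so the new rows
# are not exactly identical to each other.
with torch.no_grad():
    embeddings = model.get_input_embeddings().weight
    unk = embeddings[tokenizer.unk_token_id].clone()
    for token in new_tokens:
        token_id = tokenizer.convert_tokens_to_ids(token)
        embeddings[token_id] = unk + 1e-3 * torch.randn_like(unk)

model.save_pretrained(".")
tokenizer.save_pretrained(".")

Copying the [UNK] row rather than using random initialization keeps downstream behavior essentially unchanged at first, while making the new characters distinguishable and trainable.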
config.json CHANGED
@@ -1,8 +1,10 @@
 {
+  "_name_or_path": "bert-base-dutch-cased",
   "architectures": [
     "BertForMaskedLM"
   ],
   "attention_probs_dropout_prob": 0.1,
+  "gradient_checkpointing": false,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
   "hidden_size": 768,
@@ -14,6 +16,9 @@
   "num_attention_heads": 12,
   "num_hidden_layers": 12,
   "pad_token_id": 3,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.5.1",
   "type_vocab_size": 2,
-  "vocab_size": 30000
-}
+  "use_cache": true,
+  "vocab_size": 30073
+}
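The vocab_size bump from 30000 to 30073 matches the 73 lines appended to vocab.txt below; the other new keys are ones transformers 4.x writes out when a config is re-saved. A quick consistency check, assuming the repo id GroNLP/bert-base-dutch-cased on the Hub:

from transformers import BertConfig, BertTokenizer

config = BertConfig.from_pretrained("GroNLP/bert-base-dutch-cased")
tokenizer = BertTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")
assert config.vocab_size == len(tokenizer) == 30073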
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0da32020b1799e175f53caddbc6bf250af61d67d5a8c77f4708af6ebe1d03600
-size 438869143
+oid sha256:5ffe408c7eea0ffee4c257c6028f8c98146967e3ac3db51dba8e2bc8a4abddf5
+size 436761702
tf_model.h5 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:686ae10663084e39852406c4abf97edce8dbd6173b79a9c1b68265deb3970f11
-size 532853952
+oid sha256:88cc47b929d21ed816d6ad8d5abea5c06ccae04a5f04f2d6b07da7d212aa18e1
+size 530923844
tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
tokenizer_config.json CHANGED
@@ -1,4 +1,11 @@
 {
   "do_lower_case": false,
-  "max_len": 512
-}
+  "unk_token": "[UNK]",
+  "sep_token": "[SEP]",
+  "pad_token": "[PAD]",
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "tokenize_chinese_chars": true,
+  "strip_accents": null,
+  "model_max_length": 512
+}
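Besides replacing the deprecated max_len key with model_max_length, the new config pins the special tokens explicitly instead of relying on the class defaults. What the tokenizer actually loads can be inspected directly (same assumed repo id as above):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")
print(tokenizer.model_max_length)    # 512
print(tokenizer.special_tokens_map)  # {'unk_token': '[UNK]', 'sep_token': '[SEP]', ...}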
vocab.txt CHANGED
@@ -29997,4 +29997,77 @@ zóó
 ##óók
 ##öl
 ##ön
-##ör
+##ör
+##Q
+##X
+##Ç
+##Ó
+##Ô
+##Ú
+##Û
+##Ü
+##à
+##á
+##â
+##ä
+##ê
+##ì
+##í
+##î
+##ñ
+##ò
+##ô
+##ù
+##ú
+##û
+##ü
+Q
+X
+a
+c
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+x
+y
+Ç
+Ó
+Ô
+Ú
+Û
+Ü
+à
+á
+â
+ä
+è
+é
+ê
+ë
+ì
+í
+î
+ï
+ñ
+ò
+ó
+ô
+ö
+ù
+ú
+û
+ü
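The 73 additions are 23 "##" continuation pieces and 50 word-initial characters. With them in the vocabulary, a word containing one of these characters no longer collapses to a single [UNK]; for example (the exact subword split depends on the rest of the vocabulary):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")
print(tokenizer.tokenize("Çelik"))  # e.g. ['Ç', '##eli', '##k'] instead of ['[UNK]']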