patrickvonplaten committed on
Commit
a84519e
1 Parent(s): 1f4f9cc

add tokenizer

Browse files
entity_vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"[MASK]": 0, "[UNK]": 1, "[PAD]": 2, "DUMMY": 3, "DUMMY2": 4, "[MASK2]": 5}
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8dfd1eae4522281b1b839eab877a791befec7a1663a41c814c77d9c89c748f2d
3
+ size 253154
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}, "additional_special_tokens": [{"content": "<ent>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, {"content": "<ent2>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}]}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "cls_token": "<s>", "pad_token": "<pad>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "sp_model_kwargs": {}, "task": null, "max_entity_length": 32, "max_mention_length": 30, "entity_token_1": {"content": "<ent>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "entity_token_2": {"content": "<ent2>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "entity_unk_token": "[UNK]", "entity_pad_token": "[PAD]", "entity_mask_token": "[MASK]", "entity_mask2_token": "[MASK2]", "do_lower_case": false, "additional_special_tokens": [{"content": "<ent>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, {"content": "<ent2>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}], "tokenizer_class": "MLukeTokenizer"}