Norrawee committed on
Commit
dca76aa
1 Parent(s): 354bf61

add tokenizer

Browse files
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49c4ba4e495ddf31eb2fdba7fc6aef3c233091d25d35bc9d24694ccf48ae114c
3
+ size 904693
special_tokens_map.json CHANGED
@@ -1 +1 @@
1
- {"unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}, "additional_special_tokens": ["<s>NOTUSED", "</s>NOTUSED", "<_>"]}
tokenizer_config.json CHANGED
@@ -1 +1 @@
1
- {"do_lower_case": false, "do_basic_tokenize": true, "never_split": null, "unk_token": "<unk>", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": false, "lowercase": false, "special_tokens_map_file": "/root/.cache/huggingface/transformers/ff8c49f786a0eddfc4a865f09ab427d99cd22cd2c866230724f7b7133952dc1a.234ad2446e4daff2bd4ca6fa6a13d9d878b748e62b97f23af6e858fd5fc3dbec", "tokenizer_file": null, "name_or_path": "monsoon-nlp/bert-base-thai", "tokenizer_class": "BertTokenizer"}
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "cls_token": "<s>", "pad_token": "<pad>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "additional_special_tokens": ["<s>NOTUSED", "</s>NOTUSED", "<_>"], "sp_model_kwargs": {}, "do_lower_case": false, "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "airesearch/wangchanberta-base-att-spm-uncased", "tokenizer_class": "CamembertTokenizer"}