wzhouad committed
Commit e7b1ef3
1 Parent(s): 95a4d96

add tokenizer

added_tokens.json ADDED
@@ -0,0 +1 @@
+ {"[te]": 250077, "[ja]": 250040, "[sq]": 250071, "[et]": 250024, "[fa]": 250026, "[zh]": 250085, "[ru]": 250065, "[hi]": 250033, "[la]": 250047, "[is]": 250038, "[el]": 250020, "[he]": 250032, "[yi]": 250084, "[ca]": 250015, "[pt]": 250063, "[sw]": 250075, "[jv]": 250041, "[su]": 250073, "[ur]": 250081, "[fi]": 250027, "[fr]": 250028, "[sco]": 250067, "[th]": 250078, "[S]": 250002, "[sv]": 250074, "[my]": 250056, "[pl]": 250061, "[mg]": 250050, "[uk]": 250080, "[mr]": 250054, "[hy]": 250036, "[eo]": 250022, "[it]": 250039, "[cs]": 250016, "[es]": 250023, "[sa]": 250066, "[mk]": 250051, "[az]": 250009, "[lt]": 250048, "[am]": 250007, "[kk]": 250043, "[or]": 250060, "[zh_min_nan]": 250086, "[ta]": 250076, "[sr]": 250072, "[hu]": 250035, "[sh]": 250068, "[da]": 250018, "[bn]": 250012, "[vi]": 250083, "[sk]": 250069, "[kn]": 250044, "[ka]": 250042, "[hr]": 250034, "[ne]": 250057, "[ku]": 250046, "[ko]": 250045, "[gu]": 250031, "[nl]": 250058, "[ro]": 250064, "[ml]": 250052, "[ms]": 250055, "[pnb]": 250062, "[af]": 250006, "[uz]": 250082, "[gl]": 250030, "[en]": 250021, "[no]": 250059, "[eu]": 250025, "[br]": 250013, "[P]": 250003, "[id]": 250037, "[be]": 250010, "[cy]": 250017, "[sl]": 250070, "[EOS]": 250005, "[ar]": 250008, "[O]": 250004, "[bg]": 250011, "[bs]": 250014, "[lv]": 250049, "[mn]": 250053, "[de]": 250019, "[ga]": 250029, "[tr]": 250079}
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+ size 5069051
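sentencepiece.bpe.model is tracked with Git LFS, so the commit records only a pointer file (spec version, SHA-256 object id, and size in bytes) rather than the ~5 MB binary model. Below is a minimal sketch of fetching the real file and loading it directly with the sentencepiece library; the repo id is again a hypothetical placeholder.

```python
# A minimal sketch, assuming huggingface_hub and sentencepiece are installed;
# "wzhouad/example-tokenizer" is a hypothetical placeholder repo id.
from huggingface_hub import hf_hub_download
import sentencepiece as spm

# hf_hub_download resolves the Git LFS pointer and returns a local path to the
# actual 5,069,051-byte model file recorded in this commit.
model_path = hf_hub_download(repo_id="wzhouad/example-tokenizer",
                             filename="sentencepiece.bpe.model")

sp = spm.SentencePieceProcessor(model_file=model_path)
print(sp.get_piece_size())  # size of the underlying SentencePiece vocabulary
```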
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "sep_token": "</s>", "cls_token": "<s>", "unk_token": "<unk>", "pad_token": "<pad>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "xlm-roberta-base", "tokenizer_class": "XLMRobertaTokenizer"}