asahi417 committed
Commit 51e8e22
Parent: a848dfa

Upload tokenizer

added_tokens.json ADDED
@@ -0,0 +1,26 @@
+{
+  "<hl>": 85729,
+  "ar_AR": 85706,
+  "cs_CZ": 85707,
+  "es_XX": 85708,
+  "et_EE": 85709,
+  "fi_FI": 85710,
+  "fr_XX": 85711,
+  "gu_IN": 85712,
+  "hi_IN": 85713,
+  "it_IT": 85714,
+  "ja_XX": 85715,
+  "kk_KZ": 85716,
+  "ko_KR": 85717,
+  "lt_LT": 85718,
+  "lv_LV": 85719,
+  "my_MM": 85720,
+  "ne_NP": 85721,
+  "nl_XX": 85722,
+  "ro_RO": 85723,
+  "ru_RU": 85724,
+  "si_LK": 85725,
+  "tr_TR": 85726,
+  "vi_VN": 85727,
+  "zh_CN": 85728
+}
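
The map above adds one task token, `<hl>`, plus 23 mBART language codes at the top of a trimmed vocabulary. Below is a minimal sketch of how these added tokens behave once the files from this commit are loaded; the local directory `./tokenizer` is an assumption (any directory or Hub repo id holding these files works the same way):

```python
from transformers import AutoTokenizer

# Hypothetical local path holding the files from this commit.
tokenizer = AutoTokenizer.from_pretrained("./tokenizer")

# "<hl>" is registered as an additional special token, so SentencePiece
# keeps it as a single piece instead of splitting it into subwords.
assert tokenizer.convert_tokens_to_ids("<hl>") == 85729

# The language codes are mBART's usual source/target codes, re-indexed
# after vocabulary trimming.
print(tokenizer.convert_tokens_to_ids("fr_XX"))  # 85711 per the map above
```

The `<hl>` token is commonly used to mark the answer span in question-generation inputs, which is consistent with the `frquad-qg` checkpoint name in tokenizer_config.json below.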
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1,18 @@
+{
+  "additional_special_tokens": [
+    "<hl>"
+  ],
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "unk_token": "<unk>"
+}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,23 @@
+{
+  "additional_special_tokens": null,
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "__type": "AddedToken",
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "model_max_length": 1024,
+  "name_or_path": "ckpts/mbart-large-cc25-frquad-qg-trimmed-fr",
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "special_tokens_map_file": null,
+  "src_lang": null,
+  "tgt_lang": null,
+  "tokenizer_class": "MBartTokenizer",
+  "unk_token": "<unk>"
+}
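
Since tokenizer_config.json declares `MBartTokenizer` with `src_lang`/`tgt_lang` left null, the caller picks a language code at runtime. A small sketch of typical usage, again assuming the hypothetical `./tokenizer` directory:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./tokenizer")  # hypothetical path
tokenizer.src_lang = "fr_XX"  # French, matching the "-fr" trimmed checkpoint

# mBART-style source encoding appends the language code after </s>.
text = "<hl> Paris <hl> est la capitale de la France."
ids = tokenizer(text).input_ids
print(tokenizer.convert_ids_to_tokens(ids))
```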