vietdata committed on
Commit
5d60d56
1 Parent(s): 001d22f

Upload tokenizer

Browse files
Files changed (3) hide show
  1. .gitattributes +1 -0
  2. tokenizer_config.json +8 -2
  3. unigram.json +3 -0
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ unigram.json filter=lfs diff=lfs merge=lfs -text
tokenizer_config.json CHANGED
@@ -2,6 +2,7 @@
2
  "bos_token": "<s>",
3
  "clean_up_tokenization_spaces": true,
4
  "cls_token": "<s>",
 
5
  "eos_token": "</s>",
6
  "mask_token": {
7
  "__type": "AddedToken",
@@ -11,12 +12,17 @@
11
  "rstrip": false,
12
  "single_word": false
13
  },
14
- "max_length": 256,
15
  "model_max_length": 512,
 
16
  "pad_token": "<pad>",
 
 
17
  "sep_token": "</s>",
18
  "stride": 0,
19
- "tokenizer_class": "XLMRobertaTokenizer",
 
 
20
  "truncation_side": "right",
21
  "truncation_strategy": "longest_first",
22
  "unk_token": "<unk>"
 
2
  "bos_token": "<s>",
3
  "clean_up_tokenization_spaces": true,
4
  "cls_token": "<s>",
5
+ "do_lower_case": true,
6
  "eos_token": "</s>",
7
  "mask_token": {
8
  "__type": "AddedToken",
 
12
  "rstrip": false,
13
  "single_word": false
14
  },
15
+ "max_length": 128,
16
  "model_max_length": 512,
17
+ "pad_to_multiple_of": null,
18
  "pad_token": "<pad>",
19
+ "pad_token_type_id": 0,
20
+ "padding_side": "right",
21
  "sep_token": "</s>",
22
  "stride": 0,
23
+ "strip_accents": null,
24
+ "tokenize_chinese_chars": true,
25
+ "tokenizer_class": "BertTokenizer",
26
  "truncation_side": "right",
27
  "truncation_strategy": "longest_first",
28
  "unk_token": "<unk>"
unigram.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71b44701d7efd054205115acfa6ef126c5d2f84bd3affe0c59e48163674d19a6
3
+ size 14763234