saattrupdan
committed on
Commit
•
262a6bd
1
Parent(s):
a8035fc
Update tokenizer_config.json
Browse files
This removes the extra tokens whose indices exceed the vocab size. Only the pad token is actually used, so we do the standard thing of using the EOS token as the PAD token.
Further, we set the `model_max_length`, which was previously not set, as well as changing the `padding_side` to 'left' instead of 'right', as the model is auto-regressive.
- tokenizer_config.json +3 -46
tokenizer_config.json
CHANGED
@@ -26,57 +26,14 @@
|
|
26 |
"single_word": false,
|
27 |
"special": true
|
28 |
},
|
29 |
-
"32000": {
|
30 |
-
"content": "<CLS>",
|
31 |
-
"lstrip": false,
|
32 |
-
"normalized": false,
|
33 |
-
"rstrip": false,
|
34 |
-
"single_word": false,
|
35 |
-
"special": true
|
36 |
-
},
|
37 |
-
"32001": {
|
38 |
-
"content": "<SEP>",
|
39 |
-
"lstrip": false,
|
40 |
-
"normalized": false,
|
41 |
-
"rstrip": false,
|
42 |
-
"single_word": false,
|
43 |
-
"special": true
|
44 |
-
},
|
45 |
-
"32002": {
|
46 |
-
"content": "<EOD>",
|
47 |
-
"lstrip": false,
|
48 |
-
"normalized": false,
|
49 |
-
"rstrip": false,
|
50 |
-
"single_word": false,
|
51 |
-
"special": true
|
52 |
-
},
|
53 |
-
"32003": {
|
54 |
-
"content": "<MASK>",
|
55 |
-
"lstrip": false,
|
56 |
-
"normalized": false,
|
57 |
-
"rstrip": false,
|
58 |
-
"single_word": false,
|
59 |
-
"special": true
|
60 |
-
},
|
61 |
-
"32004": {
|
62 |
-
"content": "<PAD>",
|
63 |
-
"lstrip": false,
|
64 |
-
"normalized": false,
|
65 |
-
"rstrip": false,
|
66 |
-
"single_word": false,
|
67 |
-
"special": true
|
68 |
-
}
|
69 |
},
|
70 |
"bos_token": "<s>",
|
71 |
"clean_up_tokenization_spaces": false,
|
72 |
-
"cls_token": "<CLS>",
|
73 |
"eos_token": "</s>",
|
74 |
"legacy": false,
|
75 |
-
"
|
76 |
-
"
|
77 |
-
"
|
78 |
-
"padding_side": "right",
|
79 |
-
"sep_token": "<SEP>",
|
80 |
"sp_model_kwargs": {},
|
81 |
"tokenizer_class": "LlamaTokenizer",
|
82 |
"unk_token": "<unk>",
|
|
|
26 |
"single_word": false,
|
27 |
"special": true
|
28 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
},
|
30 |
"bos_token": "<s>",
|
31 |
"clean_up_tokenization_spaces": false,
|
|
|
32 |
"eos_token": "</s>",
|
33 |
"legacy": false,
|
34 |
+
"model_max_length": 4096,
|
35 |
+
"pad_token": "</s>",
|
36 |
+
"padding_side": "left",
|
|
|
|
|
37 |
"sp_model_kwargs": {},
|
38 |
"tokenizer_class": "LlamaTokenizer",
|
39 |
"unk_token": "<unk>",
|