saattrupdan committed on
Commit
5119a31
1 Parent(s): a8035fc

Update tokenizer_config.json

Browse files

This removes the extra tokens whose indices exceed the vocab size. Only the pad token is actually used, so we do the standard thing of using the EOS token as the PAD token.

Further, we set the model_max_length, which was previously not set, and change the padding_side from 'right' to 'left', since the model is auto-regressive.

Files changed (1) hide show
  1. tokenizer_config.json +3 -46
tokenizer_config.json CHANGED
@@ -25,58 +25,15 @@
25
  "rstrip": false,
26
  "single_word": false,
27
  "special": true
28
- },
29
- "32000": {
30
- "content": "<CLS>",
31
- "lstrip": false,
32
- "normalized": false,
33
- "rstrip": false,
34
- "single_word": false,
35
- "special": true
36
- },
37
- "32001": {
38
- "content": "<SEP>",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": false,
43
- "special": true
44
- },
45
- "32002": {
46
- "content": "<EOD>",
47
- "lstrip": false,
48
- "normalized": false,
49
- "rstrip": false,
50
- "single_word": false,
51
- "special": true
52
- },
53
- "32003": {
54
- "content": "<MASK>",
55
- "lstrip": false,
56
- "normalized": false,
57
- "rstrip": false,
58
- "single_word": false,
59
- "special": true
60
- },
61
- "32004": {
62
- "content": "<PAD>",
63
- "lstrip": false,
64
- "normalized": false,
65
- "rstrip": false,
66
- "single_word": false,
67
- "special": true
68
  }
69
  },
70
  "bos_token": "<s>",
71
  "clean_up_tokenization_spaces": false,
72
- "cls_token": "<CLS>",
73
  "eos_token": "</s>",
74
  "legacy": false,
75
- "mask_token": "<MASK>",
76
- "model_max_length": 1000000000000000019884624838656,
77
- "pad_token": "<PAD>",
78
- "padding_side": "right",
79
- "sep_token": "<SEP>",
80
  "sp_model_kwargs": {},
81
  "tokenizer_class": "LlamaTokenizer",
82
  "unk_token": "<unk>",
 
25
  "rstrip": false,
26
  "single_word": false,
27
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  }
29
  },
30
  "bos_token": "<s>",
31
  "clean_up_tokenization_spaces": false,
 
32
  "eos_token": "</s>",
33
  "legacy": false,
34
+ "model_max_length": 4096,
35
+ "pad_token": "</s>",
36
+ "padding_side": "left",
 
 
37
  "sp_model_kwargs": {},
38
  "tokenizer_class": "LlamaTokenizer",
39
  "unk_token": "<unk>",