Bohdan Petryshyn committed on
Commit
3bf1408
•
1 Parent(s): 2249927

Update tokenizer with the original CodeLlama tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +3 -9
  2. tokenizer.json +0 -37
  3. tokenizer_config.json +23 -72
special_tokens_map.json CHANGED
@@ -1,28 +1,22 @@
1
  {
2
- "additional_special_tokens": [
3
- "▁<PRE>",
4
- "▁<MID>",
5
- "▁<SUF>",
6
- "▁<EOT>"
7
- ],
8
  "bos_token": {
9
  "content": "<s>",
10
  "lstrip": false,
11
- "normalized": false,
12
  "rstrip": false,
13
  "single_word": false
14
  },
15
  "eos_token": {
16
  "content": "</s>",
17
  "lstrip": false,
18
- "normalized": false,
19
  "rstrip": false,
20
  "single_word": false
21
  },
22
  "unk_token": {
23
  "content": "<unk>",
24
  "lstrip": false,
25
- "normalized": false,
26
  "rstrip": false,
27
  "single_word": false
28
  }
 
1
  {
 
 
 
 
 
 
2
  "bos_token": {
3
  "content": "<s>",
4
  "lstrip": false,
5
+ "normalized": true,
6
  "rstrip": false,
7
  "single_word": false
8
  },
9
  "eos_token": {
10
  "content": "</s>",
11
  "lstrip": false,
12
+ "normalized": true,
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
  "unk_token": {
17
  "content": "<unk>",
18
  "lstrip": false,
19
+ "normalized": true,
20
  "rstrip": false,
21
  "single_word": false
22
  }
tokenizer.json CHANGED
@@ -29,42 +29,6 @@
29
  "rstrip": false,
30
  "normalized": false,
31
  "special": true
32
- },
33
- {
34
- "id": 32007,
35
- "content": "▁<PRE>",
36
- "single_word": false,
37
- "lstrip": false,
38
- "rstrip": false,
39
- "normalized": false,
40
- "special": true
41
- },
42
- {
43
- "id": 32008,
44
- "content": "▁<SUF>",
45
- "single_word": false,
46
- "lstrip": false,
47
- "rstrip": false,
48
- "normalized": false,
49
- "special": true
50
- },
51
- {
52
- "id": 32009,
53
- "content": "▁<MID>",
54
- "single_word": false,
55
- "lstrip": false,
56
- "rstrip": false,
57
- "normalized": false,
58
- "special": true
59
- },
60
- {
61
- "id": 32010,
62
- "content": "▁<EOT>",
63
- "single_word": false,
64
- "lstrip": false,
65
- "rstrip": false,
66
- "normalized": false,
67
- "special": true
68
  }
69
  ],
70
  "normalizer": {
@@ -170,7 +134,6 @@
170
  "end_of_word_suffix": null,
171
  "fuse_unk": true,
172
  "byte_fallback": true,
173
- "ignore_merges": false,
174
  "vocab": {
175
  "<unk>": 0,
176
  "<s>": 1,
 
29
  "rstrip": false,
30
  "normalized": false,
31
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  }
33
  ],
34
  "normalizer": {
 
134
  "end_of_word_suffix": null,
135
  "fuse_unk": true,
136
  "byte_fallback": true,
 
137
  "vocab": {
138
  "<unk>": 0,
139
  "<s>": 1,
tokenizer_config.json CHANGED
@@ -1,83 +1,34 @@
1
  {
2
  "add_bos_token": true,
3
  "add_eos_token": false,
4
- "added_tokens_decoder": {
5
- "0": {
6
- "content": "<unk>",
7
- "lstrip": false,
8
- "normalized": false,
9
- "rstrip": false,
10
- "single_word": false,
11
- "special": true
12
- },
13
- "1": {
14
- "content": "<s>",
15
- "lstrip": false,
16
- "normalized": false,
17
- "rstrip": false,
18
- "single_word": false,
19
- "special": true
20
- },
21
- "2": {
22
- "content": "</s>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false,
27
- "special": true
28
- },
29
- "32007": {
30
- "content": "▁<PRE>",
31
- "lstrip": false,
32
- "normalized": false,
33
- "rstrip": false,
34
- "single_word": false,
35
- "special": true
36
- },
37
- "32008": {
38
- "content": "▁<SUF>",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": false,
43
- "special": true
44
- },
45
- "32009": {
46
- "content": "▁<MID>",
47
- "lstrip": false,
48
- "normalized": false,
49
- "rstrip": false,
50
- "single_word": false,
51
- "special": true
52
- },
53
- "32010": {
54
- "content": "▁<EOT>",
55
- "lstrip": false,
56
- "normalized": false,
57
- "rstrip": false,
58
- "single_word": false,
59
- "special": true
60
- }
61
  },
62
- "additional_special_tokens": [
63
- "▁<PRE>",
64
- "▁<MID>",
65
- "▁<SUF>",
66
- "▁<EOT>"
67
- ],
68
- "bos_token": "<s>",
69
  "clean_up_tokenization_spaces": false,
70
- "eos_token": "</s>",
71
- "eot_token": "▁<EOT>",
72
- "fill_token": "<FILL_ME>",
 
 
 
 
 
73
  "legacy": null,
74
- "middle_token": "▁<MID>",
75
  "model_max_length": 1000000000000000019884624838656,
76
  "pad_token": null,
77
- "prefix_token": "▁<PRE>",
78
  "sp_model_kwargs": {},
79
- "suffix_token": "▁<SUF>",
80
  "tokenizer_class": "CodeLlamaTokenizer",
81
- "unk_token": "<unk>",
82
- "use_default_system_prompt": false
 
 
 
 
 
 
83
  }
 
1
  {
2
  "add_bos_token": true,
3
  "add_eos_token": false,
4
+ "bos_token": {
5
+ "__type": "AddedToken",
6
+ "content": "<s>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  },
 
 
 
 
 
 
 
12
  "clean_up_tokenization_spaces": false,
13
+ "eos_token": {
14
+ "__type": "AddedToken",
15
+ "content": "</s>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false
20
+ },
21
  "legacy": null,
 
22
  "model_max_length": 1000000000000000019884624838656,
23
  "pad_token": null,
 
24
  "sp_model_kwargs": {},
 
25
  "tokenizer_class": "CodeLlamaTokenizer",
26
+ "unk_token": {
27
+ "__type": "AddedToken",
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ }
34
  }