deepnet committed on
Commit d40f79e
1 Parent(s): e982a82

Upload tokenizer

added_tokens.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "<|endofprompt|>": 100276,
+   "<|im_end|>": 100265,
+   "<|im_start|>": 100264
+ }
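added_tokens.json pins the three new chat-format specials to fixed ids in the existing vocabulary. A minimal sanity-check sketch, assuming transformers is installed; "your-org/your-tokenizer-repo" is a placeholder, not the actual repository name from this commit:

from transformers import AutoTokenizer

# Placeholder repo id; substitute the repository this commit belongs to.
tok = AutoTokenizer.from_pretrained("your-org/your-tokenizer-repo")
expected = {"<|im_start|>": 100264, "<|im_end|>": 100265, "<|endofprompt|>": 100276}
for token, token_id in expected.items():
    # convert_tokens_to_ids should reflect the mapping in added_tokens.json
    assert tok.convert_tokens_to_ids(token) == token_id, token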
merges.txt CHANGED
@@ -99998,4 +99998,4 @@ _RECE IVED
  Ġmerc iless
  .Wait For
  Ġday care
- ĠCon veyor
+ ĠCon veyor
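Each merges.txt line is one BPE merge rule in priority order: "Ġday care" says the symbols "Ġday" and "care" may be fused into "Ġdaycare". The hunk above re-emits the final rule unchanged, which usually just reflects a trailing-newline difference. A rough, library-free sketch of how such rules are applied, for illustration only (not the tokenizer's actual code path):

def apply_bpe(symbols, merges):
    # Rank each merge rule by its position in merges.txt (earlier = higher priority).
    ranks = {pair: i for i, pair in enumerate(merges)}
    while True:
        # Find the adjacent pair with the best (lowest) rank.
        candidates = [(ranks.get((a, b), float("inf")), i)
                      for i, (a, b) in enumerate(zip(symbols, symbols[1:]))]
        best_rank, i = min(candidates, default=(float("inf"), -1))
        if best_rank == float("inf"):
            return symbols  # no applicable merge left
        symbols = symbols[:i] + [symbols[i] + symbols[i + 1]] + symbols[i + 2:]

# Toy merge list; real ranks come from the full merges.txt.
print(apply_bpe(["Ġ", "d", "ay", "care"], [("d", "ay"), ("Ġ", "day"), ("Ġday", "care")]))
# -> ['Ġdaycare']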
special_tokens_map.json CHANGED
@@ -1,5 +1,23 @@
  {
-   "bos_token": "<|endoftext|>",
-   "eos_token": "<|endoftext|>",
-   "unk_token": "<|endoftext|>"
- }
+   "bos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
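The special-token entries change from bare strings to the serialized AddedToken form, which states lstrip/normalized/rstrip/single_word explicitly instead of relying on library defaults. A minimal sketch of the equivalent object in code; this mirrors the serialized fields and is not necessarily how the file in this commit was generated:

from transformers import AddedToken

# Same flags as the new special_tokens_map.json entries.
endoftext = AddedToken("<|endoftext|>", lstrip=False, rstrip=False,
                       normalized=False, single_word=False)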
tokenizer.json CHANGED
@@ -40,8 +40,8 @@
    "special": true
  },
  {
-   "id": 100276,
-   "content": "<|endofprompt|>",
+   "id": 100264,
+   "content": "<|im_start|>",
    "single_word": false,
    "lstrip": false,
    "rstrip": false,
@@ -49,8 +49,8 @@
    "special": true
  },
  {
-   "id": 100264,
-   "content": "<|im_start|>",
+   "id": 100265,
+   "content": "<|im_end|>",
    "single_word": false,
    "lstrip": false,
    "rstrip": false,
@@ -58,8 +58,8 @@
    "special": true
  },
  {
-   "id": 100265,
-   "content": "<|im_end|>",
+   "id": 100276,
+   "content": "<|endofprompt|>",
    "single_word": false,
    "lstrip": false,
    "rstrip": false,
@@ -102,6 +102,7 @@
    "end_of_word_suffix": "",
    "fuse_unk": false,
    "byte_fallback": false,
+   "ignore_merges": false,
    "vocab": {
      "!": 0,
      "\"": 1,
@@ -100363,9 +100364,9 @@
    "<|fim_prefix|>": 100258,
    "<|fim_middle|>": 100259,
    "<|fim_suffix|>": 100260,
-   "<|endofprompt|>": 100276,
    "<|im_start|>": 100264,
-   "<|im_end|>": 100265
+   "<|im_end|>": 100265,
+   "<|endofprompt|>": 100276
  },
  "merges": [
    "Ġ Ġ",
tokenizer_config.json CHANGED
@@ -1,10 +1,68 @@
  {
    "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "100257": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100258": {
+       "content": "<|fim_prefix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100259": {
+       "content": "<|fim_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100260": {
+       "content": "<|fim_suffix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100264": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100265": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100276": {
+       "content": "<|endofprompt|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
    "bos_token": "<|endoftext|>",
+   "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
    "clean_up_tokenization_spaces": false,
    "eos_token": "<|endoftext|>",
    "model_max_length": 8192,
-   "tokenizer_class": "GPT2TokenizerFast",
-   "unk_token": "<|endoftext|>",
-   "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
- }
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": "<|endoftext|>"
+ }
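tokenizer_config.json gains an explicit added_tokens_decoder table, keeps the ChatML-style chat_template (now in sorted key order), and switches tokenizer_class from GPT2TokenizerFast to GPT2Tokenizer. A minimal sketch of rendering that template, with a placeholder repo id and assuming a transformers version with chat-template support:

from transformers import AutoTokenizer

# Placeholder repo id; substitute the repository this commit belongs to.
tok = AutoTokenizer.from_pretrained("your-org/your-tokenizer-repo")
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)
# Expected shape per the template:
# <|im_start|>system\n...<|im_end|>\n<|im_start|>user\n...<|im_end|>\n<|im_start|>assistant\n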
 
vocab.json CHANGED
The diff for this file is too large to render. See raw diff