nono1224 committed on
Commit
2270930
·
verified ·
1 Parent(s): af0cd0f

Upload tokenizer

Browse files
added_tokens.json CHANGED
@@ -1,3 +1,4 @@
1
  {
2
- "\n": 31999
 
3
  }
 
1
  {
2
+ "<ent2>": 32771,
3
+ "<ent>": 32770
4
  }
entity_vocab.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "[MASK2]": 3,
3
+ "[MASK]": 0,
4
+ "[PAD]": 2,
5
+ "[UNK]": 1
6
+ }
sentencepiece.bpe.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
- size 5069051
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8b73a5e054936c920cf5b7d1ec21ce9c281977078269963beb821c6c86fbff7
3
+ size 841889
special_tokens_map.json CHANGED
@@ -1,4 +1,48 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "bos_token": {
3
  "content": "<s>",
4
  "lstrip": false,
@@ -7,7 +51,7 @@
7
  "single_word": false
8
  },
9
  "cls_token": {
10
- "content": "<cls>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
@@ -22,8 +66,8 @@
22
  },
23
  "mask_token": {
24
  "content": "<mask>",
25
- "lstrip": false,
26
- "normalized": false,
27
  "rstrip": false,
28
  "single_word": false
29
  },
@@ -35,7 +79,7 @@
35
  "single_word": false
36
  },
37
  "sep_token": {
38
- "content": "<sep>",
39
  "lstrip": false,
40
  "normalized": false,
41
  "rstrip": false,
 
1
  {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<ent>",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<ent2>",
12
+ "lstrip": false,
13
+ "normalized": true,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<ent>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ {
25
+ "content": "<ent2>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ {
32
+ "content": "<ent>",
33
+ "lstrip": false,
34
+ "normalized": true,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ },
38
+ {
39
+ "content": "<ent2>",
40
+ "lstrip": false,
41
+ "normalized": true,
42
+ "rstrip": false,
43
+ "single_word": false
44
+ }
45
+ ],
46
  "bos_token": {
47
  "content": "<s>",
48
  "lstrip": false,
 
51
  "single_word": false
52
  },
53
  "cls_token": {
54
+ "content": "<s>",
55
  "lstrip": false,
56
  "normalized": false,
57
  "rstrip": false,
 
66
  },
67
  "mask_token": {
68
  "content": "<mask>",
69
+ "lstrip": true,
70
+ "normalized": true,
71
  "rstrip": false,
72
  "single_word": false
73
  },
 
79
  "single_word": false
80
  },
81
  "sep_token": {
82
+ "content": "</s>",
83
  "lstrip": false,
84
  "normalized": false,
85
  "rstrip": false,
tokenizer_config.json CHANGED
@@ -1,11 +1,7 @@
1
  {
2
- "add_bos_token": true,
3
- "add_dummy_prefix_space": false,
4
- "add_eos_token": true,
5
- "add_prefix_space": false,
6
  "added_tokens_decoder": {
7
  "0": {
8
- "content": "<unk>",
9
  "lstrip": false,
10
  "normalized": false,
11
  "rstrip": false,
@@ -13,7 +9,7 @@
13
  "special": true
14
  },
15
  "1": {
16
- "content": "<s>",
17
  "lstrip": false,
18
  "normalized": false,
19
  "rstrip": false,
@@ -29,143 +25,81 @@
29
  "special": true
30
  },
31
  "3": {
32
- "content": "<pad>",
33
- "lstrip": false,
34
- "normalized": false,
35
- "rstrip": false,
36
- "single_word": false,
37
- "special": true
38
- },
39
- "4": {
40
- "content": "<sep>",
41
  "lstrip": false,
42
  "normalized": false,
43
  "rstrip": false,
44
  "single_word": false,
45
  "special": true
46
  },
47
- "5": {
48
  "content": "<mask>",
49
- "lstrip": false,
50
- "normalized": false,
51
  "rstrip": false,
52
  "single_word": false,
53
  "special": true
54
  },
55
- "6": {
56
- "content": "<cls>",
57
  "lstrip": false,
58
- "normalized": false,
59
  "rstrip": false,
60
  "single_word": false,
61
  "special": true
62
  },
63
- "7": {
64
- "content": "<|system|>",
65
  "lstrip": false,
66
- "normalized": false,
67
  "rstrip": false,
68
  "single_word": false,
69
- "special": false
70
- },
71
- "8": {
72
- "content": "<|assistant|>",
73
- "lstrip": false,
74
- "normalized": false,
75
- "rstrip": false,
76
- "single_word": false,
77
- "special": false
78
- },
79
- "9": {
80
- "content": "<|user|>",
81
- "lstrip": false,
82
- "normalized": false,
83
- "rstrip": false,
84
- "single_word": false,
85
- "special": false
86
- },
87
- "10": {
88
- "content": "<|available_tools|>",
89
- "lstrip": false,
90
- "normalized": false,
91
- "rstrip": false,
92
- "single_word": false,
93
- "special": false
94
- },
95
- "11": {
96
- "content": "<|tool_calls|>",
97
- "lstrip": false,
98
- "normalized": false,
99
- "rstrip": false,
100
- "single_word": false,
101
- "special": false
102
- },
103
- "12": {
104
- "content": "<|tool_results|>",
105
- "lstrip": false,
106
- "normalized": false,
107
- "rstrip": false,
108
- "single_word": false,
109
- "special": false
110
- },
111
- "13": {
112
- "content": "<|code|>",
113
- "lstrip": false,
114
- "normalized": false,
115
- "rstrip": false,
116
- "single_word": false,
117
- "special": false
118
- },
119
- "14": {
120
- "content": "<|file|>",
121
- "lstrip": false,
122
- "normalized": false,
123
- "rstrip": false,
124
- "single_word": false,
125
- "special": false
126
- },
127
- "102397": {
128
- "content": "<|prefix|>",
129
- "lstrip": false,
130
- "normalized": false,
131
- "rstrip": false,
132
- "single_word": false,
133
- "special": false
134
- },
135
- "102398": {
136
- "content": "<|suffix|>",
137
- "lstrip": false,
138
- "normalized": false,
139
- "rstrip": false,
140
- "single_word": false,
141
- "special": false
142
- },
143
- "102399": {
144
- "content": "<|middle|>",
145
- "lstrip": false,
146
- "normalized": false,
147
- "rstrip": false,
148
- "single_word": false,
149
- "special": false
150
  }
151
  },
 
 
 
 
 
 
 
 
152
  "bos_token": "<s>",
153
  "clean_up_tokenization_spaces": false,
154
- "cls_token": "<cls>",
155
- "do_lower_case": false,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  "eos_token": "</s>",
157
- "extra_ids": 0,
158
  "extra_special_tokens": {},
159
- "keep_accents": true,
160
- "legacy": false,
161
  "mask_token": "<mask>",
162
- "model_max_length": 1000000000000000019884624838656,
 
 
163
  "pad_token": "<pad>",
164
- "padding_side": "right",
165
- "sep_token": "<sep>",
166
  "sp_model_kwargs": {},
167
- "spaces_between_special_tokens": false,
168
- "tokenizer_class": "LlamaTokenizer",
169
- "unk_token": "<unk>",
170
- "use_default_system_prompt": false
171
  }
 
1
  {
 
 
 
 
2
  "added_tokens_decoder": {
3
  "0": {
4
+ "content": "<s>",
5
  "lstrip": false,
6
  "normalized": false,
7
  "rstrip": false,
 
9
  "special": true
10
  },
11
  "1": {
12
+ "content": "<pad>",
13
  "lstrip": false,
14
  "normalized": false,
15
  "rstrip": false,
 
25
  "special": true
26
  },
27
  "3": {
28
+ "content": "<unk>",
 
 
 
 
 
 
 
 
29
  "lstrip": false,
30
  "normalized": false,
31
  "rstrip": false,
32
  "single_word": false,
33
  "special": true
34
  },
35
+ "32769": {
36
  "content": "<mask>",
37
+ "lstrip": true,
38
+ "normalized": true,
39
  "rstrip": false,
40
  "single_word": false,
41
  "special": true
42
  },
43
+ "32770": {
44
+ "content": "<ent>",
45
  "lstrip": false,
46
+ "normalized": true,
47
  "rstrip": false,
48
  "single_word": false,
49
  "special": true
50
  },
51
+ "32771": {
52
+ "content": "<ent2>",
53
  "lstrip": false,
54
+ "normalized": true,
55
  "rstrip": false,
56
  "single_word": false,
57
+ "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  }
59
  },
60
+ "additional_special_tokens": [
61
+ "<ent>",
62
+ "<ent2>",
63
+ "<ent>",
64
+ "<ent2>",
65
+ "<ent>",
66
+ "<ent2>"
67
+ ],
68
  "bos_token": "<s>",
69
  "clean_up_tokenization_spaces": false,
70
+ "cls_token": "<s>",
71
+ "entity_mask2_token": "[MASK2]",
72
+ "entity_mask_token": "[MASK]",
73
+ "entity_pad_token": "[PAD]",
74
+ "entity_token_1": {
75
+ "__type": "AddedToken",
76
+ "content": "<ent>",
77
+ "lstrip": false,
78
+ "normalized": true,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": false
82
+ },
83
+ "entity_token_2": {
84
+ "__type": "AddedToken",
85
+ "content": "<ent2>",
86
+ "lstrip": false,
87
+ "normalized": true,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": false
91
+ },
92
+ "entity_unk_token": "[UNK]",
93
  "eos_token": "</s>",
 
94
  "extra_special_tokens": {},
 
 
95
  "mask_token": "<mask>",
96
+ "max_entity_length": 32,
97
+ "max_mention_length": 30,
98
+ "model_max_length": 512,
99
  "pad_token": "<pad>",
100
+ "sep_token": "</s>",
 
101
  "sp_model_kwargs": {},
102
+ "task": null,
103
+ "tokenizer_class": "MLukeTokenizer",
104
+ "unk_token": "<unk>"
 
105
  }