p1atdev committed on
Commit
79e3511
1 Parent(s): e8bc33e

Upload 5 files

Browse files
category_config.json ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|bos|>",
3
+ "eos_token": "<|eos|>",
4
+ "pad_token": "<|pad|>",
5
+ "unk_token": "<|unknown|>"
6
+ }
tokenization_dart.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import json
4
+ from typing import Optional, Dict, List, Tuple, Union
5
+ from pydantic.dataclasses import dataclass
6
+
7
+ import numpy as np
8
+ from numpy.typing import NDArray
9
+
10
+ from transformers import PreTrainedTokenizerFast
11
+ from tokenizers.decoders import Decoder
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
# Maps the "category_config" resource key to the file name that holds it.
# Assigned to ``DartTokenizer.vocab_files_names`` below.
VOCAB_FILES_NAMES = {
    "category_config": "category_config.json",
}

# Per-checkpoint download location of the "category_config" resource.
# Assigned to ``DartTokenizer.pretrained_vocab_files_map`` below.
# NOTE(review): the URL ends in ``tag_category.json`` while VOCAB_FILES_NAMES
# uses ``category_config.json`` — confirm which file name is the intended one.
PRETRAINED_VOCAB_FILES_MAP = {
    "category_config": {
        "p1atdev/dart-tokenizer-v1": "https://huggingface.co/p1atdev/dart-tokenizer-v1/resolve/main/tag_category.json"
    }
}
24
+
25
+
26
@dataclass
class Category:
    """Description of a single tag category (validated pydantic dataclass)."""

    # Name of the category.
    name: str
    # presumably the id of the token that opens this category's section
    # (e.g. "<general>") — TODO confirm against category_config.json
    bos_token_id: int
    # presumably the id of the token that closes this category's section
    # (e.g. "</general>") — TODO confirm against category_config.json
    eos_token_id: int
31
+
32
+
33
@dataclass
class TagCategoryConfig:
    """Top-level contents of the category config JSON file."""

    # Category descriptions keyed by the category id written as a string
    # (DartTokenizer.get_category looks entries up with ``str(category_id)``).
    categories: Dict[str, Category]
    # Token ids belonging to each category, keyed by the category id written
    # as a string (DartTokenizer converts the key back with ``int(...)``).
    category_to_token_ids: Dict[str, List[int]]
37
+
38
+
39
def load_tag_category_config(config_json: str):
    """Load the category config stored at ``config_json``.

    The file is read as raw bytes and parsed as JSON; the resulting top-level
    object is unpacked as keyword arguments into ``TagCategoryConfig``.

    Args:
        config_json: Path of the JSON file to read.

    Returns:
        The ``TagCategoryConfig`` built from the file contents.
    """
    with open(config_json, "rb") as fp:
        raw = fp.read()

    parsed = json.loads(raw)
    return TagCategoryConfig(**parsed)
44
+
45
+
46
class DartDecoder:
    """Decoder that joins consecutive ordinary tokens with ``", "``.

    Special tokens are passed through untouched, and an ordinary token that
    directly follows a special token (or is the very first token) gets no
    separator either. Every other ordinary token is prefixed with ``", "``.
    """

    def __init__(self, special_tokens: List[str]):
        """Store a private copy of the tokens that must be emitted verbatim.

        Args:
            special_tokens: Token strings to treat as special.
        """
        # Copy so that later mutation of the caller's list has no effect here.
        self.special_tokens = list(special_tokens)

    def decode_chain(self, tokens: List[str]) -> List[str]:
        """Return ``tokens`` with ``", "`` prepended where a separator is due.

        Args:
            tokens: Token strings in output order.

        Returns:
            A new list of the same length. A token is prefixed with ``", "``
            only when neither it nor the token before it is special; the
            first token is never prefixed.
        """
        # Build the lookup set once per call: membership tests become O(1)
        # instead of a linear scan of the list for every token, and the set
        # still reflects any change made to ``self.special_tokens``.
        specials = set(self.special_tokens)

        decoded = []
        # Starting as True guarantees the first token is emitted unchanged.
        previous_is_special = True

        for token in tokens:
            is_special = token in specials

            if is_special or previous_is_special:
                decoded.append(token)
            else:
                decoded.append(f", {token}")

            previous_is_special = is_special

        return decoded
69
+
70
+
71
class DartTokenizer(PreTrainedTokenizerFast):
    """Dart tokenizer

    A ``PreTrainedTokenizerFast`` that additionally loads a category config
    file, installs ``DartDecoder`` as the backend decoder, and offers helpers
    for mapping token ids to category ids and for building vocab-sized masks.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP

    def __init__(self, category_config, **kwargs):
        """Set up the tokenizer, its custom decoder and the category lookup.

        Args:
            category_config: Path of the category config JSON file; it is
                passed to ``load_tag_category_config``.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizerFast``.
        """
        super().__init__(**kwargs)

        # Every added-vocab token is treated as special by the decoder, i.e.
        # it is emitted verbatim and never joined with ", ".
        self._tokenizer.decoder = Decoder.custom(  # type: ignore
            DartDecoder(list(self.get_added_vocab().keys()))
        )

        self.category_config = load_tag_category_config(category_config)

        # Lookup table indexed by token id holding that token's category id.
        # Token ids not listed under any category keep the initial value 0.
        self._id_to_category_map = np.zeros(self.vocab_size).astype("uint8")
        for (
            category_id,
            tokens,
        ) in self.category_config.category_to_token_ids.items():
            # Config keys are strings; the table stores them as integers.
            self._id_to_category_map[tokens] = int(category_id)

    def create_vocab_mask(self, value: int = 1):
        """Create an array of vocab size filled with specified value"""
        return np.full(self.vocab_size, value).astype("uint8")

    def get_token_ids_in_category(self, category_id: Union[int, str]):
        """Get token ids in the specified category"""
        return self.category_config.category_to_token_ids[str(category_id)]

    def get_category(self, category_id: Union[int, str]):
        """Get the specified category config"""
        return self.category_config.categories[str(category_id)]

    def convert_ids_to_category_ids(self, token_ids: Union[int, List[int]]):
        """Get the category ids of specified tokens"""
        return self._id_to_category_map[token_ids]

    def get_banned_tokens_mask(self, tokens: Union[str, List[str], int, List[int]]):
        """Build a vocab-sized mask that is 0 at the given tokens and 1 elsewhere.

        Args:
            tokens: A single token string, a single token id, or a list
                mixing token strings and token ids. Strings are converted
                with ``convert_tokens_to_ids``.

        Returns:
            A ``uint8`` array of length ``vocab_size`` whose entries are 1
            except at the positions of the given tokens, which are 0.

        Raises:
            TypeError: If ``tokens`` is not a str, int or list, or if an
                entry is not an integer id after conversion.
        """
        # Wrap a lone token first so that a single string goes through the
        # same string-to-id conversion as the entries of a list do.
        if isinstance(tokens, (str, int)):
            tokens = [tokens]

        if not isinstance(tokens, list):
            raise TypeError(
                f"tokens must be a str, an int or a list, got {type(tokens).__name__}"
            )

        token_ids = [
            self.convert_tokens_to_ids(token) if isinstance(token, str) else token
            for token in tokens
        ]

        # Explicit check instead of ``assert`` so that it still runs under -O.
        if not all(isinstance(token_id, int) for token_id in token_ids):
            raise TypeError("every token must resolve to an integer token id")

        mask = self.create_vocab_mask(value=1)
        mask[token_ids] = 0

        return mask
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tokenizer_class": "DartTokenizer",
3
+ "auto_map": {
4
+ "AutoTokenizer": [
5
+ "tokenization_dart.DartTokenizer",
6
+ "tokenization_dart.DartTokenizer"
7
+ ]
8
+ },
9
+ "added_tokens_decoder": {
10
+ "0": {
11
+ "content": "<|bos|>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false,
16
+ "special": true
17
+ },
18
+ "1": {
19
+ "content": "<|eos|>",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false,
24
+ "special": true
25
+ },
26
+ "2": {
27
+ "content": "<|pad|>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false,
32
+ "special": true
33
+ },
34
+ "3": {
35
+ "content": "<|unknown|>",
36
+ "lstrip": false,
37
+ "normalized": false,
38
+ "rstrip": false,
39
+ "single_word": false,
40
+ "special": true
41
+ },
42
+ "4": {
43
+ "content": "<rating>",
44
+ "lstrip": false,
45
+ "normalized": false,
46
+ "rstrip": false,
47
+ "single_word": false,
48
+ "special": true
49
+ },
50
+ "5": {
51
+ "content": "</rating>",
52
+ "lstrip": false,
53
+ "normalized": false,
54
+ "rstrip": false,
55
+ "single_word": false,
56
+ "special": true
57
+ },
58
+ "6": {
59
+ "content": "<copyright>",
60
+ "lstrip": false,
61
+ "normalized": false,
62
+ "rstrip": false,
63
+ "single_word": false,
64
+ "special": true
65
+ },
66
+ "7": {
67
+ "content": "</copyright>",
68
+ "lstrip": false,
69
+ "normalized": false,
70
+ "rstrip": false,
71
+ "single_word": false,
72
+ "special": true
73
+ },
74
+ "8": {
75
+ "content": "<character>",
76
+ "lstrip": false,
77
+ "normalized": false,
78
+ "rstrip": false,
79
+ "single_word": false,
80
+ "special": true
81
+ },
82
+ "9": {
83
+ "content": "</character>",
84
+ "lstrip": false,
85
+ "normalized": false,
86
+ "rstrip": false,
87
+ "single_word": false,
88
+ "special": true
89
+ },
90
+ "10": {
91
+ "content": "<general>",
92
+ "lstrip": false,
93
+ "normalized": false,
94
+ "rstrip": false,
95
+ "single_word": false,
96
+ "special": true
97
+ },
98
+ "11": {
99
+ "content": "</general>",
100
+ "lstrip": false,
101
+ "normalized": false,
102
+ "rstrip": false,
103
+ "single_word": false,
104
+ "special": true
105
+ },
106
+ "12": {
107
+ "content": "<|input_end|>",
108
+ "lstrip": false,
109
+ "normalized": false,
110
+ "rstrip": false,
111
+ "single_word": false,
112
+ "special": true
113
+ },
114
+ "13": {
115
+ "content": "<|very_short|>",
116
+ "lstrip": false,
117
+ "normalized": false,
118
+ "rstrip": false,
119
+ "single_word": false,
120
+ "special": true
121
+ },
122
+ "14": {
123
+ "content": "<|short|>",
124
+ "lstrip": false,
125
+ "normalized": false,
126
+ "rstrip": false,
127
+ "single_word": false,
128
+ "special": true
129
+ },
130
+ "15": {
131
+ "content": "<|long|>",
132
+ "lstrip": false,
133
+ "normalized": false,
134
+ "rstrip": false,
135
+ "single_word": false,
136
+ "special": true
137
+ },
138
+ "16": {
139
+ "content": "<|very_long|>",
140
+ "lstrip": false,
141
+ "normalized": false,
142
+ "rstrip": false,
143
+ "single_word": false,
144
+ "special": true
145
+ },
146
+ "17": {
147
+ "content": "<|reserved_5|>",
148
+ "lstrip": false,
149
+ "normalized": false,
150
+ "rstrip": false,
151
+ "single_word": false,
152
+ "special": true
153
+ },
154
+ "18": {
155
+ "content": "<|reserved_6|>",
156
+ "lstrip": false,
157
+ "normalized": false,
158
+ "rstrip": false,
159
+ "single_word": false,
160
+ "special": true
161
+ },
162
+ "19": {
163
+ "content": "<|reserved_7|>",
164
+ "lstrip": false,
165
+ "normalized": false,
166
+ "rstrip": false,
167
+ "single_word": false,
168
+ "special": true
169
+ },
170
+ "20": {
171
+ "content": "<|reserved_8|>",
172
+ "lstrip": false,
173
+ "normalized": false,
174
+ "rstrip": false,
175
+ "single_word": false,
176
+ "special": true
177
+ },
178
+ "21": {
179
+ "content": "<|reserved_9|>",
180
+ "lstrip": false,
181
+ "normalized": false,
182
+ "rstrip": false,
183
+ "single_word": false,
184
+ "special": true
185
+ },
186
+ "22": {
187
+ "content": "<|reserved_10|>",
188
+ "lstrip": false,
189
+ "normalized": false,
190
+ "rstrip": false,
191
+ "single_word": false,
192
+ "special": true
193
+ },
194
+ "23": {
195
+ "content": "<|reserved_11|>",
196
+ "lstrip": false,
197
+ "normalized": false,
198
+ "rstrip": false,
199
+ "single_word": false,
200
+ "special": true
201
+ },
202
+ "24": {
203
+ "content": "<|reserved_12|>",
204
+ "lstrip": false,
205
+ "normalized": false,
206
+ "rstrip": false,
207
+ "single_word": false,
208
+ "special": true
209
+ },
210
+ "25": {
211
+ "content": "<|reserved_13|>",
212
+ "lstrip": false,
213
+ "normalized": false,
214
+ "rstrip": false,
215
+ "single_word": false,
216
+ "special": true
217
+ },
218
+ "26": {
219
+ "content": "<|reserved_14|>",
220
+ "lstrip": false,
221
+ "normalized": false,
222
+ "rstrip": false,
223
+ "single_word": false,
224
+ "special": true
225
+ },
226
+ "27": {
227
+ "content": "<|reserved_15|>",
228
+ "lstrip": false,
229
+ "normalized": false,
230
+ "rstrip": false,
231
+ "single_word": false,
232
+ "special": true
233
+ },
234
+ "28": {
235
+ "content": "<|reserved_16|>",
236
+ "lstrip": false,
237
+ "normalized": false,
238
+ "rstrip": false,
239
+ "single_word": false,
240
+ "special": true
241
+ },
242
+ "29": {
243
+ "content": "<|reserved_17|>",
244
+ "lstrip": false,
245
+ "normalized": false,
246
+ "rstrip": false,
247
+ "single_word": false,
248
+ "special": true
249
+ },
250
+ "30": {
251
+ "content": "<|reserved_18|>",
252
+ "lstrip": false,
253
+ "normalized": false,
254
+ "rstrip": false,
255
+ "single_word": false,
256
+ "special": true
257
+ },
258
+ "31": {
259
+ "content": "<|reserved_19|>",
260
+ "lstrip": false,
261
+ "normalized": false,
262
+ "rstrip": false,
263
+ "single_word": false,
264
+ "special": true
265
+ },
266
+ "32": {
267
+ "content": "<|reserved_20|>",
268
+ "lstrip": false,
269
+ "normalized": false,
270
+ "rstrip": false,
271
+ "single_word": false,
272
+ "special": true
273
+ },
274
+ "33": {
275
+ "content": "<|reserved_21|>",
276
+ "lstrip": false,
277
+ "normalized": false,
278
+ "rstrip": false,
279
+ "single_word": false,
280
+ "special": true
281
+ },
282
+ "34": {
283
+ "content": "<|reserved_22|>",
284
+ "lstrip": false,
285
+ "normalized": false,
286
+ "rstrip": false,
287
+ "single_word": false,
288
+ "special": true
289
+ },
290
+ "35": {
291
+ "content": "<|reserved_23|>",
292
+ "lstrip": false,
293
+ "normalized": false,
294
+ "rstrip": false,
295
+ "single_word": false,
296
+ "special": true
297
+ },
298
+ "36": {
299
+ "content": "<|reserved_24|>",
300
+ "lstrip": false,
301
+ "normalized": false,
302
+ "rstrip": false,
303
+ "single_word": false,
304
+ "special": true
305
+ },
306
+ "37": {
307
+ "content": "<|reserved_25|>",
308
+ "lstrip": false,
309
+ "normalized": false,
310
+ "rstrip": false,
311
+ "single_word": false,
312
+ "special": true
313
+ },
314
+ "38": {
315
+ "content": "<|reserved_26|>",
316
+ "lstrip": false,
317
+ "normalized": false,
318
+ "rstrip": false,
319
+ "single_word": false,
320
+ "special": true
321
+ },
322
+ "39": {
323
+ "content": "<|reserved_27|>",
324
+ "lstrip": false,
325
+ "normalized": false,
326
+ "rstrip": false,
327
+ "single_word": false,
328
+ "special": true
329
+ },
330
+ "40": {
331
+ "content": "<|reserved_28|>",
332
+ "lstrip": false,
333
+ "normalized": false,
334
+ "rstrip": false,
335
+ "single_word": false,
336
+ "special": true
337
+ },
338
+ "41": {
339
+ "content": "<|reserved_29|>",
340
+ "lstrip": false,
341
+ "normalized": false,
342
+ "rstrip": false,
343
+ "single_word": false,
344
+ "special": true
345
+ },
346
+ "42": {
347
+ "content": "<|reserved_30|>",
348
+ "lstrip": false,
349
+ "normalized": false,
350
+ "rstrip": false,
351
+ "single_word": false,
352
+ "special": true
353
+ },
354
+ "43": {
355
+ "content": "<|reserved_31|>",
356
+ "lstrip": false,
357
+ "normalized": false,
358
+ "rstrip": false,
359
+ "single_word": false,
360
+ "special": true
361
+ }
362
+ },
363
+ "bos_token": "<|bos|>",
364
+ "clean_up_tokenization_spaces": true,
365
+ "eos_token": "<|eos|>",
366
+ "max_length": null,
367
+ "model_max_length": 1000000000000000019884624838656,
368
+ "pad_to_multiple_of": null,
369
+ "pad_token": "<|pad|>",
370
+ "pad_token_type_id": 0,
371
+ "padding_side": "right",
372
+ "unk_token": "<|unknown|>"
373
+ }