Upload 5 files

Browse files

Files changed (5) hide show

category_config.json +0 -0
special_tokens_map.json +6 -0
tokenization_dart.py +127 -0
tokenizer.json +0 -0
tokenizer_config.json +373 -0

category_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token": "<|bos|>",
+  "eos_token": "<|eos|>",
+  "pad_token": "<|pad|>",
+  "unk_token": "<|unknown|>"
+}

tokenization_dart.py ADDED Viewed

	@@ -0,0 +1,127 @@

+import logging
+import os
+import json
+from typing import Optional, Dict, List, Tuple, Union
+from pydantic.dataclasses import dataclass
+import numpy as np
+from numpy.typing import NDArray
+from transformers import PreTrainedTokenizerFast
+from tokenizers.decoders import Decoder
+logger = logging.getLogger(__name__)
# Maps the extra init argument of the tokenizer to the file name it is stored under.
VOCAB_FILES_NAMES = dict(category_config="category_config.json")

# Per-checkpoint download location of each extra vocab file.
# NOTE(review): the remote file name below (tag_category.json) differs from the
# local file name above (category_config.json) -- confirm which name the hub
# repository actually serves.
PRETRAINED_VOCAB_FILES_MAP = {
    "category_config": {
        "p1atdev/dart-tokenizer-v1": "https://huggingface.co/p1atdev/dart-tokenizer-v1/resolve/main/tag_category.json",
    },
}
@dataclass
class Category:
    """One tag category together with the ids of its opening and closing tokens."""

    # Name of the category.
    name: str
    # Id of the token that begins this category's section.
    bos_token_id: int
    # Id of the token that ends this category's section.
    eos_token_id: int
@dataclass
class TagCategoryConfig:
    """Category configuration loaded from the category config JSON file.

    Both mappings are keyed by the category id written as a string; the
    tokenizer looks entries up with ``str(category_id)``.
    """

    # Category id (as a string) -> description of that category.
    categories: Dict[str, Category]
    # Category id (as a string) -> ids of every token belonging to that category.
    category_to_token_ids: Dict[str, List[int]]
def load_tag_category_config(config_json: str):
    """Parse the JSON file at ``config_json`` into a ``TagCategoryConfig``.

    The top-level JSON object's members are passed to ``TagCategoryConfig``
    as keyword arguments.
    """
    # Opened in binary mode: the json module decodes the bytes itself.
    with open(config_json, "rb") as handle:
        payload = json.load(handle)
    return TagCategoryConfig(**payload)
class DartDecoder:
    """Decoder step that separates consecutive ordinary tokens with ``", "``.

    A token is emitted unchanged when it is the first token, when it is a
    special token, or when the token right before it is a special token.
    Every other token is emitted with a ``", "`` prefix.
    """

    def __init__(self, special_tokens: List[str]):
        """Store a private copy of the special-token strings.

        Args:
            special_tokens: Token strings that never receive, and never cause
                the following token to receive, a ``", "`` separator.
        """
        self.special_tokens = list(special_tokens)

    def decode_chain(self, tokens: List[str]) -> List[str]:
        """Return ``tokens`` with ``", "`` prefixed where a separator belongs.

        Args:
            tokens: Token strings in sequence order.

        Returns:
            A new list of the same length as ``tokens``.
        """
        # Build the lookup set per call so later changes to
        # ``self.special_tokens`` are still honoured, while each membership
        # test below is O(1) instead of a list scan.
        specials = set(self.special_tokens)

        new_tokens = []
        # Treat the start of the sequence like a special token so the first
        # token never gets a leading separator.
        previous_is_special = True
        for token in tokens:
            is_special = token in specials
            if is_special or previous_is_special:
                new_tokens.append(token)
            else:
                new_tokens.append(f", {token}")
            previous_is_special = is_special
        return new_tokens
class DartTokenizer(PreTrainedTokenizerFast):
    """Dart tokenizer.

    A ``PreTrainedTokenizerFast`` that installs ``DartDecoder`` as its decoder
    and keeps a lookup table from token id to tag-category id, built from the
    category config file.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP

    def __init__(self, category_config, **kwargs):
        """Create the tokenizer and load the category configuration.

        Args:
            category_config: Path of the category config JSON file.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizerFast``.
        """
        super().__init__(**kwargs)

        # Every added-vocabulary token is treated as special by the decoder.
        self._tokenizer.decoder = Decoder.custom(  # type: ignore
            DartDecoder(list(self.get_added_vocab().keys()))
        )

        self.category_config = load_tag_category_config(category_config)

        # token id -> category id; ids not listed in the config stay 0.
        # Allocated directly as uint8 instead of converting a float64 array.
        self._id_to_category_map = np.zeros(self.vocab_size, dtype="uint8")
        for (
            category_id,
            token_ids,
        ) in self.category_config.category_to_token_ids.items():
            # Config keys are strings; the table stores them as integers.
            self._id_to_category_map[token_ids] = int(category_id)

    def create_vocab_mask(self, value: int = 1):
        """Create a uint8 array of vocab size filled with the specified value."""
        return np.full(self.vocab_size, value).astype("uint8")

    def get_token_ids_in_category(self, category_id: Union[int, str]):
        """Get the token ids listed for the specified category."""
        return self.category_config.category_to_token_ids[str(category_id)]

    def get_category(self, category_id: Union[int, str]):
        """Get the ``Category`` entry of the specified category."""
        return self.category_config.categories[str(category_id)]

    def convert_ids_to_category_ids(self, token_ids: Union[int, List[int]]):
        """Get the category id(s) of the specified token id(s)."""
        return self._id_to_category_map[token_ids]

    def get_banned_tokens_mask(self, tokens: Union[str, List[str], int, List[int]]):
        """Build a vocab-sized mask that is 0 at the given tokens and 1 elsewhere.

        Args:
            tokens: A single token string, a single token id, or a list mixing
                token strings and token ids. Strings are converted with
                ``convert_tokens_to_ids``.

        Returns:
            A uint8 array of length ``vocab_size``.

        Raises:
            TypeError: If ``tokens`` is not a str, int or list, or if any
                entry does not resolve to an ``int`` id.
        """
        # Normalise a single token to a one-element list first, so that a bare
        # string goes through the same string -> id conversion as list entries.
        if isinstance(tokens, (str, int)):
            tokens = [tokens]
        if not isinstance(tokens, list):
            raise TypeError(
                f"tokens must be a str, an int or a list of them, got {type(tokens).__name__}"
            )

        token_ids = [
            self.convert_tokens_to_ids(token) if isinstance(token, str) else token
            for token in tokens
        ]
        # Explicit check instead of ``assert`` so it still runs under ``python -O``.
        if not all(isinstance(token_id, int) for token_id in token_ids):
            raise TypeError("every token must be a str or an int token id")

        mask = self.create_vocab_mask(value=1)
        mask[token_ids] = 0
        return mask

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,373 @@

+{
+  "tokenizer_class": "DartTokenizer",
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_dart.DartTokenizer",
+      "tokenization_dart.DartTokenizer"
+    ]
+  },
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|bos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<|eos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<|pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<|unknown|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<rating>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "</rating>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<copyright>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "</copyright>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<character>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "</character>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<general>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "</general>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<|input_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<|very_short|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<|short|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<|long|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<|very_long|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "17": {
+      "content": "<|reserved_5|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "18": {
+      "content": "<|reserved_6|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "19": {
+      "content": "<|reserved_7|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "20": {
+      "content": "<|reserved_8|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "21": {
+      "content": "<|reserved_9|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "22": {
+      "content": "<|reserved_10|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "23": {
+      "content": "<|reserved_11|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "24": {
+      "content": "<|reserved_12|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "25": {
+      "content": "<|reserved_13|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "26": {
+      "content": "<|reserved_14|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "27": {
+      "content": "<|reserved_15|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "28": {
+      "content": "<|reserved_16|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "29": {
+      "content": "<|reserved_17|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "30": {
+      "content": "<|reserved_18|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "31": {
+      "content": "<|reserved_19|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32": {
+      "content": "<|reserved_20|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "33": {
+      "content": "<|reserved_21|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "34": {
+      "content": "<|reserved_22|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "35": {
+      "content": "<|reserved_23|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "36": {
+      "content": "<|reserved_24|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "37": {
+      "content": "<|reserved_25|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "38": {
+      "content": "<|reserved_26|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "39": {
+      "content": "<|reserved_27|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "40": {
+      "content": "<|reserved_28|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "41": {
+      "content": "<|reserved_29|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "42": {
+      "content": "<|reserved_30|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "43": {
+      "content": "<|reserved_31|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|bos|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|eos|>",
+  "max_length": null,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_to_multiple_of": null,
+  "pad_token": "<|pad|>",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
+  "unk_token": "<|unknown|>"
+}