KaleiNeely committed on
Commit
551c1fa
1 Parent(s): 42b5867

Update tokenization_rwkv_world.py

Files changed (1)
  1. tokenization_rwkv_world.py +221 -87
tokenization_rwkv_world.py CHANGED
@@ -12,38 +12,20 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
- """Tokenization classes for OpenAI GPT."""

  import json
  import os
  from typing import TYPE_CHECKING, List, Optional, Tuple, Union
- from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
- from transformers.utils import logging, to_py_obj
- from transformers.tokenization_utils_base import BatchEncoding
-
- import bisect
- import itertools
- import re
- import unicodedata
- from collections import OrderedDict
- from typing import Any, Dict, List, Optional, Tuple, Union, overload

  from transformers.tokenization_utils_base import (
- ENCODE_KWARGS_DOCSTRING,
- ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
- INIT_TOKENIZER_DOCSTRING,
- AddedToken,
  BatchEncoding,
  EncodedInput,
- EncodedInputPair,
- PreTokenizedInput,
- PreTokenizedInputPair,
- PreTrainedTokenizerBase,
  TextInput,
- TextInputPair,
  TruncationStrategy,
  )
- from transformers.utils import PaddingStrategy, TensorType, add_end_docstrings, logging


  if TYPE_CHECKING:
@@ -54,11 +36,18 @@ logger = logging.get_logger(__name__)
  VOCAB_FILES_NAMES = {
  "vocab_file": "rwkv_vocab_v20230424.txt",
  }

  class TRIE:
  __slots__ = tuple("ch,to,values,front".split(","))
- to:list
- values:set
  def __init__(self, front=None, ch=None):
  self.ch = ch
  self.to = [None for ch in range(256)]
@@ -68,67 +57,59 @@ class TRIE:
  def __repr__(self):
  fr = self
  ret = []
- while(fr!=None):
- if(fr.ch!=None):
  ret.append(fr.ch)
  fr = fr.front
- return "<TRIE %s %s>"%(ret[::-1], self.values)
-
- def add(self, key:bytes, idx:int=0, val=None):
- if(idx == len(key)):
- if(val is None):
  val = key
  self.values.add(val)
  return self
  ch = key[idx]
- if(self.to[ch] is None):
  self.to[ch] = TRIE(front=self, ch=ch)
- return self.to[ch].add(key, idx=idx+1, val=val)
-
- def find_longest(self, key:bytes, idx:int=0):
- u:TRIE = self
- ch:int = key[idx]
-
- while(u.to[ch] is not None):
  u = u.to[ch]
  idx += 1
- if(u.values):
  ret = idx, u, u.values
- if(idx==len(key)):
  break
  ch = key[idx]
  return ret

  class RWKVWorldTokenizer(PreTrainedTokenizer):
  vocab_files_names = VOCAB_FILES_NAMES
  model_input_names = ["input_ids", "attention_mask"]

- def __init__(
- self,
- vocab_file,
- errors="replace",
- **kwargs
- ):
  self.add_bos_token = False
  self.encoder = {}
- sorted = [] # must be already sorted
  with open(vocab_file, "r", encoding="utf-8") as f:
  lines = f.readlines()
  for l in lines:
- idx = int(l[:l.index(' ')])
- x = eval(l[l.index(' '):l.rindex(' ')])
  x = x.encode("utf-8") if isinstance(x, str) else x
  assert isinstance(x, bytes)
- assert len(x) == int(l[l.rindex(' '):])
  sorted += [x]
  self.encoder[idx] = x

- super().__init__(
- errors=errors,
- **kwargs,
- )
  self.decoder = {}
- for k,v in self.encoder.items():
  self.decoder[v] = int(k)

  self.trie = TRIE()
@@ -136,6 +117,23 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
  _ = self.trie.add(t, val=(t, i))
  self.errors = errors # how to handle errors in decoding
  self.cache = {}

  @property
  def vocab_size(self):
@@ -144,6 +142,22 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
  def get_vocab(self):
  return dict(self.encoder, **self.added_tokens_encoder)

  def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
  if self.add_bos_token:
  bos_token_ids = [self.bos_token_id]
@@ -158,8 +172,7 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
  return output + bos_token_ids + token_ids_1

  def get_special_tokens_mask(
- self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None,
- already_has_special_tokens: bool = False
  ) -> List[int]:
  """
  Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
@@ -190,19 +203,19 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
  return [1] + ([0] * len(token_ids_0))
  return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))

- def encodeBytes(self, src:bytes):
- idx:int = 0
  tokens = []
- while (idx < len(src)):
- _idx:int = idx
  idx, _, values = self.trie.find_longest(src, idx)
- assert(idx != _idx)
- _, token = next(iter(values))
  tokens.append(token)
  return tokens
-
  def decodeBytes(self, tokens):
- return b''.join(map(lambda i: self.encoder[i], tokens))

  def _tokenize(self, text, **kwargs):
  """Tokenize a string."""
@@ -210,23 +223,30 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):

  def _decode_tokens(self, tokens):
  try:
- return self.decodeBytes(tokens).decode('utf-8')
- except:
- return '\ufffd' # bad utf-8

- def _decode(self,
- token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
- skip_special_tokens: bool = False,
- **kwargs
- ) -> str:

  # Convert inputs to python lists
  token_ids = to_py_obj(token_ids)
  if isinstance(token_ids, int):
  if token_ids in self.all_special_ids and skip_special_tokens:
  return ""
  return self.encoder.get(token_ids, self.unk_token)
  elif isinstance(token_ids, list):
  out_str = ""
  out_last = 0
  out_tokens = []
@@ -235,7 +255,7 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
  break
  out_tokens += [token]
  tmp = self._decode_tokens(out_tokens[out_last:])
- if '\ufffd' not in tmp:
  out_str += tmp
  out_last = i + 1
  return out_str
@@ -268,6 +288,11 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
  def prepare_for_tokenization(self, text, **kwargs):
  return (text, kwargs)

  def _encode_plus(
  self,
  text: Union[TextInput, EncodedInput],
@@ -285,16 +310,29 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
  return_offsets_mapping: bool = False,
  return_length: bool = False,
  verbose: bool = True,
- **kwargs
  ) -> BatchEncoding:
- def get_input_ids(text):
  if isinstance(text, str):
- text_id = self._tokenize(text)
- return text_id
  elif isinstance(text, list) and len(text) > 0 and isinstance(text[0], str):
- return [self._tokenize(t) for t in text]
  elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
  return text
  else:
  raise ValueError(
  "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
@@ -350,16 +388,29 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
  return_offsets_mapping: bool = False,
  return_length: bool = False,
  verbose: bool = True,
- **kwargs
  ) -> BatchEncoding:
- def get_input_ids(text):
  if isinstance(text, str):
- text_id = self._tokenize(text)
- return text_id
  elif isinstance(text, list) and len(text) > 0 and isinstance(text[0], str):
- return [self._tokenize(t) for t in text]
  elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
  return text
  else:
  raise ValueError(
  "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
@@ -372,15 +423,29 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
  "transformers.PreTrainedTokenizerFast."
  )

- input_ids = []
  for ids_or_pair_ids in batch_text_or_text_pairs:
  if not isinstance(ids_or_pair_ids, (list, tuple)):
  ids, pair_ids = ids_or_pair_ids, None
  else:
  ids, pair_ids = ids_or_pair_ids
-
  first_ids = get_input_ids(ids)
  second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
  input_ids.append((first_ids, second_ids))

  batch_outputs = self._batch_prepare_for_model(
@@ -402,10 +467,79 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):

  return BatchEncoding(batch_outputs)

  def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
  input_ids = []
  for is_user, text in conversation.iter_texts():
  input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id])
  if len(input_ids) > self.model_max_length:
- input_ids = input_ids[-self.model_max_length:]
  return input_ids
 
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+ """Tokenization classes for RWKV5."""

  import json
  import os
  from typing import TYPE_CHECKING, List, Optional, Tuple, Union

+ from transformers.tokenization_utils import PreTrainedTokenizer
  from transformers.tokenization_utils_base import (
  BatchEncoding,
  EncodedInput,
  TextInput,
  TruncationStrategy,
  )
+ from transformers.utils import PaddingStrategy, TensorType, logging, to_py_obj


  if TYPE_CHECKING:

  VOCAB_FILES_NAMES = {
  "vocab_file": "rwkv_vocab_v20230424.txt",
  }
+ PRETRAINED_VOCAB_FILES_MAP = {
+ "vocab_file": {
+ "RWKV/rwkv-5-world-169m": "https://huggingface.co/RWKV/rwkv-5-world-169m/blob/main/rwkv_vocab_v20230424.txt",
+ },
+ }
+

  class TRIE:
  __slots__ = tuple("ch,to,values,front".split(","))
+ to: list
+ values: set
+
  def __init__(self, front=None, ch=None):
  self.ch = ch
  self.to = [None for ch in range(256)]

  def __repr__(self):
  fr = self
  ret = []
+ while fr is not None:
+ if fr.ch is not None:
  ret.append(fr.ch)
  fr = fr.front
+ return "<TRIE %s %s>" % (ret[::-1], self.values)
+
+ def add(self, key: bytes, idx: int = 0, val=None):
+ if idx == len(key):
+ if val is None:
  val = key
  self.values.add(val)
  return self
  ch = key[idx]
+ if self.to[ch] is None:
  self.to[ch] = TRIE(front=self, ch=ch)
+ return self.to[ch].add(key, idx=idx + 1, val=val)
+
+ def find_longest(self, key: bytes, idx: int = 0):
+ u: TRIE = self
+ ch: int = key[idx]
+
+ while u.to[ch] is not None:
  u = u.to[ch]
  idx += 1
+ if u.values:
  ret = idx, u, u.values
+ if idx == len(key):
  break
  ch = key[idx]
  return ret

+
  class RWKVWorldTokenizer(PreTrainedTokenizer):
  vocab_files_names = VOCAB_FILES_NAMES
  model_input_names = ["input_ids", "attention_mask"]

+ def __init__(self, vocab_file, errors="replace", pad_token="0", **kwargs):
  self.add_bos_token = False
  self.encoder = {}
+ sorted = [] # must be already sorted
  with open(vocab_file, "r", encoding="utf-8") as f:
  lines = f.readlines()
  for l in lines:
+ idx = int(l[: l.index(" ")])
+ x = eval(l[l.index(" ") : l.rindex(" ")])
  x = x.encode("utf-8") if isinstance(x, str) else x
  assert isinstance(x, bytes)
+ assert len(x) == int(l[l.rindex(" ") :])
  sorted += [x]
  self.encoder[idx] = x

  self.decoder = {}
+ for k, v in self.encoder.items():
  self.decoder[v] = int(k)

  self.trie = TRIE()
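A minimal standalone sketch of what the vocabulary parsing and TRIE above implement, assuming a made-up three-entry vocabulary in the same "<id> <repr> <byte length>" line format as rwkv_vocab_v20230424.txt; the trie is replaced by a plain dictionary scan for brevity, but the greedy longest-match result is the same:

# Toy vocabulary lines in the same format the constructor above parses.
toy_vocab_lines = [
    "1 'a' 1",
    "2 'ab' 2",
    "3 'c' 1",
]

encoder = {}
for line in toy_vocab_lines:
    idx = int(line[: line.index(" ")])
    piece = eval(line[line.index(" ") : line.rindex(" ")])  # str or bytes literal
    piece = piece.encode("utf-8") if isinstance(piece, str) else piece
    assert len(piece) == int(line[line.rindex(" ") :])
    encoder[idx] = piece


def encode_bytes(src: bytes, encoder: dict) -> list:
    # Greedy longest match over raw bytes, like TRIE.find_longest inside encodeBytes.
    pieces = {v: k for k, v in encoder.items()}
    tokens, idx = [], 0
    while idx < len(src):
        best, end = None, idx + 1
        for j in range(idx + 1, len(src) + 1):  # keep the longest prefix present in the vocab
            if src[idx:j] in pieces:
                best, end = pieces[src[idx:j]], j
        assert best is not None, "every byte must be covered by the vocabulary"
        tokens.append(best)
        idx = end
    return tokens


print(encode_bytes(b"abca", encoder))  # [2, 3, 1] -- "ab" wins over "a" at position 0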
 
  _ = self.trie.add(t, val=(t, i))
  self.errors = errors # how to handle errors in decoding
  self.cache = {}
+ self.first_max_length = 0
+ super().__init__(
+ errors=errors,
+ **kwargs,
+ )
+
+ @property
+ def eos_token_id(self) -> Optional[int]:
+ return 0
+
+ @property
+ def eot_token_id(self) -> Optional[int]:
+ return 0
+
+ @property
+ def pad_token_id(self) -> Optional[int]:
+ return 0

  @property
  def vocab_size(self):

  def get_vocab(self):
  return dict(self.encoder, **self.added_tokens_encoder)

+ def add_tokens(self, new_tokens, special_tokens: bool = False):
+ for token in new_tokens:
+ token_id = self.convert_tokens_to_ids(token)
+ self.added_tokens_decoder[token_id] = token
+
+ def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
+ if isinstance(ids, int):
+ ids = [ids]
+ tokens = []
+ for id_ in ids:
+ if id_ in self.added_tokens_decoder:
+ tokens.append(self.added_tokens_decoder[id_])
+ else:
+ tokens.append(self._convert_id_to_token(id_))
+ return tokens
+
  def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
  if self.add_bos_token:
  bos_token_ids = [self.bos_token_id]

  return output + bos_token_ids + token_ids_1

  def get_special_tokens_mask(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
  ) -> List[int]:
  """
  Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding

  return [1] + ([0] * len(token_ids_0))
  return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))

+ def encodeBytes(self, src: bytes):
+ idx: int = 0
  tokens = []
+ while idx < len(src):
+ _idx: int = idx
  idx, _, values = self.trie.find_longest(src, idx)
+ assert idx != _idx
+ _, token = next(iter(values))
  tokens.append(token)
  return tokens
+
  def decodeBytes(self, tokens):
+ return b"".join(map(lambda i: self.encoder[i], tokens)) # noqa

  def _tokenize(self, text, **kwargs):
  """Tokenize a string."""


  def _decode_tokens(self, tokens):
  try:
+ return self.decodeBytes(tokens).decode("utf-8")
+ except Exception:
+ return "\ufffd" # bad utf-8

+ def _decode(
+ self,
+ token_ids: Union[int, List[int]],
+ skip_special_tokens: bool = False,
+ **kwargs,
+ ) -> str:
+ def remove_zeros_from_first_segment(token_ids, first_max_length):
+ first_segment = token_ids[:first_max_length]
+ first_segment_cleaned = [token for token in first_segment if token != 0]
+ return first_segment_cleaned + token_ids[first_max_length:]

  # Convert inputs to python lists
  token_ids = to_py_obj(token_ids)
+ token_ids = remove_zeros_from_first_segment(token_ids, self.first_max_length)
  if isinstance(token_ids, int):
  if token_ids in self.all_special_ids and skip_special_tokens:
  return ""
  return self.encoder.get(token_ids, self.unk_token)
  elif isinstance(token_ids, list):
+ self.first_max_length
  out_str = ""
  out_last = 0
  out_tokens = []

  break
  out_tokens += [token]
  tmp = self._decode_tokens(out_tokens[out_last:])
+ if "\ufffd" not in tmp:
  out_str += tmp
  out_last = i + 1
  return out_str
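The decoding loop above commits output only once the accumulated bytes form valid UTF-8, so a multi-byte character split across tokens is never emitted half-finished. A self-contained sketch of the same idea, using invented byte pieces and an explicit UnicodeDecodeError check instead of the '\ufffd' marker:

# Hypothetical token byte pieces; the euro sign is split across two of them.
pieces = [b"price: ", b"\xe2\x82", b"\xac", b"100"]

out_str, out_last = "", 0
for i in range(len(pieces)):
    chunk = b"".join(pieces[out_last : i + 1])
    try:
        tmp = chunk.decode("utf-8")
    except UnicodeDecodeError:
        continue  # incomplete character so far; wait for the next token's bytes
    out_str += tmp
    out_last = i + 1

print(out_str)  # price: €100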
 
  def prepare_for_tokenization(self, text, **kwargs):
  return (text, kwargs)

+ def _get_padding_truncation_strategies(
+ self, padding=False, truncation=None, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
+ ):
+ return PaddingStrategy.LONGEST, TruncationStrategy.DO_NOT_TRUNCATE, -1, kwargs
+
  def _encode_plus(
  self,
  text: Union[TextInput, EncodedInput],

  return_offsets_mapping: bool = False,
  return_length: bool = False,
  verbose: bool = True,
+ **kwargs,
  ) -> BatchEncoding:
+ def get_input_ids(text, max_length=None, pad_token_id=0):
+ def pad_sequence(seq, max_len, pad_tok):
+ return [pad_tok] * (max_len - len(seq)) + seq
+
  if isinstance(text, str):
+ tokens = self._tokenize(text)
+ if max_length is not None:
+ tokens = pad_sequence(tokens, max_length, pad_token_id)
+ return tokens
+
  elif isinstance(text, list) and len(text) > 0 and isinstance(text[0], str):
+ tokenized_texts = [self._tokenize(t) for t in text]
+ if max_length is None:
+ max_length = max(len(t) for t in tokenized_texts)
+ return [pad_sequence(t, max_length, pad_token_id) for t in tokenized_texts]
+
  elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
+ if max_length is not None and len(text) < max_length:
+ return pad_sequence(text, max_length, pad_token_id)
  return text
+
  else:
  raise ValueError(
  "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."

  return_offsets_mapping: bool = False,
  return_length: bool = False,
  verbose: bool = True,
+ **kwargs,
  ) -> BatchEncoding:
+ def get_input_ids(text, max_length=None, pad_token_id=0):
+ def pad_sequence(seq, max_len, pad_tok):
+ return [pad_tok] * (max_len - len(seq)) + seq
+
  if isinstance(text, str):
+ tokens = self._tokenize(text)
+ if max_length is not None:
+ tokens = pad_sequence(tokens, max_length, pad_token_id)
+ return tokens
+
  elif isinstance(text, list) and len(text) > 0 and isinstance(text[0], str):
+ tokenized_texts = [self._tokenize(t) for t in text]
+ if max_length is None:
+ max_length = max(len(t) for t in tokenized_texts)
+ return [pad_sequence(t, max_length, pad_token_id) for t in tokenized_texts]
+
  elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
+ if max_length is not None and len(text) < max_length:
+ return pad_sequence(text, max_length, pad_token_id)
  return text
+
  else:
  raise ValueError(
  "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."

  "transformers.PreTrainedTokenizerFast."
  )

+ first_max_length = 0
+ second_max_length = 0
  for ids_or_pair_ids in batch_text_or_text_pairs:
  if not isinstance(ids_or_pair_ids, (list, tuple)):
  ids, pair_ids = ids_or_pair_ids, None
  else:
  ids, pair_ids = ids_or_pair_ids
  first_ids = get_input_ids(ids)
  second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
+ first_max_length = max(first_max_length, len(first_ids))
+ if second_ids is not None:
+ second_max_length = max(second_max_length, len(second_ids))
+
+ self.first_max_length = first_max_length
+ input_ids = []
+ for ids_or_pair_ids in batch_text_or_text_pairs:
+ if not isinstance(ids_or_pair_ids, (list, tuple)):
+ ids, pair_ids = ids_or_pair_ids, None
+ else:
+ ids, pair_ids = ids_or_pair_ids
+
+ first_ids = get_input_ids(ids, max_length=first_max_length)
+ second_ids = get_input_ids(pair_ids, max_length=second_max_length) if pair_ids is not None else None
  input_ids.append((first_ids, second_ids))

  batch_outputs = self._batch_prepare_for_model(
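In a self-contained sketch, the two passes above do the following: the first pass measures the longest first-segment length in the batch, the second pass left-pads every sequence with token id 0 to that length, and _decode later removes those leading zeros again through self.first_max_length (the token ids here are invented):

# Pass 1: find the longest sequence; pass 2: left-pad the others with 0.
batch = [[11, 22], [33, 44, 55, 66], [77]]

first_max_length = max(len(seq) for seq in batch)  # 4
padded = [[0] * (first_max_length - len(seq)) + seq for seq in batch]
# padded == [[0, 0, 11, 22], [33, 44, 55, 66], [0, 0, 0, 77]]

# What _decode's remove_zeros_from_first_segment undoes before detokenizing.
restored = [[t for t in seq[:first_max_length] if t != 0] + seq[first_max_length:] for seq in padded]
assert restored == batch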
 

  return BatchEncoding(batch_outputs)

+ def decode(
+ self,
+ token_ids: Union[int, List[int]],
+ skip_special_tokens: bool = False,
+ clean_up_tokenization_spaces: bool = None,
+ **kwargs,
+ ) -> str:
+ """
+ Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
+ tokens and clean up tokenization spaces.
+
+ Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.
+
+ Args:
+ token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
+ List of tokenized input ids. Can be obtained using the `__call__` method.
+ skip_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not to remove special tokens in the decoding.
+ clean_up_tokenization_spaces (`bool`, *optional*):
+ Whether or not to clean up the tokenization spaces. If `None`, will default to
+ `self.clean_up_tokenization_spaces`.
+ kwargs (additional keyword arguments, *optional*):
+ Will be passed to the underlying model specific decode method.
+
+ Returns:
+ `str`: The decoded sentence.
+ """
+ # Convert inputs to python lists
+ return self._decode(
+ token_ids=token_ids,
+ skip_special_tokens=skip_special_tokens,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ **kwargs,
+ )
+
+ def batch_decode(
+ self,
+ sequences: Union[List[int], List[List[int]]],
+ skip_special_tokens: bool = False,
+ clean_up_tokenization_spaces: bool = None,
+ **kwargs,
+ ) -> List[str]:
+ """
+ Convert a list of lists of token ids into a list of strings by calling decode.
+
+ Args:
+ sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
+ List of tokenized input ids. Can be obtained using the `__call__` method.
+ skip_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not to remove special tokens in the decoding.
+ clean_up_tokenization_spaces (`bool`, *optional*):
+ Whether or not to clean up the tokenization spaces. If `None`, will default to
+ `self.clean_up_tokenization_spaces`.
+ kwargs (additional keyword arguments, *optional*):
+ Will be passed to the underlying model specific decode method.
+
+ Returns:
+ `List[str]`: The list of decoded sentences.
+ """
+ return [
+ self.decode(
+ seq,
+ skip_special_tokens=skip_special_tokens,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ **kwargs,
+ )
+ for seq in sequences
+ ]
+
  def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
  input_ids = []
  for is_user, text in conversation.iter_texts():
  input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id])
  if len(input_ids) > self.model_max_length:
+ input_ids = input_ids[-self.model_max_length :]
  return input_ids
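For context, a sketch of how the updated tokenizer would typically be exercised once this commit lands, assuming the Hub repository wires the class up through tokenizer_config.json and is loaded with trust_remote_code=True (model id and prompts are illustrative):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("RWKV/rwkv-5-world-169m", trust_remote_code=True)

# Shorter sequences are left-padded with token id 0 up to the longest one in the batch.
batch = tokenizer(["Hello", "A much longer prompt than the first one"], return_tensors="pt")
print(batch["input_ids"].shape)

# decode/_decode strip the leading zero padding again before turning bytes back into text.
print(tokenizer.batch_decode(batch["input_ids"]))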