feat: multi-cls strategy
#3 by jupyterjazz · opened

tokenizer.py  CHANGED  (+128 −50)
@@ -1,62 +1,140 @@

(The 62 removed lines of the previous implementation are not fully recoverable from this
view. The unchanged imports of torch, numpy and warnings are kept, the transformers import
gains RobertaTokenizerFast, and the old body, which built a BatchEncoding as
{'task_type_ids': ..., **batch_encoding} with tensor_type=kwargs.get('return_tensors'),
is replaced by the implementation below.)
import torch
import numpy as np
from transformers import RobertaTokenizer, BatchEncoding, RobertaTokenizerFast
import warnings


def get_tokenizer(parent_class):
    class TokenizerClass(parent_class):
        def __init__(self, *args, **kwargs):
            """
            This class dynamically extends a given tokenizer class from the HF
            Transformers library (RobertaTokenizer or RobertaTokenizerFast).
            The task_type_ids are used to pass instruction information to the model.
            A task_type should either be an integer or a sequence of integers with the
            same length as the batch size.
            """
            super().__init__(*args, **kwargs)
            self.cls_token_interval = kwargs.get('cls_token_interval')

        def __call__(self, *args, task_type=None, **kwargs):
            return super().__call__(*args, **kwargs)

        def _encode_plus(self, *args, **kwargs):
            return self._process_encoding(super()._encode_plus(*args, **kwargs), **kwargs)

        def _batch_encode_plus(self, *args, **kwargs):
            return self._process_encoding(
                super()._batch_encode_plus(*args, **kwargs), **kwargs
            )

        def _process_encoding(self, batch_encoding: BatchEncoding, **kwargs):
            task_type = kwargs.get("task_type")
            if self.cls_token_interval is not None:
                modified_input_ids, modified_attention_mask, modified_special_tokens_mask = self._insert_cls_tokens(
                    batch_encoding
                )
                batch_encoding["input_ids"] = modified_input_ids

                if "attention_mask" in batch_encoding:
                    batch_encoding["attention_mask"] = modified_attention_mask

                if "special_tokens_mask" in batch_encoding:
                    batch_encoding["special_tokens_mask"] = modified_special_tokens_mask

            if task_type is not None:
                batch_encoding = self._add_task_type_ids(
                    batch_encoding, task_type, kwargs.get('return_tensors')
                )

            return BatchEncoding(batch_encoding, tensor_type=kwargs.get("return_tensors"))

        def _insert_cls_tokens(self, batch_encoding: BatchEncoding):
            cls_token_id = self.cls_token_id
            new_input_ids = []
            new_attention_masks = []
            new_special_tokens_masks = []

            sequences = batch_encoding["input_ids"].tolist()
            original_attention_masks = batch_encoding["attention_mask"].tolist()
            original_special_tokens_mask = batch_encoding["special_tokens_mask"].tolist()
            for seq_index, sequence in enumerate(sequences):
                original_sequence_length = sum(original_attention_masks[seq_index])
                num_cls_tokens_to_add = (
                    original_sequence_length - 1
                ) // self.cls_token_interval
                new_sequence_length = original_sequence_length + num_cls_tokens_to_add
                special_tokens_mask = original_special_tokens_mask[seq_index]
                modified_sequence = [sequence[0]]
                modified_special_tokens_mask = [special_tokens_mask[0]]
                for i in range(1, len(sequence), self.cls_token_interval):
                    modified_sequence.extend(sequence[i : i + self.cls_token_interval])
                    modified_special_tokens_mask.extend(
                        special_tokens_mask[i : i + self.cls_token_interval]
                    )

                    if i + self.cls_token_interval < len(sequence):
                        modified_sequence.append(cls_token_id)
                        modified_special_tokens_mask.append(1)

                new_input_ids.append(modified_sequence)
                new_attention_mask = [1] * new_sequence_length + [0] * (
                    len(modified_sequence) - new_sequence_length
                )
                new_special_tokens_masks.append(modified_special_tokens_mask)
                new_attention_masks.append(new_attention_mask)

            new_input_ids = torch.tensor(new_input_ids, dtype=torch.long)
            new_attention_masks = torch.tensor(new_attention_masks, dtype=torch.long)
            new_special_tokens_masks = torch.tensor(new_special_tokens_masks, dtype=torch.long)

            return new_input_ids, new_attention_masks, new_special_tokens_masks

        @classmethod
        def _add_task_type_ids(cls, batch_encoding, task_type, tensor_type):
            return BatchEncoding(
                {
                    'task_type_ids': cls._get_task_type_ids(batch_encoding, task_type),
                    **batch_encoding,
                },
                tensor_type=tensor_type,
            )

        @staticmethod
        def _get_task_type_ids(batch_encoding: BatchEncoding, task_type):

            def apply_task_type(m, x):
                x = torch.tensor(x)
                assert (
                    len(x.shape) == 0 or x.shape[0] == m.shape[0]
                ), 'The shape of task_type does not match the size of the batch.'
                return m * x if len(x.shape) == 0 else m * x[:, None]

            if isinstance(batch_encoding['input_ids'], torch.Tensor):
                shape = batch_encoding['input_ids'].shape
                return apply_task_type(torch.ones(shape, dtype=torch.long), task_type)
            else:
                try:
                    shape = torch.tensor(batch_encoding['input_ids']).shape
                except Exception:
                    raise ValueError(
                        "Unable to create tensor, you should probably "
                        "activate truncation and/or padding with "
                        "'padding=True' 'truncation=True' to have batched "
                        "tensors with the same length."
                    )
                if isinstance(batch_encoding['input_ids'], list):
                    return (
                        apply_task_type(torch.ones(shape, dtype=torch.long), task_type)
                    ).tolist()
                elif isinstance(batch_encoding['input_ids'], np.ndarray):
                    return (
                        apply_task_type(torch.ones(shape, dtype=torch.long), task_type)
                    ).numpy()
                else:
                    warnings.warn(
                        'input_ids is not a torch tensor, numpy array, or list. '
                        'Returning torch tensor'
                    )
                    return apply_task_type(torch.ones(shape, dtype=torch.long), task_type)

    return TokenizerClass


JinaTokenizer = get_tokenizer(RobertaTokenizer)
JinaTokenizerFast = get_tokenizer(RobertaTokenizerFast)