jinaai
/

jina-bert-flash-implementation

@@ -1,62 +1,94 @@
-import torch
-import numpy as np
-from transformers import RobertaTokenizer, BatchEncoding
 import warnings
 class JinaTokenizer(RobertaTokenizer):
     """RoBERTa tokenizer that can attach a ``task_type_ids`` entry to encodings.

     When a ``task_type`` is supplied, the returned ``BatchEncoding`` gains a
     ``task_type_ids`` entry with the same shape as ``input_ids`` in which every
     element equals ``task_type``. Without a ``task_type`` the encoding produced
     by the base class is returned unchanged.
     """

     def __init__(self, *args, task_type_vocab_size=6, **kwargs):
         """Initialise the base tokenizer and remember ``task_type_vocab_size``.

         Args:
             task_type_vocab_size: Stored on the instance; this class never
                 reads it back.
         """
         super().__init__(*args, **kwargs)
         self.task_type_vocab_size = task_type_vocab_size

     def __call__(self, *args, task_type=None, **kwargs):
         """Tokenize with the base class and optionally add ``task_type_ids``.

         ``task_type`` is consumed here and is not forwarded to the base
         ``__call__``.
         """
         batch_encoding = super().__call__(*args, **kwargs)
         return self._add_task_type_ids(
             batch_encoding, task_type, kwargs.get('return_tensors')
         )

     def _batch_encode_plus(self, *args, task_type=None, **kwargs):
         """Batch-encode with the base class and optionally add ``task_type_ids``."""
         batch_encoding = super()._batch_encode_plus(*args, **kwargs)
         return self._add_task_type_ids(
             batch_encoding, task_type, kwargs.get('return_tensors')
         )

     def _encode_plus(self, *args, task_type=None, **kwargs):
         """Encode one input with the base class and optionally add ``task_type_ids``."""
         batch_encoding = super()._encode_plus(*args, **kwargs)
         return self._add_task_type_ids(
             batch_encoding, task_type, kwargs.get('return_tensors')
         )

     def _add_task_type_ids(self, batch_encoding, task_type, tensor_type):
         """Return ``batch_encoding`` extended with ``task_type_ids``.

         Args:
             batch_encoding: Encoding produced by the base tokenizer.
             task_type: Value to fill ``task_type_ids`` with, or ``None``.
             tensor_type: Passed through as ``BatchEncoding(tensor_type=...)``.

         Returns:
             ``batch_encoding`` itself when ``task_type`` is ``None``; otherwise
             a new ``BatchEncoding`` whose first entry is ``task_type_ids``.
         """
         # Multiplying a tensor by None raises TypeError, so an absent task
         # type must leave the encoding alone rather than reach the helper.
         if task_type is None:
             return batch_encoding
         return BatchEncoding(
             {
                 'task_type_ids': self._get_task_type_ids(batch_encoding, task_type),
                 **batch_encoding,
             },
             tensor_type=tensor_type,
         )

     @staticmethod
     def _get_task_type_ids(batch_encoding: BatchEncoding, task_type: int):
         """Build ids shaped like ``input_ids`` with every element ``task_type``.

         The container type follows ``input_ids``: a torch tensor yields a
         tensor, a list yields nested lists, a numpy array yields an array. Any
         other type triggers a warning and yields a torch tensor.
         """
         input_ids = batch_encoding['input_ids']
         if isinstance(input_ids, torch.Tensor):
             shape = input_ids.shape
         else:
             shape = torch.tensor(input_ids).shape
         task_type_ids = torch.ones(shape, dtype=torch.long) * task_type

         if isinstance(input_ids, torch.Tensor):
             return task_type_ids
         if isinstance(input_ids, list):
             return task_type_ids.tolist()
         # np.array is a factory function, not a type; isinstance needs np.ndarray.
         if isinstance(input_ids, np.ndarray):
             return task_type_ids.numpy()
         warnings.warn(
             'input_ids is not a torch tensor, numpy array, or list. Returning torch tensor'
         )
         return task_type_ids

 import warnings
+import numpy as np
+import torch
+from transformers import BatchEncoding, RobertaTokenizer
 class JinaTokenizer(RobertaTokenizer):
     """RoBERTa tokenizer that can add task-type ids and periodic CLS tokens.

     Two optional post-processing steps run on every encoding returned by
     ``_encode_plus`` / ``_batch_encode_plus``:

     * when ``cls_token_interval`` is set, an extra CLS token is inserted
       between consecutive groups of ``cls_token_interval`` tokens that follow
       the first token, and the attention mask is rebuilt to match;
     * when a ``task_type`` is given, a ``task_type_ids`` entry shaped like
       ``input_ids`` and filled with that value is added.
     """

     def __init__(
         self, *args, task_type_vocab_size=6, cls_token_interval=None, **kwargs
     ):
         """Initialise the base tokenizer and store the two extra settings.

         Args:
             task_type_vocab_size: Stored on the instance; this class never
                 reads it back.
             cls_token_interval: Number of tokens between inserted CLS tokens,
                 or ``None`` to leave ``input_ids`` untouched.
         """
         super().__init__(*args, **kwargs)
         self.task_type_vocab_size = task_type_vocab_size
         self.cls_token_interval = cls_token_interval

     def __call__(self, *args, task_type=None, **kwargs):
         """Tokenize via the base class, passing ``task_type`` on as a keyword.

         NOTE(review): this relies on the base ``__call__`` handing unknown
         keywords down to ``_encode_plus`` / ``_batch_encode_plus`` - confirm
         against the installed transformers version.
         """
         kwargs["task_type"] = task_type
         return super().__call__(*args, **kwargs)

     def _encode_plus(self, *args, **kwargs):
         """Encode one input with the base class, then post-process it."""
         # Strip our private keyword so the base tokenizer is not handed an
         # argument it does not define.
         task_type = kwargs.pop("task_type", None)
         encoding = super()._encode_plus(*args, **kwargs)
         return self._process_encoding(encoding, task_type=task_type, **kwargs)

     def _batch_encode_plus(self, *args, **kwargs):
         """Batch-encode with the base class, then post-process the result."""
         task_type = kwargs.pop("task_type", None)
         encoding = super()._batch_encode_plus(*args, **kwargs)
         return self._process_encoding(encoding, task_type=task_type, **kwargs)

     def _process_encoding(self, batch_encoding: BatchEncoding, **kwargs):
         """Apply CLS insertion and task-type tagging to ``batch_encoding``.

         ``batch_encoding`` is modified in place and then re-wrapped.

         Args:
             batch_encoding: Encoding produced by the base tokenizer.
             **kwargs: Only ``task_type`` and ``return_tensors`` are read.

         Returns:
             A ``BatchEncoding`` built from the updated entries with
             ``tensor_type=kwargs.get("return_tensors")``.
         """
         task_type = kwargs.get("task_type")
         if self.cls_token_interval is not None:
             new_input_ids, new_attention_mask = self._insert_cls_tokens(
                 batch_encoding
             )
             batch_encoding["input_ids"] = new_input_ids
             # NOTE(review): only input_ids and attention_mask are lengthened;
             # any other per-token entry keeps its original length.
             if "attention_mask" in batch_encoding:
                 batch_encoding["attention_mask"] = new_attention_mask
         if task_type is not None:
             # Computed after CLS insertion so the shape matches the final ids.
             batch_encoding["task_type_ids"] = self._get_task_type_ids(
                 batch_encoding, task_type
             )
         return BatchEncoding(batch_encoding, tensor_type=kwargs.get("return_tensors"))

     def _insert_cls_tokens(self, batch_encoding: BatchEncoding):
         """Insert a CLS token between groups of ``cls_token_interval`` tokens.

         The first token of each sequence is kept as is; the remaining tokens
         are cut into groups of ``cls_token_interval`` and ``self.cls_token_id``
         is placed after every group that is followed by more tokens.

         Accepts ``input_ids`` / ``attention_mask`` as torch tensors, numpy
         arrays or plain lists, batched or as a single flat sequence. When
         ``attention_mask`` is absent every token counts as a real token.

         Returns:
             ``(input_ids, attention_mask)``: torch ``long`` tensors when
             ``input_ids`` was a tensor or array, plain lists when it was a
             list. A flat input gives flat outputs.
         """
         interval = self.cls_token_interval
         cls_token_id = self.cls_token_id

         input_ids = batch_encoding["input_ids"]
         # Tensors and numpy arrays expose ``tolist``; plain lists do not.
         from_tensors = hasattr(input_ids, "tolist")
         sequences = input_ids.tolist() if from_tensors else list(input_ids)

         masks = None
         if "attention_mask" in batch_encoding:
             raw_mask = batch_encoding["attention_mask"]
             masks = raw_mask.tolist() if hasattr(raw_mask, "tolist") else list(raw_mask)

         # An unbatched encoding is a flat list of ids; wrap it so one loop
         # serves both layouts, and unwrap again before returning.
         is_single = bool(sequences) and not isinstance(sequences[0], (list, tuple))
         if is_single:
             sequences = [sequences]
             if masks is not None:
                 masks = [masks]

         new_input_ids = []
         new_attention_masks = []
         for index, sequence in enumerate(sequences):
             real_length = sum(masks[index]) if masks is not None else len(sequence)

             # Slicing (not indexing) keeps an empty sequence from raising.
             modified_sequence = list(sequence[:1])
             for start in range(1, len(sequence), interval):
                 modified_sequence.extend(sequence[start : start + interval])
                 if start + interval < len(sequence):
                     modified_sequence.append(cls_token_id)
             new_input_ids.append(modified_sequence)

             # Real tokens plus the CLS tokens inserted among them. Clamped to
             # the new sequence length: when (len - 1) is a multiple of the
             # interval, no CLS follows the last group, so the unclamped count
             # would make the mask one element longer than the ids.
             # NOTE(review): assumes padding sits on the right - confirm.
             attended = min(
                 real_length + max(real_length - 1, 0) // interval,
                 len(modified_sequence),
             )
             new_attention_masks.append(
                 [1] * attended + [0] * (len(modified_sequence) - attended)
             )

         if is_single:
             new_input_ids = new_input_ids[0]
             new_attention_masks = new_attention_masks[0]
         if from_tensors:
             return (
                 torch.tensor(new_input_ids, dtype=torch.long),
                 torch.tensor(new_attention_masks, dtype=torch.long),
             )
         return new_input_ids, new_attention_masks

     @staticmethod
     def _get_task_type_ids(batch_encoding: BatchEncoding, task_type: int):
         """Build ids shaped like ``input_ids`` with every element ``task_type``.

         The container type follows ``input_ids``: a torch tensor yields a
         tensor, a list yields nested lists, a numpy array yields an array. Any
         other type triggers a warning and yields a torch tensor.
         """
         input_ids = batch_encoding["input_ids"]
         if isinstance(input_ids, torch.Tensor):
             shape = input_ids.shape
         else:
             shape = torch.tensor(input_ids).shape
         task_type_ids = torch.ones(shape, dtype=torch.long) * task_type

         if isinstance(input_ids, torch.Tensor):
             return task_type_ids
         if isinstance(input_ids, list):
             return task_type_ids.tolist()
         # np.array is a factory function, not a type; isinstance needs np.ndarray.
         if isinstance(input_ids, np.ndarray):
             return task_type_ids.numpy()
         warnings.warn(
             "input_ids is not a torch tensor, numpy array, or list. Returning torch tensor"
         )
         return task_type_ids