support-multiple-task-ids (#5)

Browse files

- support multiple task ids (6170b43f054d6e22f48dec300374eeaec5b70a75)
- add assertions and docs (4b6651909fb268865ca95db85a7ca214631cd5bf)

Co-authored-by: Michael Günther <michael-guenther@users.noreply.huggingface.co>

Files changed (1) hide show

tokenizer.py +42 -15

tokenizer.py CHANGED Viewed

@@ -5,19 +5,26 @@ import warnings
 class JinaTokenizer(RobertaTokenizer):
-    def __init__(self, *args, task_type_vocab_size=6, **kwargs):
         super().__init__(*args, **kwargs)
-        self.task_type_vocab_size = task_type_vocab_size
     def __call__(self, *args, task_type=None, **kwargs):
         batch_encoding = super().__call__(*args, **kwargs)
-        batch_encoding = BatchEncoding(
-            {
-                'task_type_ids': self._get_task_type_ids(batch_encoding, task_type),
-                **batch_encoding,
-            },
-            tensor_type=kwargs.get('return_tensors'),
-        )
         return batch_encoding
     def _batch_encode_plus(self, *args, task_type=None, **kwargs):
@@ -45,18 +52,38 @@ class JinaTokenizer(RobertaTokenizer):
         return batch_encoding
     @staticmethod
-    def _get_task_type_ids(batch_encoding: BatchEncoding, task_type: int):
         if isinstance(batch_encoding['input_ids'], torch.Tensor):
             shape = batch_encoding['input_ids'].shape
-            return torch.ones(shape, dtype=torch.long) * task_type
         else:
-            shape = torch.tensor(batch_encoding['input_ids']).shape
             if isinstance(batch_encoding['input_ids'], list):
-                return (torch.ones(shape, dtype=torch.long) * task_type).tolist()
             elif isinstance(batch_encoding['input_ids'], np.array):
-                return (torch.ones(shape, dtype=torch.long) * task_type).numpy()
             else:
                 warnings.warn(
                     'input_ids is not a torch tensor, numpy array, or list. Returning torch tensor'
                 )
-                return torch.ones(shape, dtype=torch.long) * task_type

 class JinaTokenizer(RobertaTokenizer):
+    def __init__(self, *args, **kwargs):
+        """
+        JinaTokenizer extends the RobertaTokenizer class to include task_type_ids in
+        the batch encoding.
+        The task_type_ids are used to pass instruction information to the model.
+        A task_type should either be an integer or a sequence of integers with the same
+        length as the batch size.
+        """
         super().__init__(*args, **kwargs)
     def __call__(self, *args, task_type=None, **kwargs):
         """
         Tokenize via the parent class and optionally attach task_type_ids.

         When task_type is None the parent's batch encoding is returned as is.
         Otherwise a new BatchEncoding is built that holds a 'task_type_ids'
         entry followed by every entry of the parent's encoding.
         """
         encoded = super().__call__(*args, **kwargs)
         if task_type is None:
             return encoded

         task_type_ids = self._get_task_type_ids(encoded, task_type)
         # 'task_type_ids' goes first; entries of the parent encoding follow
         # and win on a key clash, exactly as dict unpacking orders them.
         merged = {'task_type_ids': task_type_ids, **encoded}
         return BatchEncoding(merged, tensor_type=kwargs.get('return_tensors'))
     def _batch_encode_plus(self, *args, task_type=None, **kwargs):
         return batch_encoding
     @staticmethod
+    def _get_task_type_ids(batch_encoding: BatchEncoding, task_type):
+        def apply_task_type(m, x):
+            x = torch.tensor(x)
+            assert (
+                len(x.shape) == 0 or x.shape[0] == m.shape[0]
+            ), 'The shape of task_type does not match the size of the batch.'
+            return m * x if len(x.shape) == 0 else m * x[:, None]
         if isinstance(batch_encoding['input_ids'], torch.Tensor):
             shape = batch_encoding['input_ids'].shape
+            return apply_task_type(torch.ones(shape, dtype=torch.long), task_type)
         else:
+            try:
+                shape = torch.tensor(batch_encoding['input_ids']).shape
+            except:
+                raise ValueError(
+                    "Unable to create tensor, you should probably "
+                    "activate truncation and/or padding with "
+                    "'padding=True' 'truncation=True' to have batched "
+                    "tensors with the same length."
+                )
             if isinstance(batch_encoding['input_ids'], list):
+                return (
+                    apply_task_type(torch.ones(shape, dtype=torch.long), task_type)
+                ).tolist()
             elif isinstance(batch_encoding['input_ids'], np.array):
+                return (
+                    apply_task_type(torch.ones(shape, dtype=torch.long), task_type)
+                ).numpy()
             else:
                 warnings.warn(
                     'input_ids is not a torch tensor, numpy array, or list. Returning torch tensor'
                 )
+                return apply_task_type(torch.ones(shape, dtype=torch.long), task_type)