jinaai
/

jina-bert-flash-implementation

Transformers

bert

custom_code

🇪🇺 Region: EU

Model card | Files and versions | Community

jupyterjazz committed on Mar 7, 2024

Commit

72221a7

1 Parent(s): 9b5c148

refactor: batch encoding

Browse files

Signed-off-by: jupyterjazz <saba.sturua@jina.ai>

Files changed (1) hide show

tokenizer.py +15 -23

tokenizer.py CHANGED Viewed

@@ -8,8 +8,8 @@ def get_tokenizer(parent_class):
     class TokenizerClass(parent_class):
         def __init__(self, *args, **kwargs):
             """
-            JinaTokenizer extends the RobertaTokenizer class to include task_type_ids in
-            the batch encoding.
             The task_type_ids are used to pass instruction information to the model.
             A task_type should either be an integer or a sequence of integers with the same
             length as the batch size.
@@ -19,39 +19,31 @@ def get_tokenizer(parent_class):
         def __call__(self, *args, task_type=None, **kwargs):
             batch_encoding = super().__call__(*args, **kwargs)
             if task_type is not None:
-                batch_encoding = BatchEncoding(
-                    {
-                        'task_type_ids': self._get_task_type_ids(batch_encoding, task_type),
-                        **batch_encoding,
-                    },
-                    tensor_type=kwargs.get('return_tensors'),
-                )
             return batch_encoding
         def _batch_encode_plus(self, *args, task_type=None, **kwargs):
             batch_encoding = super()._batch_encode_plus(*args, **kwargs)
             if task_type is not None:
-                batch_encoding = BatchEncoding(
-                    {
-                        'task_type_ids': self._get_task_type_ids(batch_encoding, task_type),
-                        **batch_encoding,
-                    },
-                    tensor_type=kwargs.get('return_tensors'),
-                )
             return batch_encoding
         def _encode_plus(self, *args, task_type=None, **kwargs):
             batch_encoding = super()._encode_plus(*args, **kwargs)
             if task_type is not None:
-                batch_encoding = BatchEncoding(
-                    {
-                        'task_type_ids': self._get_task_type_ids(batch_encoding, task_type),
-                        **batch_encoding,
-                    },
-                    tensor_type=kwargs.get('return_tensors'),
-                )
             return batch_encoding
         @staticmethod
         def _get_task_type_ids(batch_encoding: BatchEncoding, task_type):

     class TokenizerClass(parent_class):
         def __init__(self, *args, **kwargs):
             """
+            This class dynamically extends a given tokenizer class from the HF
+            Transformers library (RobertaTokenizer or RobertaTokenizerFast).
             The task_type_ids are used to pass instruction information to the model.
             A task_type should either be an integer or a sequence of integers with the same
             length as the batch size.
         def __call__(self, *args, task_type=None, **kwargs):
             batch_encoding = super().__call__(*args, **kwargs)
             if task_type is not None:
+                batch_encoding = self._add_task_type_ids(batch_encoding, task_type, kwargs.get('return_tensors'))
             return batch_encoding
         def _batch_encode_plus(self, *args, task_type=None, **kwargs):
             batch_encoding = super()._batch_encode_plus(*args, **kwargs)
             if task_type is not None:
+                batch_encoding = self._add_task_type_ids(batch_encoding, task_type, kwargs.get('return_tensors'))
             return batch_encoding
         def _encode_plus(self, *args, task_type=None, **kwargs):
             batch_encoding = super()._encode_plus(*args, **kwargs)
             if task_type is not None:
+                batch_encoding = self._add_task_type_ids(batch_encoding, task_type, kwargs.get('return_tensors'))
             return batch_encoding
+        @classmethod
+        def _add_task_type_ids(cls, batch_encoding, task_type, tensor_type):
+            return BatchEncoding(
+                {
+                    'task_type_ids': cls._get_task_type_ids(batch_encoding, task_type),
+                    **batch_encoding,
+                },
+                tensor_type=tensor_type,
+            )
         @staticmethod
         def _get_task_type_ids(batch_encoding: BatchEncoding, task_type):