import torch
import numpy as np
from transformers import RobertaTokenizer, BatchEncoding, RobertaTokenizerFast
import warnings


def get_tokenizer(parent_class):
    class TokenizerClass(parent_class):
        """
        Dynamically extends a given tokenizer class from the HF Transformers
        library (RobertaTokenizer or RobertaTokenizerFast). The resulting
        tokenizer produces task_type_ids, which pass instruction information
        to the model. A task_type should be either a single integer or a
        sequence of integers with one entry per example in the batch.
        """

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)

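        # All three tokenization entry points below accept an extra task_type
        # keyword: the underlying tokenizer runs unchanged, and task_type_ids
        # are attached to the resulting BatchEncoding afterwards.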
        def __call__(self, *args, task_type=None, **kwargs):
            batch_encoding = super().__call__(*args, **kwargs)
            if task_type is not None:
                batch_encoding = self._add_task_type_ids(batch_encoding, task_type, kwargs.get('return_tensors'))
            return batch_encoding

        def _batch_encode_plus(self, *args, task_type=None, **kwargs):
            batch_encoding = super()._batch_encode_plus(*args, **kwargs)
            if task_type is not None:
                batch_encoding = self._add_task_type_ids(batch_encoding, task_type, kwargs.get('return_tensors'))
            return batch_encoding

        def _encode_plus(self, *args, task_type=None, **kwargs):
            batch_encoding = super()._encode_plus(*args, **kwargs)
            if task_type is not None:
                batch_encoding = self._add_task_type_ids(batch_encoding, task_type, kwargs.get('return_tensors'))
            return batch_encoding

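        # Prepend task_type_ids to the encoding and re-wrap it so that the
        # originally requested tensor type (e.g. 'pt' or 'np') is preserved.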
        @classmethod
        def _add_task_type_ids(cls, batch_encoding, task_type, tensor_type):
            return BatchEncoding(
                {
                    'task_type_ids': cls._get_task_type_ids(batch_encoding, task_type),
                    **batch_encoding,
                },
                tensor_type=tensor_type,
            )

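        # Build task_type_ids with the same shape and container type
        # (torch tensor, list, or numpy array) as input_ids.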
        @staticmethod
        def _get_task_type_ids(batch_encoding: BatchEncoding, task_type):

            def apply_task_type(m, x):
                x = torch.tensor(x)
                assert (
                        len(x.shape) == 0 or x.shape[0] == m.shape[0]
                ), 'The shape of task_type does not match the size of the batch.'
                return m * x if len(x.shape) == 0 else m * x[:, None]

            if isinstance(batch_encoding['input_ids'], torch.Tensor):
                shape = batch_encoding['input_ids'].shape
                return apply_task_type(torch.ones(shape, dtype=torch.long), task_type)
            else:
                try:
                    shape = torch.tensor(batch_encoding['input_ids']).shape
                except Exception as e:
                    raise ValueError(
                        "Unable to create tensor, you should probably "
                        "activate truncation and/or padding with "
                        "'padding=True' and 'truncation=True' to have batched "
                        "tensors with the same length."
                    ) from e
                if isinstance(batch_encoding['input_ids'], list):
                    return (
                        apply_task_type(torch.ones(shape, dtype=torch.long), task_type)
                    ).tolist()
                elif isinstance(batch_encoding['input_ids'], np.ndarray):
                    return (
                        apply_task_type(torch.ones(shape, dtype=torch.long), task_type)
                    ).numpy()
                else:
                    warnings.warn(
                        'input_ids is not a torch tensor, numpy array, or list; '
                        'returning a torch tensor.'
                    )
                    return apply_task_type(torch.ones(shape, dtype=torch.long), task_type)

    return TokenizerClass


JinaTokenizer = get_tokenizer(RobertaTokenizer)
JinaTokenizerFast = get_tokenizer(RobertaTokenizerFast)
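

if __name__ == '__main__':
    # Usage sketch: 'roberta-base' is only an illustrative checkpoint here;
    # any vocabulary compatible with RobertaTokenizerFast works, since
    # JinaTokenizerFast only adds task_type handling on top of it.
    tokenizer = JinaTokenizerFast.from_pretrained('roberta-base')
    batch = tokenizer(
        ['first sentence', 'second sentence'],
        task_type=1,  # a scalar task id is broadcast to every example
        padding=True,
        return_tensors='pt',
    )
    # task_type_ids has the same shape as input_ids and is filled with 1s;
    # a per-example sequence such as task_type=[1, 2] is accepted as well.
    print(batch['input_ids'].shape, batch['task_type_ids'].shape)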