import warnings

import numpy as np
import torch
from transformers import BatchEncoding, RobertaTokenizer, RobertaTokenizerFast


def get_tokenizer(parent_class):
    class TokenizerClass(parent_class):
        """
        Dynamically extends a given tokenizer class from the HF Transformers
        library (RobertaTokenizer or RobertaTokenizerFast).

        The task_type_ids are used to pass instruction information to the model.
        A task_type should either be an integer or a sequence of integers with
        the same length as the batch size.
        """

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)

        def __call__(self, *args, task_type=None, **kwargs):
            batch_encoding = super().__call__(*args, **kwargs)
            if task_type is not None:
                batch_encoding = self._add_task_type_ids(
                    batch_encoding, task_type, kwargs.get('return_tensors')
                )
            return batch_encoding

        def _batch_encode_plus(self, *args, task_type=None, **kwargs):
            batch_encoding = super()._batch_encode_plus(*args, **kwargs)
            if task_type is not None:
                batch_encoding = self._add_task_type_ids(
                    batch_encoding, task_type, kwargs.get('return_tensors')
                )
            return batch_encoding

        def _encode_plus(self, *args, task_type=None, **kwargs):
            batch_encoding = super()._encode_plus(*args, **kwargs)
            if task_type is not None:
                batch_encoding = self._add_task_type_ids(
                    batch_encoding, task_type, kwargs.get('return_tensors')
                )
            return batch_encoding

        @classmethod
        def _add_task_type_ids(cls, batch_encoding, task_type, tensor_type):
            # Rebuild the BatchEncoding with an extra 'task_type_ids' entry that
            # mirrors the shape of 'input_ids'.
            return BatchEncoding(
                {
                    'task_type_ids': cls._get_task_type_ids(batch_encoding, task_type),
                    **batch_encoding,
                },
                tensor_type=tensor_type,
            )

        @staticmethod
        def _get_task_type_ids(batch_encoding: BatchEncoding, task_type):
            def apply_task_type(m, x):
                # x is either a scalar (one task type for the whole batch) or a
                # 1-D tensor with one task type per example in the batch.
                x = torch.tensor(x)
                assert (
                    len(x.shape) == 0 or x.shape[0] == m.shape[0]
                ), 'The shape of task_type does not match the size of the batch.'
                return m * x if len(x.shape) == 0 else m * x[:, None]

            if isinstance(batch_encoding['input_ids'], torch.Tensor):
                shape = batch_encoding['input_ids'].shape
                return apply_task_type(torch.ones(shape, dtype=torch.long), task_type)
            else:
                try:
                    shape = torch.tensor(batch_encoding['input_ids']).shape
                except Exception as e:
                    raise ValueError(
                        "Unable to create tensor, you should probably "
                        "activate truncation and/or padding with "
                        "'padding=True' 'truncation=True' to have batched "
                        "tensors with the same length."
                    ) from e
                if isinstance(batch_encoding['input_ids'], list):
                    return (
                        apply_task_type(torch.ones(shape, dtype=torch.long), task_type)
                    ).tolist()
                elif isinstance(batch_encoding['input_ids'], np.ndarray):
                    return (
                        apply_task_type(torch.ones(shape, dtype=torch.long), task_type)
                    ).numpy()
                else:
                    warnings.warn(
                        'input_ids is not a torch tensor, numpy array, or list. '
                        'Returning a torch tensor.'
                    )
                    return apply_task_type(
                        torch.ones(shape, dtype=torch.long), task_type
                    )

    return TokenizerClass


JinaTokenizer = get_tokenizer(RobertaTokenizer)
JinaTokenizerFast = get_tokenizer(RobertaTokenizerFast)
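

# Minimal usage sketch of the task_type mechanism. The checkpoint name
# 'roberta-base' is only an illustrative assumption (any RoBERTa-compatible
# tokenizer files would work); it is not prescribed by this module.
if __name__ == '__main__':
    tokenizer = JinaTokenizerFast.from_pretrained('roberta-base')

    # One task type for the whole batch: every position in 'task_type_ids'
    # is set to 1, matching the shape of 'input_ids'.
    batch = tokenizer(
        ['first sentence', 'second sentence'],
        padding=True,
        truncation=True,
        return_tensors='pt',
        task_type=1,
    )
    print(batch['task_type_ids'].shape, batch['input_ids'].shape)

    # Per-example task types: a sequence with one entry per batch element.
    batch = tokenizer(
        ['first sentence', 'second sentence'],
        padding=True,
        truncation=True,
        return_tensors='pt',
        task_type=[0, 1],
    )
    print(batch['task_type_ids'])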