from typing import List, Union

from relik.inference.data.objects import Word


class BaseTokenizer:
    """
    Common interface for tokenizers: a :obj:`Tokenizer` turns strings of text
    into single words and may optionally add pos tags and perform
    lemmatization.

    ``__call__`` and ``tokenize`` are left unimplemented here and raise
    :obj:`NotImplementedError`; concrete tokenizers are expected to override
    them.
    """

    def __call__(
        self,
        texts: Union[str, List[str], List[List[str]]],
        is_split_into_words: bool = False,
        **kwargs
    ) -> List[List[Word]]:
        """
        Tokenize the input into single words.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to tag. It can be a single string, a batch of strings,
                or pre-tokenized strings.
            is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
                If :obj:`True` and the input is a string, the input is split
                on spaces.

        Returns:
            :obj:`List[List[Word]]`: The input text tokenized in single words.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def tokenize(self, text: str) -> List[Word]:
        """
        Split a single text into tokens.

        Args:
            text (:obj:`str`):
                Text to tokenize.

        Returns:
            :obj:`List[Word]`: The input text tokenized in single words.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def tokenize_batch(self, texts: List[str]) -> List[List[Word]]:
        """
        Split every text of a batch into tokens by calling :meth:`tokenize`
        on each element, in order.

        Args:
            texts (:obj:`List[str]`):
                Batch of text to tokenize.

        Returns:
            :obj:`List[List[Word]]`: One list of words per input text, in the
            same order as ``texts``.
        """
        tokenized_batch = []
        for sample in texts:
            tokenized_batch.append(self.tokenize(sample))
        return tokenized_batch

    @staticmethod
    def check_is_batched(
        texts: Union[str, List[str], List[List[str]]], is_split_into_words: bool
    ):
        """
        Tell whether the input is a batch or a single sample.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to check.
            is_split_into_words (:obj:`bool`):
                Whether the input is to be treated as already split into
                words. When falsy, any list or tuple counts as a batch. When
                truthy, only a non-empty list or tuple whose first element is
                itself a list or tuple counts as a batch.

        Returns:
            :obj:`bool`: ``True`` if ``texts`` is batched, ``False`` otherwise.
        """
        sequence_types = (list, tuple)

        # Anything that is not a list/tuple (e.g. a plain string) is a single sample.
        if not isinstance(texts, sequence_types):
            return False

        # Without pre-tokenization, a list/tuple is a batch of texts.
        if not is_split_into_words:
            return True

        # With pre-tokenization, a flat list is a single sample made of words;
        # only a non-empty list of lists/tuples is a batch. Only the first
        # element is inspected.
        return bool(texts) and isinstance(texts[0], sequence_types)