|
import re |
|
from typing import List, Union |
|
|
|
from overrides import overrides |
|
|
|
from relik.inference.data.objects import Word |
|
from relik.inference.data.tokenizers.base_tokenizer import BaseTokenizer |
|
|
|
|
|
class RegexTokenizer(BaseTokenizer): |
|
""" |
|
A :obj:`Tokenizer` that splits the text based on a simple regex. |
|
""" |
|
|
|
def __init__(self): |
|
super(RegexTokenizer, self).__init__() |
|
|
|
|
|
self._regex = re.compile( |
|
r"\w+|\$[\d\.]+|\S+", re.UNICODE | re.MULTILINE | re.DOTALL |
|
) |
|
|
|
def __call__( |
|
self, |
|
texts: Union[str, List[str], List[List[str]]], |
|
is_split_into_words: bool = False, |
|
**kwargs, |
|
) -> List[List[Word]]: |
|
""" |
|
Tokenize the input into single words by splitting using a simple regex. |
|
|
|
Args: |
|
texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`): |
|
Text to tag. It can be a single string, a batch of string and pre-tokenized strings. |
|
is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`): |
|
If :obj:`True` and the input is a string, the input is split on spaces. |
|
|
|
Returns: |
|
:obj:`List[List[Word]]`: The input text tokenized in single words. |
|
|
|
Example:: |
|
|
|
>>> from relik.retriever.serve.tokenizers.regex_tokenizer import RegexTokenizer |
|
|
|
>>> regex_tokenizer = RegexTokenizer() |
|
>>> regex_tokenizer("Mary sold the car to John.") |
|
|
|
""" |
|
|
|
is_batched = self.check_is_batched(texts, is_split_into_words) |
|
|
|
if is_batched: |
|
tokenized = self.tokenize_batch(texts) |
|
else: |
|
tokenized = self.tokenize(texts) |
|
|
|
return tokenized |
|
|
|
@overrides |
|
def tokenize(self, text: Union[str, List[str]]) -> List[Word]: |
|
if not isinstance(text, (str, list)): |
|
raise ValueError( |
|
f"text must be either `str` or `list`, found: `{type(text)}`" |
|
) |
|
|
|
if isinstance(text, list): |
|
text = " ".join(text) |
|
return [ |
|
Word(t[0], i, start_char=t[1], end_char=t[2]) |
|
for i, t in enumerate( |
|
(m.group(0), m.start(), m.end()) for m in self._regex.finditer(text) |
|
) |
|
] |
|
|