riccorl's picture
first commit
626eca0
raw
history blame
2.43 kB
import re
from typing import List, Union
from overrides import overrides
from relik.inference.data.objects import Word
from relik.inference.data.tokenizers.base_tokenizer import BaseTokenizer
class RegexTokenizer(BaseTokenizer):
    """
    A :obj:`Tokenizer` that extracts words from text with a single regular expression.

    Each token is either a run of word characters, a ``$`` followed by digits
    and dots (e.g. ``$3.50``), or any other run of non-whitespace characters.
    Whitespace is never part of a token.
    """

    def __init__(self):
        super().__init__()
        # Alternatives are tried left to right at every position:
        #   \w+        -> a run of word characters
        #   \$[\d\.]+  -> a dollar sign followed by digits and/or dots
        #   \S+        -> fallback: any other run of non-whitespace characters
        flags = re.UNICODE | re.MULTILINE | re.DOTALL
        self._regex = re.compile(r"\w+|\$[\d\.]+|\S+", flags)

    def __call__(
        self,
        texts: Union[str, List[str], List[List[str]]],
        is_split_into_words: bool = False,
        **kwargs,
    ) -> List[List[Word]]:
        """
        Tokenize the input into single words using the tokenizer regex.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to tokenize. It can be a single string, a batch of strings
                or pre-tokenized strings.
            is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
                Passed to ``check_is_batched`` to decide whether ``texts`` is a
                batch or a single sample.
            **kwargs:
                Extra keyword arguments; not used by this method.

        Returns:
            :obj:`List[List[Word]]`: The result of ``tokenize_batch`` when the
            input is batched. For a single sample, the flat list of :obj:`Word`
            produced by :meth:`tokenize` is returned instead.

        Example::

            >>> regex_tokenizer = RegexTokenizer()
            >>> regex_tokenizer("Mary sold the car to John.")

        """
        # Batched inputs go through the batch path, single samples are
        # tokenized directly.
        if self.check_is_batched(texts, is_split_into_words):
            return self.tokenize_batch(texts)
        return self.tokenize(texts)

    @overrides
    def tokenize(self, text: Union[str, List[str]]) -> List[Word]:
        """
        Tokenize a single sample.

        Args:
            text (:obj:`str`, :obj:`List[str]`):
                The sample to tokenize. A list is first joined with single
                spaces into one string, which is then tokenized by the regex.

        Returns:
            :obj:`List[Word]`: One :obj:`Word` per regex match, built from the
            matched text, its position among the matches and its start/end
            character offsets in the (possibly joined) string.

        Raises:
            ValueError: If ``text`` is neither a :obj:`str` nor a :obj:`list`.
        """
        if isinstance(text, list):
            # Offsets reported below are relative to this joined string.
            text = " ".join(text)
        elif not isinstance(text, str):
            raise ValueError(
                f"text must be either `str` or `list`, found: `{type(text)}`"
            )

        return [
            Word(match.group(0), index, start_char=match.start(), end_char=match.end())
            for index, match in enumerate(self._regex.finditer(text))
        ]