import re
from typing import List, Union

from overrides import overrides

from relik.inference.data.objects import Word
from relik.inference.data.tokenizers.base_tokenizer import BaseTokenizer


class RegexTokenizer(BaseTokenizer):
"""
A :obj:`Tokenizer` that splits the text based on a simple regex.
"""

    def __init__(self):
        super(RegexTokenizer, self).__init__()
        # match runs of word characters, dollar amounts, or any other
        # non-whitespace run; the commented-out pattern below instead split
        # on spaces, punctuation and new lines
        # self._regex = re.compile(r"\S+|[\[\](),.!?;:\"]|\\n")
        self._regex = re.compile(
            r"\w+|\$[\d\.]+|\S+", re.UNICODE | re.MULTILINE | re.DOTALL
        )
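        # illustration (verified against the pattern above):
        #   "It costs $5.50!" -> ["It", "costs", "$5.50", "!"]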

    def __call__(
self,
texts: Union[str, List[str], List[List[str]]],
is_split_into_words: bool = False,
**kwargs,
) -> List[List[Word]]:
"""
Tokenize the input into single words by splitting using a simple regex.
Args:
texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
Text to tag. It can be a single string, a batch of string and pre-tokenized strings.
is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
If :obj:`True` and the input is a string, the input is split on spaces.
Returns:
:obj:`List[List[Word]]`: The input text tokenized in single words.
Example::
>>> from relik.retriever.serve.tokenizers.regex_tokenizer import RegexTokenizer
>>> regex_tokenizer = RegexTokenizer()
>>> regex_tokenizer("Mary sold the car to John.")
"""
# check if input is batched or a single sample
is_batched = self.check_is_batched(texts, is_split_into_words)
if is_batched:
tokenized = self.tokenize_batch(texts)
else:
tokenized = self.tokenize(texts)
return tokenized

    @overrides
def tokenize(self, text: Union[str, List[str]]) -> List[Word]:
if not isinstance(text, (str, list)):
raise ValueError(
f"text must be either `str` or `list`, found: `{type(text)}`"
)
        if isinstance(text, list):
            # pre-tokenized input: re-join so that character offsets are
            # computed over a single string
            text = " ".join(text)
        # build one Word per regex match, carrying the token index and its
        # character span in the input text
        return [
            Word(t[0], i, start_char=t[1], end_char=t[2])
            for i, t in enumerate(
                (m.group(0), m.start(), m.end()) for m in self._regex.finditer(text)
            )
        ]
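

# A minimal usage sketch (not part of the original file); it assumes that
# BaseTokenizer provides check_is_batched / tokenize_batch as used in
# __call__ above, and that Word has a readable repr.
if __name__ == "__main__":
    tokenizer = RegexTokenizer()
    # single string -> List[Word]
    print(tokenizer("Mary sold the car to John."))
    # pre-tokenized input is re-joined with spaces before matching
    print(tokenizer.tokenize(["Mary", "sold", "the", "car"]))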