import re
from typing import List, Union

from overrides import overrides

from relik.inference.data.objects import Word
from relik.inference.data.tokenizers.base_tokenizer import BaseTokenizer


class RegexTokenizer(BaseTokenizer):
    """
    A :obj:`Tokenizer` that splits the text based on a simple regex.
    """

    def __init__(self):
        super(RegexTokenizer, self).__init__()
        # regex for splitting on spaces, punctuation, and new lines
        # self._regex = re.compile(r"\S+|[\[\](),.!?;:\"]|\\n")
        self._regex = re.compile(
            r"\w+|\$[\d\.]+|\S+", re.UNICODE | re.MULTILINE | re.DOTALL
        )
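        # Illustration (added for clarity, not part of the original source): the
        # pattern groups runs of word characters, keeps dollar amounts such as
        # "$3.50" as a single token, and turns any other run of non-space
        # characters (e.g. trailing punctuation) into its own token:
        #   "Mary paid $3.50 for the car." ->
        #   ["Mary", "paid", "$3.50", "for", "the", "car", "."]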

    def __call__(
        self,
        texts: Union[str, List[str], List[List[str]]],
        is_split_into_words: bool = False,
        **kwargs,
    ) -> List[List[Word]]:
""" | |
Tokenize the input into single words by splitting using a simple regex. | |
Args: | |
texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`): | |
Text to tag. It can be a single string, a batch of string and pre-tokenized strings. | |
is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`): | |
If :obj:`True` and the input is a string, the input is split on spaces. | |
Returns: | |
:obj:`List[List[Word]]`: The input text tokenized in single words. | |
Example:: | |
>>> from relik.retriever.serve.tokenizers.regex_tokenizer import RegexTokenizer | |
>>> regex_tokenizer = RegexTokenizer() | |
>>> regex_tokenizer("Mary sold the car to John.") | |
""" | |
        # check if input is batched or a single sample
        is_batched = self.check_is_batched(texts, is_split_into_words)
        if is_batched:
            tokenized = self.tokenize_batch(texts)
        else:
            tokenized = self.tokenize(texts)
        return tokenized
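
    # Illustrative usage (added for clarity, not part of the original source).
    # Assuming `BaseTokenizer.check_is_batched` treats a list of plain strings as a
    # batch when `is_split_into_words` is False, the dispatch above behaves roughly
    # like:
    #   tokenizer = RegexTokenizer()
    #   tokenizer("Mary sold the car to John.")        # single sample -> one list of Word
    #   tokenizer(["First sentence.", "Second one."])  # batch -> one list of Word per input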

    def tokenize(self, text: Union[str, List[str]]) -> List[Word]:
        if not isinstance(text, (str, list)):
            raise ValueError(
                f"text must be either `str` or `list`, found: `{type(text)}`"
            )
        # pre-tokenized input is joined back into a single string before matching
        if isinstance(text, list):
            text = " ".join(text)
        # one Word per regex match, carrying the token index and character offsets
        return [
            Word(t[0], i, start_char=t[1], end_char=t[2])
            for i, t in enumerate(
                (m.group(0), m.start(), m.end()) for m in self._regex.finditer(text)
            )
        ]
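

# Minimal usage sketch (added for illustration, not part of the original module).
# It assumes the `relik` package is installed so the imports above resolve; the
# Word objects are printed via their default repr, since their exact attribute
# names are not shown in this file.
if __name__ == "__main__":
    tokenizer = RegexTokenizer()
    # single sentence: punctuation is split off as its own token
    for word in tokenizer.tokenize("Mary sold the car to John."):
        print(word)
    # pre-tokenized input is joined with spaces and re-tokenized the same way
    print(tokenizer.tokenize(["Mary", "sold", "the", "car"]))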