nikhil_staging / src /signals /substring_search.py
nsthorat's picture
Push
e4f9cbe
raw
history blame
No virus
1.01 kB
"""A signal to search for a substring in a document."""
import re
from typing import Any, Iterable, Optional
from typing_extensions import override
from ..data.dataset_utils import lilac_span
from ..schema import Field, Item, RichData, SignalInputType, field
from .signal import Signal
class SubstringSignal(Signal):
"""Find a substring in a document."""
name = 'substring_search'
display_name = 'Substring Search'
input_type = SignalInputType.TEXT
compute_type = SignalInputType.TEXT
query: str
_regex: re.Pattern[str]
def __init__(self, **kwargs: Any):
super().__init__(**kwargs)
self._regex = re.compile(self.query, re.IGNORECASE)
@override
def fields(self) -> Field:
return field(fields=['string_span'])
@override
def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
for text in data:
if not isinstance(text, str):
yield None
continue
yield [lilac_span(m.start(), m.end()) for m in self._regex.finditer(text)]