nikhil_staging / src /signals /splitters /text_splitter_test_utils.py
nsthorat's picture
Push
e4f9cbe
raw
history blame
900 Bytes
"""Utilities for testing text splitters."""
from typing import Optional
from ...data.dataset_utils import lilac_span
from ...schema import TEXT_SPAN_END_FEATURE, TEXT_SPAN_START_FEATURE, VALUE_KEY, Item
def spans_to_text(text: str, spans: Optional[list[Item]]) -> list[str]:
    """Return the substring of `text` covered by each span, in span order.

    For every span, the slice bounds are read from
    `span[VALUE_KEY][TEXT_SPAN_START_FEATURE]` and
    `span[VALUE_KEY][TEXT_SPAN_END_FEATURE]`. A missing (`None`) or empty
    `spans` argument produces an empty list.
    """
    pieces: list[str] = []
    # `spans or ()` folds the None / empty cases into a zero-iteration loop.
    for item in spans or ():
        bounds = item[VALUE_KEY]
        begin = bounds[TEXT_SPAN_START_FEATURE]
        stop = bounds[TEXT_SPAN_END_FEATURE]
        pieces.append(text[begin:stop])
    return pieces
def text_to_expected_spans(text: str, splits: list[str]) -> list[Item]:
    """Convert text and a list of splits to a list of expected spans.

    Each split is located in `text` with a forward search that resumes at the
    end of the previously matched split, so a substring that occurs several
    times is matched to successive occurrences as long as `splits` is given
    in document order.

    Args:
      text: The full text that was split.
      splits: The expected split strings, in the order they appear in `text`.

    Returns:
      One item per split, built with `lilac_span(start=..., end=...)` from the
      offsets at which the split was found.

    Raises:
      ValueError: If a split does not occur in `text` at or after the end of
        the previous split.
    """
    start_offset = 0
    expected_spans: list[Item] = []
    for split in splits:
        start = text.find(split, start_offset)
        if start < 0:
            # `str.find` signals "not found" with -1; using it as an offset
            # would silently yield a bogus span and derail all later lookups.
            raise ValueError(
                f'Split {split!r} was not found in the text at or after offset {start_offset}.')
        end = start + len(split)
        expected_spans.append(lilac_span(start=start, end=end))
        # Resume after this split so later splits never match earlier text.
        start_offset = end
    return expected_spans