File size: 900 Bytes
e4f9cbe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
"""Utilities for testing text splitters."""

from typing import Optional

from ...data.dataset_utils import lilac_span
from ...schema import TEXT_SPAN_END_FEATURE, TEXT_SPAN_START_FEATURE, VALUE_KEY, Item


def spans_to_text(text: str, spans: Optional[list[Item]]) -> list[str]:
  """Convert text and a list of spans to a list of strings."""
  if not spans:
    return []
  return [
    text[span[VALUE_KEY][TEXT_SPAN_START_FEATURE]:span[VALUE_KEY][TEXT_SPAN_END_FEATURE]]
    for span in spans
  ]


def text_to_expected_spans(text: str, splits: list[str]) -> list[Item]:
  """Convert text and a list of splits to a list of expected spans."""
  start_offset = 0
  expected_spans: list[Item] = []
  for split in splits:
    start = text.find(split, start_offset)
    end = start + len(split)
    expected_spans.append(lilac_span(start=start, end=end))
    start_offset = end

  return expected_spans