File size: 1,260 Bytes
e4f9cbe
 
55dc3dd
e4f9cbe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55dc3dd
 
e4f9cbe
 
 
 
55dc3dd
 
 
 
 
 
 
e4f9cbe
 
55dc3dd
e4f9cbe
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
"""Utilities for testing text splitters."""

from typing import Optional, Union

from ...data.dataset_utils import lilac_span
from ...schema import TEXT_SPAN_END_FEATURE, TEXT_SPAN_START_FEATURE, VALUE_KEY, Item


def spans_to_text(text: str, spans: Optional[list[Item]]) -> list[str]:
  """Return the substring of `text` covered by each span, in span order.

  Every span stores its offsets under `span[VALUE_KEY]`, keyed by
  `TEXT_SPAN_START_FEATURE` and `TEXT_SPAN_END_FEATURE`; the two values are used
  as the bounds of a regular Python slice of `text`.

  Args:
    text: The string the spans refer to.
    spans: The spans to resolve. `None` or an empty list produces `[]`.

  Returns:
    One sliced string per span.
  """
  substrings: list[str] = []
  # `spans or []` folds the "no spans" cases (None / empty) into an empty loop.
  for span in spans or []:
    offsets = span[VALUE_KEY]
    begin = offsets[TEXT_SPAN_START_FEATURE]
    stop = offsets[TEXT_SPAN_END_FEATURE]
    substrings.append(text[begin:stop])
  return substrings


def text_to_expected_spans(text: str, splits: Union[list[str], list[tuple[str,
                                                                          Item]]]) -> list[Item]:
  """Convert text and a list of splits to a list of expected spans.

  Each split is located in `text` by searching forward from the end of the
  previously matched split, so splits must be listed in the order in which they
  occur in `text` and must not overlap.

  Args:
    text: The full text that was split.
    splits: The expected pieces of `text`. Each entry is either the piece itself
      or a `(piece, metadata)` tuple; a bare string is given empty metadata.

  Returns:
    One span per split, built with `lilac_span(start=..., end=..., metadata=...)`.

  Raises:
    ValueError: If a split is neither a string nor a tuple, or if a split cannot
      be found in `text` at or after the end of the previous split.
  """
  search_from = 0
  expected_spans: list[Item] = []
  for split in splits:
    split_text: str
    metadata: Item
    if isinstance(split, str):
      split_text, metadata = split, {}
    elif isinstance(split, tuple):
      split_text, metadata = split
    else:
      raise ValueError('Split should be a string or a tuple of (string, item dict).')

    start = text.find(split_text, search_from)
    if start < 0:
      # Without this check a missing split would silently become a span starting
      # at -1 and corrupt the search offset for every split that follows.
      raise ValueError(
        f'Split {split_text!r} was not found in the text at or after offset {search_from}.')
    end = start + len(split_text)
    expected_spans.append(lilac_span(start=start, end=end, metadata=metadata))
    search_from = end

  return expected_spans