File size: 4,120 Bytes
02ae0bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from typing import List, Tuple


def replace_refspans(
    spans_to_replace: List[Tuple[int, int, str, str]],
    full_string: str,
    pre_padding: str = "",
    post_padding: str = "",
    btwn_padding: str = ", "
) -> str:
    """
    For each span within the full string, replace that span with new text.

    Spans are handled in order of start index. A span that starts before the
    end of the previously replaced span (an overlap) is dropped and never
    replaced. Entries whose end index is <= 0 are treated as empty and skipped.

    :param spans_to_replace: list of tuples of form (start_ind, end_ind, span_text, new_substring);
        the list is only read, never reordered or modified
    :param full_string: text that the span indices refer to
    :param pre_padding: text placed immediately before every replacement
    :param post_padding: text placed immediately after every replacement
    :param btwn_padding: text placed in front of a replacement whose span starts exactly
        where the previously replaced span ended
    :return: full_string with every kept span replaced
    :raises AssertionError: if a span's text differs from full_string[start:end],
        or if two spans share the same start index
    """
    # validate with explicit raises so the checks still run under `python -O`;
    # AssertionError is kept because that is what these checks have always raised
    for start, end, span, _ in spans_to_replace:
        if full_string[start:end] != span:
            raise AssertionError(
                "span text {!r} does not match full_string[{}:{}]".format(span, start, end)
            )

    start_inds = [rep[0] for rep in spans_to_replace]
    if len(set(start_inds)) != len(start_inds):
        raise AssertionError("multiple spans share the same start index")

    # sort a copy by start index so the caller's list is left untouched
    ordered = sorted(spans_to_replace, key=lambda x: x[0])

    # build the output left to right: untouched text up to each kept span, then
    # its replacement; working in original coordinates avoids any index shifting
    pieces = []
    cursor = 0          # position in full_string up to which output has been emitted
    prev_end = None     # end index of the last span that was actually replaced
    for start, end, _, new_string in ordered:
        # skip empties
        if end <= 0:
            continue

        # if overlap with the previously replaced span, remove from replacement
        if prev_end is not None and start < prev_end:
            continue

        # every replacement is wrapped in the padding exactly once
        replacement = pre_padding + new_string + post_padding

        # if ref span abuts previous reference span, separate the two
        if prev_end is not None and start == prev_end:
            replacement = btwn_padding + replacement

        pieces.append(full_string[cursor:start])
        pieces.append(replacement)
        cursor = end
        prev_end = end

    # text after the last replaced span
    pieces.append(full_string[cursor:])

    return "".join(pieces)


def sub_spans_and_update_indices(
    spans_to_replace: List[Tuple[int, int, str, str]],
    full_string: str
) -> Tuple[str, List]:
    """
    Replace all spans and recompute indices.

    Each span's text is replaced by its new surface form, and the span indices
    are recomputed so they point at the surface form inside the new text.

    :param spans_to_replace: list of tuples of form (start_ind, end_ind, token, surface);
        the list is only read, never reordered or modified
    :param full_string: text that the span indices refer to
    :return: tuple of (new text, list of (new_start, new_end, token, surface) sorted by start)
    :raises AssertionError: if a span's token differs from full_string[start:end],
        or if two spans share the same start index
    """
    # TODO: check no spans overlapping
    # TODO: check all spans well-formed
    # NOTE: the recomputed indices assume every span is actually substituted;
    # replace_refspans drops overlapping spans and entries with end <= 0, so for
    # such input the returned indices will not line up with the new text.

    # validate with explicit raises so the checks still run under `python -O`;
    # AssertionError is kept because that is what these checks have always raised
    for start, end, token, _ in spans_to_replace:
        if full_string[start:end] != token:
            raise AssertionError(
                "span text {!r} does not match full_string[{}:{}]".format(token, start, end)
            )

    start_inds = [rep[0] for rep in spans_to_replace]
    if len(set(start_inds)) != len(start_inds):
        raise AssertionError("multiple spans share the same start index")

    # sort a copy by start index so the caller's list is left untouched
    ordered = sorted(spans_to_replace, key=lambda x: x[0])

    # a span moves by the net length change of all spans before it, and its new
    # extent is exactly the length of its surface form
    new_spans = []
    shift = 0
    for start, end, token, surface in ordered:
        new_start = start + shift
        new_spans.append((new_start, new_start + len(surface), token, surface))
        shift += len(surface) - (end - start)

    # generate new text; the private copy is passed so that anything
    # replace_refspans does to its argument cannot reach the caller's list
    new_text = replace_refspans(ordered, full_string, btwn_padding="")

    return new_text, new_spans