nianlonggu
init
02ae0bf
from typing import List, Tuple
def replace_refspans(
    spans_to_replace: List[Tuple[int, int, str, str]],
    full_string: str,
    pre_padding: str = "",
    post_padding: str = "",
    btwn_padding: str = ", "
) -> str:
    """
    For each span within the full string, replace that span with new text.

    Spans are processed in order of their start index. A span that starts
    before the end of the previously replaced span (i.e. overlaps it) is
    dropped and its text is left as part of the earlier replacement. A span
    that starts exactly where the previous replaced span ended is prefixed
    with ``btwn_padding``. Every replacement is wrapped in ``pre_padding`` and
    ``post_padding``. Spans whose end index is <= 0 are treated as empty
    placeholders and ignored.

    The input list is not modified.

    :param spans_to_replace: list of tuples of form (start_ind, end_ind, span_text, new_substring)
    :param full_string: text the span indices refer to
    :param pre_padding: text inserted immediately before each replacement
    :param post_padding: text inserted immediately after each replacement
    :param btwn_padding: separator inserted between two directly adjacent replacements
    :return: ``full_string`` with every kept span replaced
    :raises AssertionError: if a span's text does not match ``full_string[start:end]``
        or two spans share the same start index
    """
    # assert all spans are equal to full_text span
    assert all(full_string[start:end] == span for start, end, span, _ in spans_to_replace)

    # assert none of the spans start with the same start ind
    start_inds = [rep[0] for rep in spans_to_replace]
    assert len(set(start_inds)) == len(start_inds)

    # work on a sorted copy so the caller's list keeps its order and contents
    ordered = sorted(spans_to_replace, key=lambda rep: rep[0])

    pieces = []        # alternating untouched text and replacement text
    cursor = 0         # index in full_string up to which output was emitted
    prev_end = None    # end index of the last span that was actually replaced

    for start, end, _span, new_string in ordered:
        # skip empties
        if end <= 0:
            continue

        # overlap with the previously replaced span: drop this span
        if prev_end is not None and start < prev_end:
            continue

        replacement = pre_padding + new_string + post_padding
        # span abuts the previous replaced span: separate the two replacements
        if prev_end is not None and start == prev_end:
            replacement = btwn_padding + replacement

        # all indices stay in original coordinates, so no shifting is needed
        pieces.append(full_string[cursor:start])
        pieces.append(replacement)
        cursor = end
        prev_end = end

    # remaining text after the last replaced span
    pieces.append(full_string[cursor:])
    return "".join(pieces)
def sub_spans_and_update_indices(
    spans_to_replace: List[Tuple[int, int, str, str]],
    full_string: str
) -> Tuple[str, List]:
    """
    Replace all spans and recompute indices.

    Each span's text is replaced by its surface form (adjacent replacements
    are joined without a separator), and the span indices are recomputed so
    that they point at the surface form inside the new text. The input list
    is not modified.

    NOTE(review): the indices are computed as if every span is replaced.
    ``replace_refspans`` skips overlapping spans and spans with end <= 0, so
    for such input the returned indices may not line up with the returned
    text -- overlap checking is still a TODO here.

    :param spans_to_replace: list of tuples of form (start_ind, end_ind, token, surface)
    :param full_string: text the span indices refer to
    :return: tuple of (new text, list of (new_start, new_end, token, surface)
        tuples sorted by start index)
    :raises AssertionError: if a span's token does not match ``full_string[start:end]``
        or two spans share the same start index
    """
    # TODO: check no spans overlapping
    # TODO: check all spans well-formed

    # assert all spans are equal to full_text span
    assert all(full_string[start:end] == token for start, end, token, _ in spans_to_replace)

    # assert none of the spans start with the same start ind
    start_inds = [rep[0] for rep in spans_to_replace]
    assert len(set(start_inds)) == len(start_inds)

    # sort a copy by start index so the caller's list is left untouched
    ordered = sorted(spans_to_replace, key=lambda rep: rep[0])

    # compute the new indices with a running total of the length changes
    # introduced by all earlier replacements
    new_spans = []
    shift = 0
    for start, end, token, surface in ordered:
        new_start = start + shift
        new_spans.append((new_start, new_start + len(surface), token, surface))
        shift += len(surface) - (end - start)

    # generate new text; new_spans is already final, so it does not matter
    # whether the callee modifies the list it is given
    new_text = replace_refspans(ordered, full_string, btwn_padding="")

    return new_text, new_spans