|
import re |
|
from collections import Counter |
|
from spacy.tokens import SpanGroup |
|
|
|
|
|
def preprocess(text):
    """Collapse whitespace in *text* while keeping paragraph breaks.

    The substitutions run in a fixed order:

    1. every ``--- Para SEP ---`` marker becomes a single newline;
    2. each pair of consecutive newlines is hidden behind a placeholder
       token (inserted with a leading space);
    3. remaining single newlines become spaces;
    4. every run of whitespace collapses to one space;
    5. the placeholder is turned back into a blank-line break (``\\n\\n``).

    Because the placeholder is inserted with a leading space, a restored
    paragraph break is preceded by a single space in the result.

    Returns the normalised string.
    """
    # Order matters: the placeholder must be in place before newlines and
    # whitespace are flattened, and restored only afterwards.
    substitutions = (
        ("--- Para SEP ---", '\n'),
        ("\n\n", ' &&&&&&&&#&#&#&#&'),
        ('\n', ' '),
        (r'\s+', " "),
        ('&&&&&&&&#&#&#&#&', '\n\n'),
    )

    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text
|
|
|
|
|
def del_spans(span_sc, indexes: list):
    """Remove entries from *span_sc* in place, driven by *indexes*.

    *indexes* is sorted in place into descending order, so the caller's
    list is reordered as a side effect. For each index ``i`` the element
    at position ``i + 1`` is deleted, provided that position exists;
    otherwise the index is silently skipped.

    NOTE(review): the element removed is the one at ``i + 1``, not ``i``.
    Presumably this offset is intentional for the container type passed
    in; confirm against that container's ``__delitem__`` behaviour.

    Returns ``None``.
    """
    # Work from the highest index down so that a removal never shifts the
    # position of an entry that is still waiting to be processed.
    indexes.sort(reverse=True)

    for position in indexes:
        target = position + 1
        if target >= len(span_sc):
            # Out of range: nothing to delete for this index.
            continue
        del span_sc[target]
|
|
|
|
|
def delete_overlapping_span(span_sc: dict):
    """Flag and remove redundant spans from *span_sc* in place.

    Two kinds of spans are flagged for removal:

    * Spans sharing a start token with an earlier span. The first span
      seen for a start token is kept provisionally. A later span with the
      same start replaces it only when it has the same label AND a
      strictly higher score; in that case the earlier span is flagged.
      In every other case the later span is flagged.
    * Spans that cover more than one sentence.

    The flagged indices are handed to ``del_spans`` for the actual removal.

    Parameters:
        span_sc: iterable of spans exposing ``.start``, ``.label_`` and
            ``.sents``, with ``span_sc.attrs['scores']`` giving one score
            per span in the same order. The annotation says ``dict``, but
            the usage is presumably a spaCy ``SpanGroup`` - TODO confirm.

    Returns ``None``.

    NOTE(review): this function never updates ``span_sc.attrs['scores']``;
    confirm that scores stay aligned with the spans after deletion.
    """
    # Start tokens used by more than one span are the only ones to compare.
    start_counts = Counter(spn.start for spn in span_sc)
    overlap = {start for start, count in start_counts.items() if count > 1}

    # A set, because one span can be flagged twice (loser of a same-start
    # comparison AND multi-sentence). Passing a duplicated index on would
    # make the deletion step remove an additional, unrelated span.
    id_del = set()
    id_comp = {}  # start token -> index of the span currently kept
    info = {}     # span index -> score/label needed for later comparisons

    for n, (spn, score) in enumerate(zip(span_sc, span_sc.attrs['scores'])):
        # Computed once per span; used for the multi-sentence check below.
        sentence_count = len(list(spn.sents))
        info[n] = {'score': score, 'label': spn.label_}

        if spn.start in overlap:
            if spn.start not in id_comp:
                id_comp[spn.start] = n
            else:
                kept = info[id_comp[spn.start]]
                same_lbl = spn.label_ == kept['label']
                update = score > kept['score']
                if update and same_lbl:
                    # Debug trace retained from the original implementation.
                    print(spn.label_, kept['label'])
                    print(same_lbl)
                    id_del.add(id_comp[spn.start])
                    id_comp[spn.start] = n
                else:
                    id_del.add(n)

        # Spans that run across a sentence boundary are always dropped.
        if sentence_count > 1:
            id_del.add(n)

    del_spans(span_sc, sorted(id_del))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def cleanup_justify(doc, span_sc: dict):
    """Rework the ``JUSTIFYING`` spans of *span_sc* in place.

    Steps:

    1. Index every span labelled ``JUSTIFYING`` by its root token,
       remembering its position in *span_sc*. If two such spans share a
       root, the later one wins.
    2. A span is "dependent" when the head of its root token is itself the
       root of an indexed ``JUSTIFYING`` span. Dependent spans are flagged
       for deletion.
    3. For each non-dependent span, build a replacement that runs from the
       root's ``left_edge`` to its ``right_edge`` (inclusive) and label it
       ``JUSTIFYING``. If no equal span is already in *span_sc*, queue the
       replacement and flag the original for deletion; otherwise leave the
       original untouched.
    4. Hand the flagged positions to ``del_spans``, then append the queued
       replacements to *span_sc* via a ``SpanGroup``.

    Parameters:
        doc: document that the spans belong to; sliced to build the
            replacement spans.
        span_sc: span container, iterated and extended in place. The
            annotation says ``dict``, but the usage is presumably a spaCy
            ``SpanGroup`` - TODO confirm.

    Returns ``None``.

    NOTE(review): if a root token can be its own head (spaCy documents
    this for sentence roots - confirm), such a span counts as dependent on
    itself and is deleted without a replacement. Verify this is intended.
    """
    # Step 1: root token -> position of its JUSTIFYING span in span_sc.
    position_by_root = {}
    for position, candidate in enumerate(span_sc):
        if candidate.label_ == 'JUSTIFYING':
            position_by_root[candidate.root] = position

    # Step 2: roots whose head is the root of another indexed span.
    dependent_roots = {
        root for root in position_by_root if root.head in position_by_root
    }

    # Step 3: decide which originals go and which replacements come in.
    to_delete = []
    new_spans = []
    for root, position in position_by_root.items():
        if root in dependent_roots:
            to_delete.append(position)
            continue

        widened = doc[root.left_edge.i:root.right_edge.i + 1]
        # Label first: the membership test below compares labelled spans.
        widened.label_ = "JUSTIFYING"

        if widened not in span_sc:
            new_spans.append(widened)
            to_delete.append(position)

    # Step 4: apply the removals, then add the replacements.
    del_spans(span_sc, to_delete)
    span_sc.extend(SpanGroup(doc, spans=new_spans))
|
|
|
|
|
|