"""Helpers for normalising raw text and for pruning/adjusting span groups."""

import logging
import re
from collections import Counter

from spacy.tokens import SpanGroup

_LOG = logging.getLogger(__name__)

# Temporary stand-in for paragraph breaks while whitespace is being collapsed.
_PARAGRAPH_MARK = '&&&&&&&&#&#&#&#&'
_WHITESPACE_RUN = re.compile(r'\s+')


def preprocess(text: str) -> str:
    """Collapse whitespace in *text* while keeping paragraph breaks.

    Every ``--- Para SEP ---`` marker becomes a newline, each blank line
    (two consecutive newlines) is kept as a paragraph break, and every other
    run of whitespace, single newlines included, becomes one space.

    Args:
        text: The raw text to normalise.

    Returns:
        The normalised text, with paragraphs separated by two newlines.
    """
    text = text.replace("--- Para SEP ---", '\n')
    # Park paragraph breaks behind a placeholder so that the whitespace
    # collapse below does not swallow them.
    text = text.replace("\n\n", ' ' + _PARAGRAPH_MARK)
    text = text.replace('\n', ' ')
    text = _WHITESPACE_RUN.sub(" ", text)
    return text.replace(_PARAGRAPH_MARK, '\n\n')


def del_spans(span_sc, indexes: list) -> None:
    """Delete one entry of *span_sc* for each distinct position in *indexes*.

    Args:
        span_sc: Mutable span container supporting ``len()`` and ``del``;
            it is modified in place.
        indexes: Positions selected for removal. Repeated positions are
            handled once, and the list itself is not modified.
    """
    # Work from the highest position down so that a deletion never shifts the
    # positions that are still waiting to be processed. Each position must be
    # handled only once: a repeated entry would otherwise remove an
    # additional, unrelated span.
    for idx in sorted(set(indexes), reverse=True):
        # NOTE(review): the deletion targets ``idx + 1`` rather than ``idx``,
        # and a position whose shifted value falls outside the container is
        # skipped. This looks like a deliberate compensation for how the span
        # container resolves ``del`` positions -- confirm against the
        # installed spaCy version before changing it.
        if idx + 1 < len(span_sc):
            del span_sc[idx + 1]


def delete_overlapping_span(span_sc: SpanGroup) -> None:
    """Prune spans that share a start token or cover more than one sentence.

    Among spans that start at the same token, the first one encountered is
    kept unless a later span with the same label has a strictly higher score,
    in which case the later span takes its place. A later span with a
    different label is always dropped. Independently of that, every span that
    covers more than one sentence is dropped.

    Scores are read from ``span_sc.attrs['scores']`` and paired with the
    spans by position. This function does not modify that list.

    Args:
        span_sc: The span group to prune in place.
    """
    start_counts = Counter(spn.start for spn in span_sc)

    # start token -> (position, label, score) of the span currently kept.
    kept_by_start = {}
    doomed = set()

    for n, (spn, score) in enumerate(zip(span_sc, span_sc.attrs['scores'])):
        if start_counts[spn.start] > 1:
            kept = kept_by_start.get(spn.start)
            if kept is None:
                kept_by_start[spn.start] = (n, spn.label_, score)
            else:
                kept_idx, kept_label, kept_score = kept
                if spn.label_ == kept_label and score > kept_score:
                    _LOG.debug(
                        "span %d (%s) outscores span %d at start token %d",
                        n, spn.label_, kept_idx, spn.start,
                    )
                    doomed.add(kept_idx)
                    kept_by_start[spn.start] = (n, spn.label_, score)
                else:
                    doomed.add(n)

        # Drop spans that reach beyond a single sentence.
        if len(list(spn.sents)) > 1:
            doomed.add(n)

    del_spans(span_sc, sorted(doomed))


def cleanup_justify(doc, span_sc: SpanGroup) -> None:
    """Adjust the JUSTIFYING spans of *span_sc* to full dependency subtrees.

    JUSTIFYING spans are indexed by their root token. A span whose root is
    headed by the root of another JUSTIFYING span is removed. Every other
    JUSTIFYING span is replaced by a span running from its root's left edge
    to its root's right edge, unless an equal span is already present, in
    which case the original is left untouched.

    Args:
        doc: The document the spans belong to; the new spans are sliced
            from it.
        span_sc: The span group to adjust in place.
    """
    # Root token -> position of the JUSTIFYING span it belongs to. When two
    # spans share a root, only the later one is tracked.
    justify_roots = {
        span.root: idx
        for idx, span in enumerate(span_sc)
        if span.label_ == "JUSTIFYING"
    }

    stale = []
    expanded_spans = []
    for root, idx in justify_roots.items():
        head = root.head
        # A root that is its own head is not nested under another span and
        # must not be treated as dependent; otherwise the span would be
        # deleted without any replacement.
        # NOTE(review): spaCy documents a sentence root as its own head --
        # confirm against the Token.head documentation.
        if head.i != root.i and head in justify_roots:
            stale.append(idx)
            continue

        expanded = doc[root.left_edge.i:root.right_edge.i + 1]
        expanded.label_ = "JUSTIFYING"
        # Only swap the original out when the expansion adds something new.
        if expanded not in span_sc:
            expanded_spans.append(expanded)
            stale.append(idx)

    del_spans(span_sc, stale)
    span_sc.extend(SpanGroup(doc, spans=expanded_spans))