Spaces:

egumasa
/

engagement-analyzer-demo

Running

File size: 4,039 Bytes

import re
from collections import Counter
from spacy.tokens import SpanGroup


def preprocess(text):
    """Flatten line breaks and runs of whitespace, keeping paragraph breaks.

    The substitutions run in a fixed order:

    1. every ``--- Para SEP ---`` marker becomes a single newline;
    2. each pair of consecutive newlines is swapped for a space followed by
       a placeholder token so it survives the next two steps;
    3. each remaining newline becomes a space;
    4. every run of whitespace collapses to one space;
    5. the placeholder is turned back into two newlines.

    Because step 2 inserts a space in front of the placeholder, a restored
    paragraph break is preceded by a single space.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    placeholder = '&&&&&&&&#&#&#&#&'

    # (pattern, replacement) pairs; order matters, see the docstring.
    substitutions = (
        ("--- Para SEP ---", '\n'),
        ("\n\n", ' ' + placeholder),
        ('\n', ' '),
        (r'\s+', " "),
        (placeholder, '\n\n'),
    )

    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text


def del_spans(span_sc, indexes: list):
    """Delete entries from ``span_sc`` in place for the given indexes.

    Each distinct value in ``indexes`` is handled once, from the highest to
    the lowest, so that a deletion never shifts a position that is still
    waiting to be processed. For a value ``idx`` the element at position
    ``idx + 1`` is removed, and only when that position exists in
    ``span_sc`` at that moment.

    Args:
        span_sc: Mutable sequence supporting ``len()`` and ``del seq[i]``.
        indexes: Integer indexes to process. The list itself is left
            untouched; repeated values are processed only once.

    Returns:
        None. ``span_sc`` is modified in place.
    """
    # set(): an index listed twice would otherwise delete a second element
    # that was never requested. sorted(): avoids reordering the caller's list.
    for idx in sorted(set(indexes), reverse=True):
        # NOTE(review): the position deleted is idx + 1, not idx. Presumably
        # this compensates for how the span container in use resolves `del`
        # positions -- confirm against the pinned spaCy version before
        # changing the offset.
        if idx + 1 < len(span_sc):
            del span_sc[idx + 1]


def delete_overlapping_span(span_sc: "SpanGroup"):
    """Prune spans that share a start token or span several sentences.

    Spans are walked in order together with ``span_sc.attrs['scores']``
    (iteration stops at the shorter of the two). Two rules mark a span for
    deletion:

    * Shared start token: among spans whose ``start`` is shared with at
      least one other span, one candidate is kept per start. The first span
      seen is the initial candidate. A later span replaces it only when it
      has the same label and a strictly higher score; the replaced
      candidate is then marked. In every other case (different label, or
      score not higher) the later span is marked instead.
    * Sentence boundary: any span whose ``sents`` yields more than one
      sentence is marked.

    The marked indexes are passed to ``del_spans``, which modifies
    ``span_sc`` in place.

    NOTE(review): ``attrs['scores']`` is only read here and is not trimmed,
    so after deletions it may no longer line up with the remaining spans --
    confirm whether callers rely on it afterwards.

    Args:
        span_sc: Span container exposing iteration over spans and an
            ``attrs`` mapping with a ``'scores'`` entry (a spaCy
            ``SpanGroup`` judging by the file's imports).

    Returns:
        None.
    """
    start_counts = Counter(spn.start for spn in span_sc)
    shared_starts = {
        start for start, count in start_counts.items() if count > 1
    }

    # start token -> (index, label, score) of the candidate currently kept
    kept_by_start = {}
    # A set, so a span hit by both rules is only scheduled once.
    doomed = set()

    for n, (spn, score) in enumerate(zip(span_sc, span_sc.attrs['scores'])):
        if spn.start in shared_starts:
            kept = kept_by_start.get(spn.start)
            if kept is None:
                kept_by_start[spn.start] = (n, spn.label_, score)
            else:
                kept_idx, kept_label, kept_score = kept
                if spn.label_ == kept_label and score > kept_score:
                    # Same label, better score: this span takes over.
                    doomed.add(kept_idx)
                    kept_by_start[spn.start] = (n, spn.label_, score)
                else:
                    doomed.add(n)

        # delete span beyond sentences
        if len(list(spn.sents)) > 1:
            doomed.add(n)

    del_spans(span_sc, sorted(doomed))


def cleanup_justify(doc, span_sc: "SpanGroup"):
    """Adjust the spans labelled ``JUSTIFYING`` in ``span_sc``.

    Steps:

    1. Index the ``JUSTIFYING`` spans by their ``root`` token. If several
       spans share a root, only the last one is tracked; earlier ones are
       left untouched.
    2. A tracked span whose root's ``head`` is itself the root of a tracked
       span is treated as dependent and scheduled for deletion.
    3. For every other tracked span, build a candidate covering
       ``root.left_edge`` through ``root.right_edge`` (inclusive) and label
       it ``JUSTIFYING``. If the candidate is not already in ``span_sc``,
       it is queued for insertion and the original span is scheduled for
       deletion; otherwise the original is kept.
    4. Pass the scheduled indexes to ``del_spans`` and then append the
       queued candidates to ``span_sc``.

    NOTE(review): if a span root can be its own ``head`` (as spaCy does for
    a sentence root -- confirm), it tests as dependent on itself in step 2
    and is removed without a replacement. Verify this is intended.

    Args:
        doc: Document that ``span_sc`` belongs to; sliced to build the
            candidate spans.
        span_sc: Span container to modify in place (a spaCy ``SpanGroup``
            judging by the ``extend`` call and the file's imports).

    Returns:
        None.
    """
    # root token -> index in span_sc of the JUSTIFYING span with that root
    idx_by_root = {}
    for idx, span in enumerate(span_sc):
        if span.label_ == 'JUSTIFYING':
            idx_by_root[span.root] = idx

    to_delete = []
    new_spans = []
    for root, idx in idx_by_root.items():
        # Dependent on another JUSTIFYING span: drop it outright.
        if root.head in idx_by_root:
            to_delete.append(idx)
            continue

        # Widen to everything between the root's left and right edges.
        widened = doc[root.left_edge.i:root.right_edge.i + 1]
        widened.label_ = "JUSTIFYING"

        # Replace the original only when the widened span is not already
        # present in the container.
        if widened not in span_sc:
            new_spans.append(widened)
            to_delete.append(idx)

    del_spans(span_sc, to_delete)

    span_sc.extend(SpanGroup(doc, spans=new_spans))