# Spaces: egumasa / engagement-analyzer-demo5 (status: Sleeping)
# File size: 41,305 Bytes
# 0146ef9 (presumably the commit hash of this file version)

from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable
import pandas as pd
import spacy
from spacy.language import Language
from skbio import diversity as dv

# Span attributes shown as table columns; ``simple_table`` uses this list as the
# default for its ``attrs`` parameter and reads each name from a span via getattr.
SPAN_ATTRS = ["text", "label_", "start", "end"]
# Span label names. Not referenced by the code visible in this part of the
# file; "CITATION" is the value ``construction_classifier`` compares against
# ``span.label_`` -- presumably this is the label set of the span model (confirm).
CATEGORIES = [
    "ATTRIBUTION",
    "CITATION",
    "COUNTER",
    "DENY",
    "ENDOPHORIC",
    "ENTERTAIN",
    "JUSTIFYING",
    "MONOGLOSS",
    "PROCLAIM",
    "SOURCES",
]


def simple_table(
    doc: Union[spacy.tokens.Doc, Dict[str, str]],
    spans_key: str = "sc",
    attrs: List[str] = SPAN_ATTRS,
):
    """Build the rows and column names of a span table for ``doc``.

    Each span stored under ``doc.spans[spans_key]`` becomes one row holding the
    string form of every attribute named in ``attrs``, followed by the span's
    confidence score taken from ``doc.spans[spans_key].attrs["scores"]``.
    Spans and scores are paired positionally; if the two sequences differ in
    length, the surplus items are dropped.

    Args:
        doc: Object exposing ``.spans``. NOTE(review): the annotation also
            admits a plain dict, but the body only works with objects that
            have a ``spans`` attribute -- confirm against callers.
        spans_key: Key of the span group inside ``doc.spans``.
        attrs: Names of the span attributes to include, in column order.

    Returns:
        A ``(rows, header)`` tuple: ``rows`` is a list of lists and ``header``
        is ``attrs`` followed by ``"Conf. score"``.

    Raises:
        KeyError: If ``spans_key`` is missing from ``doc.spans`` or the span
            group carries no ``"scores"`` entry.
    """
    header = attrs + ["Conf. score"]

    span_group = doc.spans[spans_key]
    confidences = doc.spans[spans_key].attrs["scores"]

    rows = []
    for span, score in zip(span_group, confidences):
        cells = [str(getattr(span, name)) for name in attrs]
        cells.append(score)  # the score is kept as-is, not stringified
        rows.append(cells)

    return rows, header


# def span_info_aggregator()


def construction_classifier(doc, span):
    """Assign a lexico-grammatical construction label to ``span``.

    The label is derived from the dependency relation, POS/tag, lemma and
    morphology of the span's syntactic root, from the root's children, and
    from the dependency labels and tags of the tokens inside the span.

    The rules are applied top to bottom and later rules overwrite the result
    of earlier ones, so the order of the ``if`` blocks is significant.

    Args:
        doc: Unused; kept so that existing callers do not have to change.
        span: A spaCy-style span: iterable over tokens and exposing ``root``,
            ``start``, ``end`` and ``label_``.

    Returns:
        str: The construction label, or the (conjunct-resolved) root's
        dependency label ``dep_`` when no rule matched.
    """
    subject_deps = ("nsubj", "nsubjpass", "csubj", "csubjpass")
    argument_deps = subject_deps + (
        "dobj",
        "ccomp",
        "xcomp",
        "dative",
        "attr",
        "oprd",
        "acomp",
    )
    participle_morphs = (
        "Aspect=Prog|Tense=Pres|VerbForm=Part",
        "Aspect=Perf|Tense=Past|VerbForm=Part",
    )

    category = None
    spanroot = span.root

    ## Grabbing lexico-grammatical information
    span_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in span]
    span_dep = [t.dep_ for t in span]
    span_tag = [t.tag_ for t in span]

    # Child information is taken from the span's own root, i.e. BEFORE the
    # conjunct resolution below, and is not refreshed afterwards.
    root_children = list(spanroot.children)
    c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in root_children]
    c_norm = [t.norm_ for t in root_children]
    c_dep = [t.dep_ for t in root_children]
    c_pos = [t.pos_ for t in root_children]
    c_tag = [t.tag_ for t in root_children]
    right_dep = [t.dep_ for t in spanroot.rights]

    has_it_subject = "it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_

    # conditionals
    subjless = all(dep not in subject_deps for dep in c_dep)
    argmentless = all(dep not in argument_deps for dep in c_dep)

    ## nesting classifiers
    # A conjunct is classified by the relation of the first conjunct it is
    # (transitively) attached to.
    while spanroot.dep_ == "conj":
        spanroot = spanroot.head

    ## Conjunctions
    # Preconjunctions
    if spanroot.dep_ in ["preconj", "cc"]:
        category = "Conjunction"

    ## NOUN PHRASES
    if spanroot.dep_ in ["amod"]:
        category = "Adjectival modifier"
    if spanroot.dep_ in ["compound"]:
        category = "Compound noun"

    ## Nominal category
    if spanroot.dep_ in ["pobj", "dobj", "obj", "iobj", "dative"]:
        if "acl" in c_dep:
            category = "Noun + Complement (Object)"
        else:
            category = "Object"

    if spanroot.dep_ in ["nsubj", "nsubjpass"]:
        if "acl" in c_dep:
            category = "Noun + Complement (Subject)"
        else:
            category = "Subject"

    ## ADJUNCTS
    # prep phrases
    if spanroot.dep_ in ["prep", "agent"]:
        category = "Prepositional phrase"
    # adverbial phrases
    if spanroot.dep_ in ["advmod", "npadvmod", "nmod", "npmod", "quantmod"]:
        category = "Adverbial phrase"

    ## Predication patterns
    if spanroot.dep_ in ["acomp", "oprd"]:
        if "xcomp" in c_dep:
            category = "Subject predicate to-cl"
        else:
            category = "Adjectival complement"

    if spanroot.dep_ in ["attr"]:
        # Re-evaluated on the (possibly conjunct-resolved) root; this value is
        # also the one seen by the "Main verb 5" rule further down.
        subjless = all(c.dep_ not in subject_deps for c in spanroot.children)

        c_head = [c.dep_ for c in spanroot.head.children]
        if "expl" in c_head and "no_det" in span_t_dep_:
            category = "There is/are no NOUN"
        elif "expl" in c_head and spanroot.pos_ in ["NOUN"]:
            category = "There is/are + Noun complement"
        elif "expl" in c_head and spanroot.tag_ in ["NN", "NNS"]:
            category = "There is/are + Noun complement"

        elif spanroot.pos_ in ["NOUN", "PRON"]:
            if "acl" in c_dep:
                category = "Noun + Complement (attr)"
            else:
                category = "Nominal complement"

        elif not subjless and spanroot.pos_ in ["VERB", "AUX"]:
            category = "Main verb 4"

        elif spanroot.tag_ in ["NNP"]:
            category = "Nominal complement"

    ####################################
    ### clausal ####
    ####################################
    if spanroot.dep_ in ["ROOT", "advcl", "ccomp", "acl", "pcomp", "relcl"]:
        # "to" attached as aux to an xcomp anywhere below the root
        _check_to = [
            c.dep_
            for c in spanroot.subtree
            if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"])
            and c.head.dep_ == "xcomp"
        ]
        # progressive (-ing) xcomp anywhere below the root
        _check_ing = [
            c.dep_
            for c in spanroot.subtree
            if "Prog" in str(c.morph) and c.dep_ == "xcomp"
        ]
        # True entries mark ccomp children that follow the root
        root_before_ccomp = [
            c.i > spanroot.i for c in spanroot.children if c.dep_ == "ccomp"
        ]

        _check_for_to = [
            "_".join([c.norm_, c.dep_])
            for c in spanroot.subtree
            if c.head.dep_ == "advcl" and (c.dep_ == "mark" or c.dep_ == "aux")
        ]
        # Does the span cover the root's whole subtree? ``span.end`` is
        # exclusive, so the last covered token has index ``span.end - 1``.
        entire_cl = (
            spanroot.left_edge.i == span.start
            and spanroot.right_edge.i == span.end - 1
        )

        ## Start with broad category, which is then re-evaluated for specific constructions.
        if spanroot.dep_ in ["advcl", "acl", "pcomp"]:
            ## Adverbial clauses
            # Re-evaluated on the (possibly conjunct-resolved) root.
            subjless = all(c.dep_ not in subject_deps for c in spanroot.children)

            ### Finite adverbial clauses
            if "mark" in span_dep and spanroot.pos_ in ["VERB", "AUX"]:
                category = "Finite adverbial clause"
            elif "mark" in span_dep and "aux" in span_dep:
                category = "Finite adverbial clause"

            elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
                if spanroot.pos_ in ["VERB", "AUX"]:
                    category = "Finite adverbial clause"

                elif spanroot.pos_ not in ["VERB", "AUX"] and subjless:
                    category = "Non-finite adv clause 1"

                elif entire_cl:
                    category = "Finite adverbial clause"

            ### Non-finite adverbial clauses
            elif str(spanroot.morph) in participle_morphs and "aux" not in c_dep:
                # he doing his job
                if argmentless:
                    # e.g., frankly speaking, strictly speaking
                    category = "Adverbial Phrase"
                else:
                    category = "Non-finite adv clause 2"

            elif (
                spanroot.pos_ not in ["VERB", "AUX"] and "mark" in span_dep and subjless
            ):
                category = "Non-finite adv clause 3"

            elif "aux" in c_dep and "TO" in c_tag:
                category = "Adverbial Phrase"

            elif "mark" not in span_dep and spanroot.pos_ in ["VERB", "AUX"]:
                category = "Dependent Verb phrase"

            elif not argmentless:
                category = "Adverbial clause"

            elif spanroot.dep_ == "advcl":
                category = "Adverbial phrase"

        if spanroot.dep_ in ["relcl", "ccomp", "acl"]:
            head = spanroot.head
            if ";" in [t.norm_ for t in head.children]:
                category = "Main verb 3"
            elif "nsubj" not in span_dep:
                category = "Dependent verb 1"
            elif "mark" in span_dep:
                category = "Complement clause"
            elif str(spanroot.morph) in participle_morphs and "aux" not in c_dep:
                category = "Non-finite complement clause"
            elif spanroot.dep_ in ["relcl"]:
                category = "Relative clause"
            elif spanroot.dep_ in ["ccomp"]:
                category = "Complement clause"
            else:  # only "acl" is left
                category = "Noun Complement clause"

        ## Specific constructions
        # Extraposed that-clause or to-infinitives
        if has_it_subject and spanroot.pos_ in ["VERB", "AUX"]:
            if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
                # eg it seems odd (oprd) that X.
                # eg it is certain (acomp) that X.
                category = "Extraposed that-cl (adj-complement)"

            elif "xcomp" in c_dep or ("advcl" in c_dep):
                if "for_mark" in _check_for_to:
                    category = "Extraposed to-cl (explicit subj)"
                elif _check_to:
                    category = "Extraposed to-cl 1"  # eg It is possible to .
                elif _check_ing:
                    category = "Extraposed -ing 1"
            elif (
                ("prep" in right_dep or "npadvmod" in right_dep)
                and "ccomp" in right_dep
                and spanroot.lemma_ == "be"
            ):
                category = "Cleft construction"

            elif "attr" in c_dep:
                category = "Extraposed that-cl (copula)"  # eg It is a wonder that X.

            else:
                category = "Extraposed that-cl (VERB)"

        elif has_it_subject and "acomp" in c_dep:
            if "xcomp" in c_dep:
                if _check_to:
                    category = "Extraposed to-cl 2"  # eg it is difficult to decide.
                elif _check_ing:
                    category = "Extraposed -ing 2"

            else:
                category = "Extraposed that-cl (adj-complement) 2"

        elif has_it_subject and "oprd" in c_dep:
            category = "Extraposed that-cl (adj-complement) 3"  # eg it seems odd that X.

        # something without dummy subject "it"
        elif (
            (("nsubj" in c_dep and spanroot.lemma_ in ["be"]) or "nsubjpass" in c_dep)
            and spanroot.pos_ in ["AUX", "VERB"]
            and "it" not in c_norm
        ):
            # store xcomp, if the head of the xcomp is acomp
            _check_xcomp = [
                c.dep_
                for c in spanroot.subtree
                if c.dep_ in ["xcomp"] and c.head.dep_ == "acomp"
            ]
            _check_ccomp = [
                c.dep_
                for c in spanroot.subtree
                if c.dep_ in ["ccomp"] and c.head.dep_ == "acomp"
            ]

            if ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in c_dep:
                if any(root_before_ccomp):
                    category = "Post-predicate that-cl"
                else:
                    category = "Comment clause"

            elif ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in _check_ccomp:
                category = "Post-predicate that-cl 2"

            elif ("attr" in c_dep or "acomp" in c_dep) and "xcomp" in _check_xcomp:
                category = "Post-predicate to-cl"

            elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_to:
                category = "Subject predicate to-cl"

            elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
                category = "Subject predicate to-cl (passive)"

            elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_ing:
                category = "Subject predicate -ing"
            elif "ccomp" in c_dep:
                category = "Subject predicate that-cl"
            elif "acomp" in c_dep:
                category = "Adjectival predicate"

            elif "mark" in c_dep and ("nsubj" in c_dep or "nsubjpass" in c_dep):
                category = "Finite-adverbial clause"
            else:
                category = "Main verb 1"

        ## without dummy subject it, and lexical verbs
        # The subject test must be a plain boolean; testing that boolean for
        # membership in ``c_dep`` would never be true.
        elif (
            ("nsubj" in c_dep or "nsubjpass" in c_dep)
            and spanroot.pos_ in ["AUX", "VERB"]
            and "it" not in c_norm
            and spanroot.lemma_ not in ["be"]
        ):
            _check_wh = [
                c.dep_
                for c in spanroot.subtree
                if (
                    c.dep_ in ["attr", "advmod", "dobj", "nsubj"]
                    and c.tag_ in ["WP", "WRB", "WDT", "WP$"]
                )
                and c.head.dep_ == "ccomp"
            ]
            _check_if = [
                c.dep_
                for c in spanroot.subtree
                if (c.dep_ in ["mark"] and c.norm_ in ["whether", "if"])
                and c.head.dep_ == "ccomp"
            ]

            if "ccomp" in c_dep and (_check_wh or _check_if):
                category = "Post-predicate wh-cl"

            elif "ccomp" in c_dep:
                if any(root_before_ccomp):
                    category = "Post-predicate that-cl"
                else:
                    category = "Comment clause"

            elif "xcomp" in c_dep:
                if _check_to:
                    category = "Post-predicate to-cl"
                elif _check_ing:
                    category = "Post-predicate -ing"

        # Existential
        elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
            category = "There is/are NOUN"

        elif (
            "ccomp" in c_dep and "it_nsubj" in span_t_dep_ and spanroot.pos_ in ["AUX"]
        ):
            category = "Cleft construction"

    if spanroot.dep_ in ["parataxis"]:
        if "_".join(span_dep) in [
            "nsubj_parataxis",
            "aux_parataxis",
            "nsubj_aux_parataxis",
        ]:
            category = "Comment clause"
        else:
            category = "parataxis (for now)"

    ## External comp
    if spanroot.dep_ in ["xcomp"]:
        if spanroot.head.pos_ == "ADJ" and "to_aux" in c_t_dep_:
            category = "Adjective complement to-cl"
        if spanroot.head.pos_ == "VERB" and "to_aux" in c_t_dep_:
            category = "Verb complement to-cl"

    if spanroot.dep_ in ["pcomp"]:
        if (
            str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]
            and "ccomp" in c_dep
        ):
            category = "Participle + that-cl"
        elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
            category = "Participle"

    ## Simple classifier
    if spanroot.dep_ in ["neg"]:
        category = "Negative particle"
    if spanroot.dep_ in ["aux", "auxpass"]:
        category = "Auxiliary"

    # Modal verbs
    if spanroot.tag_ == "MD":
        category = "Modal auxiliary"

    if spanroot.dep_ in ["dep", "csubj", "csubjpass"]:
        if (
            spanroot.head.dep_ in ["ROOT", "ccomp"]
            and spanroot.head.pos_ in ["AUX", "VERB"]
            and spanroot.pos_ in ["AUX", "VERB"]
        ):
            if spanroot.morph == spanroot.head.morph:
                category = "Main verb 4"
            else:
                category = "Dependent verb 2"
        elif str(spanroot.morph) == "Aspect=Prog|Tense=Pres|VerbForm=Part":
            category = "Gerund"
        elif spanroot.head.dep_ in ["conj", "acl", "relcl"]:
            if spanroot.morph == spanroot.head.morph:
                category = "Main verb 4"
            else:
                category = "Dependent verb 2"
        elif "VerbForm=Fin" in str(spanroot.morph):
            category = "Dependent verb 2"

    # Appositive phrases
    if spanroot.dep_ in ["appos"]:
        if "nummod" in c_dep:
            category = "Apposition"
        elif spanroot.pos_ in ["PROPN"]:
            category = "Appositive Proper Nouns"
        elif spanroot.pos_ in ["NOUN"]:
            category = "Appositive Noun Phrase"
        elif spanroot.pos_ in ["VERB", "AUX"]:
            _check = any(c.dep_ in subject_deps for c in spanroot.children)
            if _check:
                category = "Appositive Finite-clause"

    if spanroot.dep_ in ["appos", "dep", "attr"]:
        if not subjless and spanroot.pos_ in ["VERB", "AUX"]:
            category = "Main verb 5"

    if spanroot.dep_ in ["dep", "mark"]:
        if spanroot.tag_ in ["RB", "IN", "CC"]:
            category = "Conjunction"

    # sometimes the extra-clausal links are not accurate
    # NOTE(review): this unconditionally replaces whatever the aux / oprd /
    # appos / xcomp rules above assigned -- confirm that this is intended.
    if spanroot.dep_ in ["aux", "auxpass", "oprd", "appos", "xcomp"]:
        if spanroot.head.dep_ == "ROOT":
            category = "Main verb"
        else:
            category = "dependent verb 5"

    # Citation spans get a citation-specific label regardless of syntax.
    if span.label_ == "CITATION":
        if "NNP" in span_tag or "NNPS" in span_tag:
            if span_dep[0] == "punct" and span_dep[-1] == "punct":
                category = "Parenthetical Citation"
            elif span_tag[0] in ["NNP", "NNPS"]:
                category = "Narrative Citation"
        else:
            category = "Other Citation"

    # Fall back to the raw dependency label when no rule fired.
    if category is None:
        category = spanroot.dep_

    return category


def construction_classifier2(doc, span):
    category = None
    spanroot = span.root

    ## Grabbing lexico-grammatical information
    span_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in span]
    span_dep = [t.dep_ for t in span]
    span_token = [t.norm_ for t in span]
    span_tag = [t.tag_ for t in span]

    c = [c for c in spanroot.children]
    c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in spanroot.children]

    c_norm = [c.norm_ for c in spanroot.children]
    c_dep = [c.dep_ for c in spanroot.children]
    c_pos = [c.pos_ for c in spanroot.children]
    c_tag = [c.tag_ for c in spanroot.children]

    right_dep = [c.dep_ for c in spanroot.rights]

    # conditionals
    subjless = all(
        c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
        for c in spanroot.children
    )
    argmentless = all(
        c.dep_
        not in [
            "nsubj",
            "nsubjpass",
            "csubj",
            "csubjpass",
            "dobj",
            "ccomp",
            "xcomp",
            "dative",
            "attr",
            "oprd",
            "acomp",
        ]
        for c in spanroot.children
    )
    argless_span = all(
        c.dep_
        not in [
            "nsubj",
            "nsubjpass",
            "csubj",
            "csubjpass",
            "dobj",
            "ccomp",
            "xcomp",
            "dative",
            "attr",
            "oprd",
            "acomp",
        ]
        for c in span
    )
    argless_span = all(
        c.dep_
        not in [
            "nsubj",
            "nsubjpass",
            "csubj",
            "csubjpass",
            "dobj",
            "ccomp",
            "xcomp",
            "dative",
            "attr",
            "oprd",
            "acomp",
        ]
        for c in span
    )

    ## nesting classifiers
    if spanroot.dep_ == "conj":
        while spanroot.dep_ == "conj":
            spanroot = spanroot.head

    if spanroot.dep_ == "poss":
        head = spanroot.head
        if head.dep_ in ["pobj", "dobj", "obj", "iobj", "dative"]:
            category = "Posessive Noun (Object)"
        elif head.dep_ in ["nsubj", "nsubjpass"]:
            category = "Posessive Noun (Subject)"
        else:
            category = "Posessive Noun (Other)"

    ## Conjunctions
    # Preconjunctions
    if spanroot.dep_ in ["preconj", "cc"]:
        category = "Conjunction"

    ## NOUN PHRASES
    # adverbial phrases
    if spanroot.dep_ in ["amod"]:
        category = "Adjectival modifier"
        # adverbial phrases
    if spanroot.dep_ in ["compound"]:
        category = "Compound noun"

    ## Nominal category
    if spanroot.dep_ in ["pobj", "dobj", "obj", "iobj", "dative"]:
        if "acl" in c_dep:
            category = "Noun + Complement (Object)"
        else:
            category = "Object"

    if spanroot.dep_ in ["nsubj", "nsubjpass"]:
        if "acl" in c_dep:
            category = "Noun + Complement (Subject)"
        else:
            category = "Subject"

    ## ADJUNCTS
    # prep phrases
    if spanroot.dep_ in ["prep", "agent"]:
        category = "Prepositional phrase"

    # adverbial phrases
    if spanroot.dep_ in ["advmod", "npadvmod", "nmod", "npmod", "quantmod", "nummod"]:
        category = "Adverbial phrase"

    ## Predication patterns
    if spanroot.dep_ in ["acomp", "oprd"]:
        if "xcomp" in c_dep:
            category = "Subject predicate to-cl"
        else:
            category = "Adjectival complement"

    if spanroot.dep_ in ["attr"]:
        subjless = all(
            c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
            for c in spanroot.children
        )

        c_head = [c.dep_ for c in spanroot.head.children]
        if "expl" in c_head and "no_det" in span_t_dep_:
            category = "There is/are no NOUN"
        elif "expl" in c_head and spanroot.pos_ in ["NOUN"]:
            category = "There is/are + Noun complement"
        elif "expl" in c_head and spanroot.tag_ in ["NN", "NNS"]:
            category = "There is/are + Noun complement"

        elif spanroot.pos_ in ["NOUN", "PRON"]:
            if "acl" in c_dep:
                category = "Noun + Complement (attr)"
            else:
                category = "Nominal complement"

        elif not subjless and spanroot.pos_ in ["VERB", "AUX"]:
            category = "Main verb 4"

        elif spanroot.tag_ in ["NNP"]:
            category = "Nominal complement"

    ## External comp
    if spanroot.dep_ in ["xcomp"]:
        if spanroot.head.pos_ == "ADJ" and "to_aux" in c_t_dep_:
            category = "Adjective complement to-cl"
        if spanroot.head.pos_ == "VERB" and "to_aux" in c_t_dep_:
            category = "Verb complement to-cl"

    if spanroot.dep_ in ["pcomp"]:
        if (
            str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]
            and "ccomp" in c_dep
        ):
            category = "Participle + that-cl"
        elif str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
            category = "Participle"

    ## Simple classifier
    # if spanroot.dep_ in ['pcomp']:
    #     if str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
    #         category = "Gerund"

    if spanroot.dep_ in ["neg"]:
        category = "Negative particle"
    if spanroot.dep_ in ["aux", "auxpass"]:
        category = "Auxiliary"

    # Modal verbs
    if spanroot.tag_ == "MD":
        category = "Modal auxiliary"

    ####################################
    ### clausal ####
    ####################################
    if spanroot.dep_ in ["ROOT", "advcl", "ccomp", "acl", "pcomp", "relcl", "punct"]:
        _check_to = [
            c.dep_
            for c in spanroot.subtree
            if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"])
            and c.head.dep_ == "xcomp"
        ]
        _check_ing = [
            c.dep_
            for c in spanroot.subtree
            if "Prog" in str(c.morph) and c.dep_ == "xcomp"
        ]
        root_before_ccomp = [
            c.i > spanroot.i for c in spanroot.children if c.dep_ == "ccomp"
        ]

        _check_for_to = [
            "_".join([c.norm_, c.dep_])
            for c in spanroot.subtree
            if c.head.dep_ == "advcl" and (c.dep_ == "mark" or c.dep_ == "aux")
        ]
        entire_cl = (
            spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
        )

        ## Start with broad category, which is then re-evaluated for specific constructions.
        if spanroot.dep_ in ["advcl", "acl", "punct", "pcomp"]:  #'mark',
            ## Adverbial clauses
            subjless = all(
                c.dep_ not in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
                for c in spanroot.children
            )
            entire_cl = (
                spanroot.left_edge.i == span.start and spanroot.right_edge.i == span.end
            )

            ### Finite-adverbial clauses
            if "mark" in span_dep and (
                spanroot.pos_ in ["VERB", "AUX"] or "aux" in span_dep
            ):
                category = "Finite adverbial clause"

            elif "mark" in span_dep and "aux" in span_dep:
                category = "Finite adverbial clause"

            elif (
                "mark" in span_dep
                and spanroot.pos_ in ["VERB", "AUX"]
                and "expl" in c_dep
            ):
                category = "Finite adverbial clause"

            elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
                if spanroot.pos_ in ["VERB", "AUX"]:
                    category = "Finite adverbial clause"

                elif spanroot.pos_ not in ["VERB", "AUX"] and subjless:
                    category = "Non-finite adv clause 1"

                elif not argmentless:
                    category = "Finite adverbial clause"

            ## non-finite
            elif (
                str(spanroot.morph)
                in [
                    "Aspect=Prog|Tense=Pres|VerbForm=Part",
                    "Aspect=Perf|Tense=Past|VerbForm=Part",
                ]
                and "aux" not in c_dep
            ):
                # he doing his job
                if argmentless:
                    # e.g., frankly speaking, strictly speaking
                    category = "Adverbial Phrase"
                else:
                    category = "Non-finite adv clause 2"

            elif (
                spanroot.pos_ not in ["VERB", "AUX"] and "mark" in span_dep and subjless
            ):
                category = "Non-finite adv clause 3"

            elif "aux" in c_dep and "TO" in c_tag:
                category = "Adverbial Phrase"

            elif "mark" not in span_dep and spanroot.pos_ in ["VERB", "AUX"]:
                category = "Dependent Verb phrase"

            elif not argmentless:
                category = "Adverbial clause"

            elif spanroot.dep_ == "advcl":
                category = "Adverbial phrase"

            else:
                category = "Finite adverbial clause "

        if spanroot.dep_ in ["relcl", "ccomp", "acl", "punct", "pcomp"]:
            head = spanroot.head
            if ";" in [t.norm_ for t in head.children]:
                category = "Main verb 3"

            elif "nsubj" not in span_dep:
                category = "Dependent verb 1"

            elif "mark" in span_dep:
                category = "Complement clause"
            elif (
                str(spanroot.morph)
                in [
                    "Aspect=Prog|Tense=Pres|VerbForm=Part",
                    "Aspect=Perf|Tense=Past|VerbForm=Part",
                ]
                and "aux" not in c_dep
            ):
                category = "Non-finite complement clause"
            elif spanroot.dep_ in ["relcl"]:
                category = "Relative clause"
            elif spanroot.dep_ in ["ccomp"]:
                category = "Complement clause"
            elif spanroot.dep_ in ["acl"]:
                category = "Noun Complement clause"

        ## Specific constructions
        # Extraposed that-clause or to-infinitives
        if ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and spanroot.pos_ in [
            "VERB",
            "AUX",
        ]:
            # print(c_dep)
            if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
                # eg it seems odd (oprd) that X.
                # eg it is certain (acomp) that X.
                category = (
                    "Extraposed that-cl (adj-complement)"  # e.g., it is certain that X.
                )

            elif "xcomp" in c_dep or ("advcl" in c_dep):
                if "for_mark" in _check_for_to:
                    category = (
                        "Extraposed to-cl (explicit subj)"  # eg It is possible to .
                    )
                elif _check_to:
                    category = "Extraposed to-cl 1"  # eg It is possible to .
                elif _check_ing:
                    category = "Extraposed -ing 1"  # eg It is possible to .
            elif (
                ("prep" in right_dep or "npadvmod" in right_dep)
                and "ccomp" in right_dep
                and spanroot.lemma_ == "be"
            ):
                category = "Cleft construction"

            elif "attr" in c_dep:
                category = "Extraposed that-cl (copula)"  # eg It is a wonder that X.

            else:
                category = "Extraposed that-cl (VERB)"

        # if "ccomp" in c_dep and "auxpass" in c_dep and ("it_nsubjpass" in span_t_dep_ or "it_nsubj" in span_t_dep_):
        #     category = "Extraposed that-cl (VERB)1" #e.g., it has been shown that X.
        elif (
            "it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_
        ) and "acomp" in c_dep:
            if "xcomp" in c_dep:
                if _check_to:
                    category = "Extraposed to-cl 2"  # eg it is difficult to decide.
                elif _check_ing:
                    category = "Extraposed -ing 2"  # eg it is difficult to decide.

            else:
                category = "Extraposed that-cl (adj-complement) 2"

        elif ("it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_) and "oprd" in c_dep:
            category = (
                "Extraposed that-cl (adj-complement) 3"  # eg it seems odd that X.
            )

        # something without dummy subject "it"
        elif (
            (("nsubj" in c_dep and spanroot.lemma_ in ["be"]) or "nsubjpass" in c_dep)
            and spanroot.pos_ in ["AUX", "VERB"]
            and "it" not in c_norm
        ):
            # store xcomp, if the head of the xcomp is acomp
            _check_xcomp = [
                c.dep_
                for c in spanroot.subtree
                if c.dep_ in ["xcomp"] and c.head.dep_ == "acomp"
            ]
            _check_ccomp = [
                c.dep_
                for c in spanroot.subtree
                if c.dep_ in ["ccomp"] and c.head.dep_ == "acomp"
            ]
            # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
            # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]

            if ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in c_dep:
                if any(root_before_ccomp):
                    category = "Post-predicate that-cl"
                else:
                    category = "Comment clause"

            elif ("attr" in c_dep or "acomp" in c_dep) and "ccomp" in _check_ccomp:
                category = "Post-predicate that-cl 2"

            elif ("attr" in c_dep or "acomp" in c_dep) and "xcomp" in _check_xcomp:
                category = "Post-predicate to-cl"

            elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_to:
                category = "Subject predicate to-cl"

            elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
                category = "Subject predicate to-cl (passive)"

            elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_ing:
                category = "Subject predicate -ing"
            elif "ccomp" in c_dep:
                category = "Subject predicate that-cl"
            elif "acomp" in c_dep:
                category = "Adjectival predicate"

            elif "mark" in c_dep and ("nsubj" in c_dep or "nsubjpass" in c_dep):
                category = "Finite-adverbial clause"
            elif not argmentless and "SCONJ" in c_pos:
                category = "Finite-adverbial clause"
            else:
                category = "Main verb 1"

        ## without dummy subject it, and lexical verbs
        elif (
            ("nsubj" in c_dep or "nsubjpass" in c_dep) in c_dep
            and spanroot.pos_ in ["AUX", "VERB"]
            and "it" not in c_norm
            and spanroot.lemma_ not in ["be"]
        ):
            _check_wh = [
                c.dep_
                for c in spanroot.subtree
                if (
                    c.dep_ in ["attr", "advmod", "dobj", "nsubj"]
                    and c.tag_ in ["WP", "WRB", "WDT", "WP$"]
                )
                and c.head.dep_ == "ccomp"
            ]
            _check_if = [
                c.dep_
                for c in spanroot.subtree
                if (c.dep_ in ["mark"] and c.norm_ in ["whether", "if"])
                and c.head.dep_ == "ccomp"
            ]

            # _check_to = [c.dep_ for c in spanroot.subtree if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"]) and c.head.dep_ == "xcomp"]
            # _check_ing = [c.dep_ for c in spanroot.subtree if "Prog" in str(c.morph) and c.dep_ == "xcomp"]

            if "ccomp" in c_dep and (_check_wh or _check_if):
                category = "Post-predicate wh-cl"

            elif "ccomp" in c_dep:
                if any(root_before_ccomp):
                    category = "Post-predicate that-cl"
                else:
                    category = "Comment clause"

            elif "xcomp" in c_dep:
                if _check_to:
                    category = "Post-predicate to-cl"
                elif _check_ing:
                    category = "Post-predicate -ing"

        # Existential
        elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
            category = "There is/are NOUN"

        elif (
            "ccomp" in c_dep and "it_nsubj" in span_t_dep_ and spanroot.pos_ in ["AUX"]
        ):
            category = "Cleft construction"

        ### The end of clausal analysis

    if spanroot.dep_ in ["parataxis"]:
        if "_".join(span_dep) in [
            "nsubj_parataxis",
            "aux_parataxis",
            "nsubj_aux_parataxis",
        ]:
            category = "Comment clause"
        else:
            category = "Parataxis"

    if spanroot.dep_ in ["dep", "csubj", "csubjpass"]:
        if (
            spanroot.head.dep_ in ["ROOT", "ccomp"]
            and spanroot.head.pos_ in ["AUX", "VERB"]
            and spanroot.pos_ in ["AUX", "VERB"]
        ):
            if spanroot.morph == spanroot.head.morph:
                category = "Main verb 4"
            else:
                category = "Dependent verb 2"
        elif str(spanroot.morph) == "Aspect=Prog|Tense=Pres|VerbForm=Part":
            category = "Gerund"
        elif "VerbForm=Fin" in str(spanroot.morph) or "VerbForm=Inf" in str(
            spanroot.morph
        ):
            category = "Dependent verb 2"
        elif spanroot.dep_ in ["csubj", "csubjpass"]:
            category = "Dependent verb (csubj)"

    # Appositive phrases
    if spanroot.dep_ in ["appos"]:
        if "nummod" in c_dep:
            category = "Apposition"
        if spanroot.pos_ in ["PROPN"]:
            category = "Appositive Proper Nouns"
        elif spanroot.pos_ in ["NOUN"]:
            category = "Appositive Noun Phrase"
        elif spanroot.pos_ in ["VERB", "AUX"]:
            _check = any(
                c.dep_ in ["nsubj", "nsubjpass", "csubj", "csubjpass"]
                for c in spanroot.children
            )
            if _check:
                category = "Appositive Finite-clause"

    if spanroot.dep_ in ["appos", "dep", "attr"]:
        if not subjless and spanroot.pos_ in ["VERB", "AUX"]:
            category = "Main verb (likely parsing error)"

    # sometimes the dep are on the conjunctions
    if spanroot.dep_ in ["dep", "mark"]:
        if spanroot.tag_ in ["RB", "IN", "CC"]:
            category = "Conjunction"

    if spanroot.dep_ in ["intj"]:
        category = "Introjection"

    # sometimes the extra-clausal links are not accurate
    if (
        spanroot.dep_
        in ["aux", "auxpass", "oprd", "appos", "xcomp", "attr", "dep", "meta", "prt"]
        and category == None
    ):
        if spanroot.head.dep_ == "ROOT":
            category = "Main verb"
        else:
            category = "dependent verb 5"

    if span.label_ == "CITATION":
        if "NNP" in span_tag or "NNPS" in span_tag:
            if span_dep[0] == "punct" and span_dep[-1] == "punct":
                category = "Parenthetical Citation"
            elif span_tag[0] in ["NNP", "NNPS"]:
                category = "Narrative Citation"
        else:
            category = "Other Citation"

    if category == None:
        category = spanroot.dep_

    return category


def const_table(
    doc: Union[spacy.tokens.Doc, Dict[str, str]],
    spans_key: str = "sc",
    attrs: List[str] = SPAN_ATTRS,
):
    """Build one table row per span in ``doc.spans[spans_key]``.

    Each row starts with the stringified span attributes named in ``attrs``,
    followed by the span's confidence score, its sentence index, the label
    returned by ``construction_classifier2`` and several dependency / tag /
    morphology descriptors of the span and its root token.

    NOTE(review): the annotation admits a ``dict`` for ``doc``, but the body
    reads ``doc.sents`` and ``doc.spans`` as attributes -- confirm whether a
    dict is ever passed here.

    Args:
        doc: Parsed document holding the span group and sentence boundaries.
        spans_key: Key of the span group inside ``doc.spans``.
        attrs: Span attribute names emitted (as strings) at the start of
            every row.

    Returns:
        A ``(data, columns)`` tuple: ``data`` is a list of rows (lists) and
        ``columns`` the matching list of column names.
    """
    extra_columns = [
        "Conf. score",
        "sent no.",
        "grammatical realization",
        "span dep",
        "ner",
        "POS",
        "span dep seq",
        "TAG sequence",
        "POS sequence",
        "head",
        "head dep",
        "children",
        "morphology",
        "sent",
    ]
    columns = attrs + extra_columns

    # Map every sentence span to its position in the document so each span
    # can be reported with the number of the sentence it belongs to.
    sentence_index = {sent: position for position, sent in enumerate(doc.sents)}

    span_group = doc.spans[spans_key]
    rows = []
    for span, score in zip(span_group, span_group.attrs["scores"]):
        root = span.root
        sentence = span.sent

        row = [str(getattr(span, name)) for name in attrs]
        row += [
            score,
            int(sentence_index[sentence]),
            construction_classifier2(doc, span),
            root.dep_,
            root.ent_type_,
            root.tag_,
            "_".join(token.dep_ for token in span),
            "_".join(token.tag_ for token in span),
            "_".join(token.pos_ for token in span),
            root.head.norm_,
            root.head.dep_,
            "_".join(child.dep_ for child in root.children),
            str(root.morph),
            sentence.text.strip(),
        ]
        rows.append(row)

    return rows, columns


def ngrammar(seq: list, n=2, concat=False, sep="-"):
    """Return the contiguous ``n``-item windows of ``seq``, in order.

    Args:
        seq: Sequence to slide over; it must support ``len`` and slicing.
        n: Window size.
        concat: When true, each window is joined into a single string with
            ``sep`` (so the items must be strings); otherwise each window is
            returned as the slice itself.
        sep: Separator used when ``concat`` is true.

    Returns:
        A list of windows. For a positive ``n`` the list is empty when
        ``seq`` holds fewer than ``n`` items.
    """
    total = len(seq)
    # Keep only the start positions whose window fits entirely inside ``seq``.
    windows = [
        seq[start : start + n] for start in range(total) if start + n <= total
    ]
    if concat:
        return [sep.join(window) for window in windows]
    return windows


def diversity_values(count_vec: list):
    """Compute a set of alpha-diversity measures for a vector of counts.

    Args:
        count_vec: Sized collection of counts. When it is empty, a vector of
            ten zeros is used instead (presumably one slot per entry of
            ``CATEGORIES`` -- confirm).

    Returns:
        A dict with the keys ``"shannon"`` (computed with ``base=2``),
        ``"brillouin_d"``, ``"simpson_d"`` (one minus ``dv.alpha.simpson``)
        and ``"simpson_e"``.
    """
    counts = list(count_vec) if len(count_vec) else [0] * 10

    # result['gini_index'] = dv.alpha.gini_index(list(count_vec))
    # result['faith_pd'] = dv.alpha.faith_pd(list(count_vec))
    return {
        "shannon": dv.alpha.shannon(counts, base=2),
        "brillouin_d": dv.alpha.brillouin_d(counts),
        "simpson_d": 1 - dv.alpha.simpson(counts),
        "simpson_e": dv.alpha.simpson_e(counts),
    }