from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable

import pandas as pd
import spacy
from spacy.language import Language
from skbio import diversity as dv

SPAN_ATTRS = ["text", "label_", "start", "end"]

CATEGORIES = [
    "ATTRIBUTION",
    "CITATION",
    "COUNTER",
    "DENY",
    "ENDOPHORIC",
    "ENTERTAIN",
    "JUSTIFYING",
    "MONOGLOSS",
    "PROCLAIM",
    "SOURCES",
]

# Dependency labels / POS tags / morphology strings shared by the classifiers.
_SUBJECT_DEPS = ("nsubj", "nsubjpass", "csubj", "csubjpass")
_ARGUMENT_DEPS = _SUBJECT_DEPS + (
    "dobj",
    "ccomp",
    "xcomp",
    "dative",
    "attr",
    "oprd",
    "acomp",
)
_OBJECT_DEPS = ("pobj", "dobj", "obj", "iobj", "dative")
_VERBAL_POS = ("VERB", "AUX")
_PRES_PARTICIPLE = "Aspect=Prog|Tense=Pres|VerbForm=Part"
_PARTICIPLE_MORPHS = (_PRES_PARTICIPLE, "Aspect=Perf|Tense=Past|VerbForm=Part")
_COMMENT_CLAUSE_PATTERNS = (
    "nsubj_parataxis",
    "aux_parataxis",
    "nsubj_aux_parataxis",
)


def simple_table(
    doc: Union[spacy.tokens.Doc, Dict[str, str]],
    spans_key: str = "sc",
    attrs: List[str] = SPAN_ATTRS,
):
    """Build a flat table of the spans stored under ``doc.spans[spans_key]``.

    Args:
        doc: Document whose span group carries a ``"scores"`` entry in its
            ``attrs`` mapping, aligned with the spans.
        spans_key: Key of the span group to read.
        attrs: Span attribute names to emit, one column each (stringified).

    Returns:
        ``(data, columns)``: one row per span (attribute strings followed by
        the raw score) and the matching column names.
    """
    columns = attrs + ["Conf. score"]
    spans = doc.spans[spans_key]
    data = [
        [str(getattr(span, attr)) for attr in attrs] + [score]
        for span, score in zip(spans, spans.attrs["scores"])
    ]
    return data, columns


def _lacks_dep(tokens, deps):
    """Return True when no token in *tokens* has a dependency label in *deps*."""
    return all(t.dep_ not in deps for t in tokens)


def _norm_dep_pairs(tokens):
    """Return ``"<norm>_<dep>"`` strings for *tokens* (e.g. ``"it_nsubj"``)."""
    return ["_".join([t.norm_, t.dep_]) for t in tokens]


def _skip_conjuncts(token):
    """Climb ``conj`` links so a conjunct is classified like its first conjunct."""
    while token.dep_ == "conj":
        token = token.head
    return token


def _phrase_level_category(spanroot, c_dep, category, adverbial_deps):
    """Classify phrase-level roots (conjunctions, nominals, adjuncts, predicates).

    The dependency lists are mutually exclusive, so at most one rule applies.
    Returns *category* unchanged when no rule matches.
    """
    dep = spanroot.dep_
    if dep in ("preconj", "cc"):
        return "Conjunction"
    if dep == "amod":
        return "Adjectival modifier"
    if dep == "compound":
        return "Compound noun"
    if dep in _OBJECT_DEPS:
        return "Noun + Complement (Object)" if "acl" in c_dep else "Object"
    if dep in ("nsubj", "nsubjpass"):
        return "Noun + Complement (Subject)" if "acl" in c_dep else "Subject"
    if dep in ("prep", "agent"):
        return "Prepositional phrase"
    if dep in adverbial_deps:
        return "Adverbial phrase"
    if dep in ("acomp", "oprd"):
        return "Subject predicate to-cl" if "xcomp" in c_dep else "Adjectival complement"
    return category


def _attr_category(spanroot, span_t_dep_, c_dep, subjless, category):
    """Classify a root attached as ``attr`` (existentials, nominal complements)."""
    existential = "expl" in [c.dep_ for c in spanroot.head.children]
    if existential and "no_det" in span_t_dep_:
        return "There is/are no NOUN"
    if existential and (spanroot.pos_ == "NOUN" or spanroot.tag_ in ("NN", "NNS")):
        return "There is/are + Noun complement"
    if spanroot.pos_ in ("NOUN", "PRON"):
        return "Noun + Complement (attr)" if "acl" in c_dep else "Nominal complement"
    if not subjless and spanroot.pos_ in _VERBAL_POS:
        return "Main verb 4"
    if spanroot.tag_ == "NNP":
        return "Nominal complement"
    return category


def _complement_and_auxiliary_category(spanroot, c_t_dep_, c_dep, category):
    """Classify xcomp / pcomp / neg / aux roots and modal (``MD``) tokens."""
    dep = spanroot.dep_
    if dep == "xcomp":
        if "to_aux" in c_t_dep_:
            if spanroot.head.pos_ == "ADJ":
                category = "Adjective complement to-cl"
            elif spanroot.head.pos_ == "VERB":
                category = "Verb complement to-cl"
    elif dep == "pcomp":
        if str(spanroot.morph) == _PRES_PARTICIPLE:
            category = "Participle + that-cl" if "ccomp" in c_dep else "Participle"
    elif dep == "neg":
        category = "Negative particle"
    elif dep in ("aux", "auxpass"):
        category = "Auxiliary"
    # The modal check is keyed on the tag, so it applies whatever the dep is.
    if spanroot.tag_ == "MD":
        category = "Modal auxiliary"
    return category


def _adverbial_clause_category(
    spanroot,
    span_dep,
    span_tag,
    c_dep,
    c_tag,
    subjless,
    argmentless,
    finite_override,
    category,
    fallback=None,
):
    """Give an adverbial-type clause its broad finite / non-finite label.

    Args:
        finite_override: When true (and no earlier rule fired) the clause is
            labelled finite before the non-finite rules are tried.
        fallback: Label used when no rule matches; ``None`` keeps *category*.
    """
    # NOTE(review): the finite_override rule is placed in the outer chain,
    # directly after the wh-adverb rule -- confirm against the intended nesting.
    verbal = spanroot.pos_ in _VERBAL_POS
    has_mark = "mark" in span_dep

    if has_mark and (verbal or "aux" in span_dep):
        return "Finite adverbial clause"
    if "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
        if verbal:
            return "Finite adverbial clause"
        if subjless:
            return "Non-finite adv clause 1"
        return category
    if finite_override:
        return "Finite adverbial clause"
    if str(spanroot.morph) in _PARTICIPLE_MORPHS and "aux" not in c_dep:
        # Bare participles ("frankly speaking") vs. participles with arguments.
        return "Adverbial Phrase" if argmentless else "Non-finite adv clause 2"
    if not verbal and has_mark and subjless:
        return "Non-finite adv clause 3"
    if "aux" in c_dep and "TO" in c_tag:
        return "Adverbial Phrase"
    if not has_mark and verbal:
        return "Dependent Verb phrase"
    if not argmentless:
        return "Adverbial clause"
    if spanroot.dep_ == "advcl":
        return "Adverbial phrase"
    return category if fallback is None else fallback


def _embedded_clause_category(spanroot, span_dep, c_dep, category):
    """Give relative / complement / noun-complement clauses a broad label."""
    if ";" in [t.norm_ for t in spanroot.head.children]:
        return "Main verb 3"
    if "nsubj" not in span_dep:
        return "Dependent verb 1"
    if "mark" in span_dep:
        return "Complement clause"
    if str(spanroot.morph) in _PARTICIPLE_MORPHS and "aux" not in c_dep:
        return "Non-finite complement clause"
    if spanroot.dep_ == "relcl":
        return "Relative clause"
    if spanroot.dep_ == "ccomp":
        return "Complement clause"
    if spanroot.dep_ == "acl":
        return "Noun Complement clause"
    return category


def _specific_clause_construction(
    spanroot,
    children,
    right_dep,
    span_t_dep_,
    argmentless,
    category,
    sconj_is_adverbial=False,
):
    """Refine a clausal label into a specific construction where one matches.

    Args:
        spanroot: Root used for classification (after skipping conjuncts).
        children: Children of the span's own root (before skipping conjuncts).
        right_dep: Dependency labels of the span root's right children.
        span_t_dep_: ``norm_dep`` strings for the tokens of the span.
        argmentless: True when the span root has no argument children.
        category: Current label, returned unchanged when nothing matches.
        sconj_is_adverbial: Enable the extra "SCONJ child" finite-adverbial
            rule used by ``construction_classifier2``.
    """
    c_dep = [c.dep_ for c in children]
    c_pos = [c.pos_ for c in children]
    c_norm = [c.norm_ for c in children]
    c_t_dep_ = _norm_dep_pairs(children)
    subtree = list(spanroot.subtree)

    # "to" heading an xcomp, and progressive xcomp heads, anywhere in the clause.
    check_to = [
        t.dep_
        for t in subtree
        if t.dep_ == "aux" and t.pos_ in ("PART", "SCONJ") and t.head.dep_ == "xcomp"
    ]
    check_ing = [
        t.dep_ for t in subtree if "Prog" in str(t.morph) and t.dep_ == "xcomp"
    ]
    check_for_to = [
        "_".join([t.norm_, t.dep_])
        for t in subtree
        if t.head.dep_ == "advcl" and t.dep_ in ("mark", "aux")
    ]
    ccomp_follows_root = any(
        c.i > spanroot.i for c in spanroot.children if c.dep_ == "ccomp"
    )

    has_it_subject = "it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_
    has_subject = "nsubj" in c_dep or "nsubjpass" in c_dep
    verbal = spanroot.pos_ in _VERBAL_POS
    is_be = spanroot.lemma_ == "be"

    # Dummy-subject "it" with a verbal root: extraposition and clefts.
    if has_it_subject and verbal:
        if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
            # e.g. it seems odd (oprd) that X / it is certain (acomp) that X.
            return "Extraposed that-cl (adj-complement)"
        if "xcomp" in c_dep or "advcl" in c_dep:
            if "for_mark" in check_for_to:
                return "Extraposed to-cl (explicit subj)"
            if check_to:
                return "Extraposed to-cl 1"
            if check_ing:
                return "Extraposed -ing 1"
            return category
        if (
            ("prep" in right_dep or "npadvmod" in right_dep)
            and "ccomp" in right_dep
            and is_be
        ):
            return "Cleft construction"
        if "attr" in c_dep:
            # e.g. It is a wonder that X.
            return "Extraposed that-cl (copula)"
        return "Extraposed that-cl (VERB)"

    if has_it_subject and "acomp" in c_dep:
        if "xcomp" in c_dep:
            if check_to:
                return "Extraposed to-cl 2"  # e.g. it is difficult to decide.
            if check_ing:
                return "Extraposed -ing 2"
            return category
        return "Extraposed that-cl (adj-complement) 2"

    if has_it_subject and "oprd" in c_dep:
        return "Extraposed that-cl (adj-complement) 3"  # e.g. it seems odd that X.

    # No dummy "it": copular or passive predicates.
    if (
        (("nsubj" in c_dep and is_be) or "nsubjpass" in c_dep)
        and verbal
        and "it" not in c_norm
    ):
        # xcomp / ccomp whose head is an adjectival complement.
        check_xcomp = [
            t.dep_ for t in subtree if t.dep_ == "xcomp" and t.head.dep_ == "acomp"
        ]
        check_ccomp = [
            t.dep_ for t in subtree if t.dep_ == "ccomp" and t.head.dep_ == "acomp"
        ]
        has_predicative = "attr" in c_dep or "acomp" in c_dep

        if has_predicative and "ccomp" in c_dep:
            return "Post-predicate that-cl" if ccomp_follows_root else "Comment clause"
        if has_predicative and check_ccomp:
            return "Post-predicate that-cl 2"
        if has_predicative and check_xcomp:
            return "Post-predicate to-cl"
        if "xcomp" in c_dep and is_be and check_to:
            return "Subject predicate to-cl"
        if "xcomp" in c_dep and "auxpass" in c_dep and check_to:
            return "Subject predicate to-cl (passive)"
        if "xcomp" in c_dep and is_be and check_ing:
            return "Subject predicate -ing"
        if "ccomp" in c_dep:
            return "Subject predicate that-cl"
        if "acomp" in c_dep:
            return "Adjectival predicate"
        if "mark" in c_dep and has_subject:
            return "Finite-adverbial clause"
        if sconj_is_adverbial and not argmentless and "SCONJ" in c_pos:
            return "Finite-adverbial clause"
        return "Main verb 1"

    # No dummy "it": lexical (non-"be") verbs with an overt subject.
    # The subject test is a plain boolean; testing that boolean for membership
    # in the list of dependency strings is always False and disables this rule.
    if has_subject and verbal and "it" not in c_norm and not is_be:
        check_wh = [
            t.dep_
            for t in subtree
            if t.dep_ in ("attr", "advmod", "dobj", "nsubj")
            and t.tag_ in ("WP", "WRB", "WDT", "WP$")
            and t.head.dep_ == "ccomp"
        ]
        check_if = [
            t.dep_
            for t in subtree
            if t.dep_ == "mark"
            and t.norm_ in ("whether", "if")
            and t.head.dep_ == "ccomp"
        ]
        if "ccomp" in c_dep and (check_wh or check_if):
            return "Post-predicate wh-cl"
        if "ccomp" in c_dep:
            return "Post-predicate that-cl" if ccomp_follows_root else "Comment clause"
        if "xcomp" in c_dep:
            if check_to:
                return "Post-predicate to-cl"
            if check_ing:
                return "Post-predicate -ing"
        return category

    # Existential
    if "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
        return "There is/are NOUN"
    if "ccomp" in c_dep and "it_nsubj" in span_t_dep_ and spanroot.pos_ == "AUX":
        return "Cleft construction"
    return category


def _verb_agreement_label(spanroot):
    """Label a verb by whether its morphology equals that of its head."""
    if spanroot.morph == spanroot.head.morph:
        return "Main verb 4"
    return "Dependent verb 2"


def _appositive_head_category(spanroot, category):
    """Classify an appositive by the part of speech of its root."""
    if spanroot.pos_ == "PROPN":
        return "Appositive Proper Nouns"
    if spanroot.pos_ == "NOUN":
        return "Appositive Noun Phrase"
    if spanroot.pos_ in _VERBAL_POS and not _lacks_dep(
        spanroot.children, _SUBJECT_DEPS
    ):
        return "Appositive Finite-clause"
    return category


def _citation_category(span_dep, span_tag, category):
    """Classify a CITATION span that contains at least one proper noun."""
    if "NNP" not in span_tag and "NNPS" not in span_tag:
        return category
    if span_dep[0] == "punct" and span_dep[-1] == "punct":
        return "Parenthetical Citation"
    if span_tag[0] in ("NNP", "NNPS"):
        return "Narrative Citation"
    return "Other Citation"


def construction_classifier(doc, span):
    """Label the grammatical construction realised by *span* (first version).

    Rules are applied in a fixed order and later rules overwrite earlier
    ones. When nothing matches, the dependency label of the (conjunct-
    resolved) span root is returned.

    Args:
        doc: Unused; kept for interface compatibility.
        span: Span whose tokens carry dependency, tag and morphology info.

    Returns:
        The construction label as a string.
    """
    category = None
    spanroot = span.root

    ## Grabbing lexico-grammatical information (from the span's own root)
    span_t_dep_ = _norm_dep_pairs(span)
    span_dep = [t.dep_ for t in span]
    span_tag = [t.tag_ for t in span]
    children = list(spanroot.children)
    c_t_dep_ = _norm_dep_pairs(children)
    c_dep = [c.dep_ for c in children]
    c_tag = [c.tag_ for c in children]
    right_dep = [c.dep_ for c in spanroot.rights]

    # conditionals
    subjless = _lacks_dep(children, _SUBJECT_DEPS)
    argmentless = _lacks_dep(children, _ARGUMENT_DEPS)

    ## nesting classifiers
    spanroot = _skip_conjuncts(spanroot)

    category = _phrase_level_category(
        spanroot,
        c_dep,
        category,
        ("advmod", "npadvmod", "nmod", "npmod", "quantmod"),
    )

    if spanroot.dep_ == "attr":
        subjless = _lacks_dep(spanroot.children, _SUBJECT_DEPS)
        category = _attr_category(spanroot, span_t_dep_, c_dep, subjless, category)

    ####################################
    ### clausal                      ####
    ####################################
    if spanroot.dep_ in ("ROOT", "advcl", "ccomp", "acl", "pcomp", "relcl"):
        ## Start with broad category, which is then re-evaluated for specific
        ## constructions.
        if spanroot.dep_ in ("advcl", "acl", "pcomp"):
            # Span.end is exclusive, so the last token of the span is end - 1.
            entire_cl = (
                spanroot.left_edge.i == span.start
                and spanroot.right_edge.i == span.end - 1
            )
            category = _adverbial_clause_category(
                spanroot,
                span_dep,
                span_tag,
                c_dep,
                c_tag,
                _lacks_dep(spanroot.children, _SUBJECT_DEPS),
                argmentless,
                entire_cl,
                category,
            )

        if spanroot.dep_ in ("relcl", "ccomp", "acl"):
            category = _embedded_clause_category(spanroot, span_dep, c_dep, category)

        ## Specific constructions
        category = _specific_clause_construction(
            spanroot, children, right_dep, span_t_dep_, argmentless, category
        )

    if spanroot.dep_ == "parataxis":
        if "_".join(span_dep) in _COMMENT_CLAUSE_PATTERNS:
            category = "Comment clause"
        else:
            category = "parataxis (for now)"

    ## External comp, participles, negation, auxiliaries, modals
    category = _complement_and_auxiliary_category(spanroot, c_t_dep_, c_dep, category)

    if spanroot.dep_ in ("dep", "csubj", "csubjpass"):
        head = spanroot.head
        morph = str(spanroot.morph)
        if (
            head.dep_ in ("ROOT", "ccomp")
            and head.pos_ in _VERBAL_POS
            and spanroot.pos_ in _VERBAL_POS
        ):
            category = _verb_agreement_label(spanroot)
        elif morph == _PRES_PARTICIPLE:
            category = "Gerund"
        elif head.dep_ in ("conj", "acl", "relcl"):
            category = _verb_agreement_label(spanroot)
        elif "VerbForm=Fin" in morph:
            category = "Dependent verb 2"

    # Appositive phrases
    if spanroot.dep_ == "appos":
        if "nummod" in c_dep:
            category = "Apposition"
        else:
            category = _appositive_head_category(spanroot, category)

    if spanroot.dep_ in ("appos", "dep", "attr"):
        if not subjless and spanroot.pos_ in _VERBAL_POS:
            category = "Main verb 5"

    if spanroot.dep_ in ("dep", "mark"):
        if spanroot.tag_ in ("RB", "IN", "CC"):
            category = "Conjunction"

    # sometimes the extra-clausal links are not accurate
    if spanroot.dep_ in ("aux", "auxpass", "oprd", "appos", "xcomp"):
        if spanroot.head.dep_ == "ROOT":
            category = "Main verb"
        else:
            category = "dependent verb 5"

    if span.label_ == "CITATION":
        category = _citation_category(span_dep, span_tag, category)

    if category is None:
        category = spanroot.dep_
    return category


def construction_classifier2(doc, span):
    """Label the grammatical construction realised by *span* (second version).

    Differs from ``construction_classifier`` in rule order (complement and
    auxiliary rules run before the clausal analysis), in a few extra rules
    (possessives, interjections, ``punct`` roots) and in only applying the
    "Main verb" / "dependent verb 5" fallback when nothing else matched.

    Args:
        doc: Unused; kept for interface compatibility.
        span: Span whose tokens carry dependency, tag and morphology info.

    Returns:
        The construction label as a string.
    """
    category = None
    spanroot = span.root

    ## Grabbing lexico-grammatical information (from the span's own root)
    span_t_dep_ = _norm_dep_pairs(span)
    span_dep = [t.dep_ for t in span]
    span_tag = [t.tag_ for t in span]
    children = list(spanroot.children)
    c_t_dep_ = _norm_dep_pairs(children)
    c_dep = [c.dep_ for c in children]
    c_tag = [c.tag_ for c in children]
    right_dep = [c.dep_ for c in spanroot.rights]

    # conditionals
    subjless = _lacks_dep(children, _SUBJECT_DEPS)
    argmentless = _lacks_dep(children, _ARGUMENT_DEPS)

    ## nesting classifiers
    spanroot = _skip_conjuncts(spanroot)

    if spanroot.dep_ == "poss":
        head_dep = spanroot.head.dep_
        if head_dep in _OBJECT_DEPS:
            category = "Posessive Noun (Object)"
        elif head_dep in ("nsubj", "nsubjpass"):
            category = "Posessive Noun (Subject)"
        else:
            category = "Posessive Noun (Other)"

    category = _phrase_level_category(
        spanroot,
        c_dep,
        category,
        ("advmod", "npadvmod", "nmod", "npmod", "quantmod", "nummod"),
    )

    if spanroot.dep_ == "attr":
        subjless = _lacks_dep(spanroot.children, _SUBJECT_DEPS)
        category = _attr_category(spanroot, span_t_dep_, c_dep, subjless, category)

    ## External comp, participles, negation, auxiliaries, modals
    category = _complement_and_auxiliary_category(spanroot, c_t_dep_, c_dep, category)

    ####################################
    ### clausal                      ####
    ####################################
    if spanroot.dep_ in ("ROOT", "advcl", "ccomp", "acl", "pcomp", "relcl", "punct"):
        ## Start with broad category, which is then re-evaluated for specific
        ## constructions.
        if spanroot.dep_ in ("advcl", "acl", "punct", "pcomp"):
            category = _adverbial_clause_category(
                spanroot,
                span_dep,
                span_tag,
                c_dep,
                c_tag,
                _lacks_dep(spanroot.children, _SUBJECT_DEPS),
                argmentless,
                not argmentless,
                category,
                # NOTE(review): trailing space kept as-is; it makes this label
                # distinct from "Finite adverbial clause" -- confirm intent.
                fallback="Finite adverbial clause ",
            )

        if spanroot.dep_ in ("relcl", "ccomp", "acl", "punct", "pcomp"):
            category = _embedded_clause_category(spanroot, span_dep, c_dep, category)

        ## Specific constructions
        category = _specific_clause_construction(
            spanroot,
            children,
            right_dep,
            span_t_dep_,
            argmentless,
            category,
            sconj_is_adverbial=True,
        )
    ### The end of clausal analysis

    if spanroot.dep_ == "parataxis":
        if "_".join(span_dep) in _COMMENT_CLAUSE_PATTERNS:
            category = "Comment clause"
        else:
            category = "Parataxis"

    if spanroot.dep_ in ("dep", "csubj", "csubjpass"):
        head = spanroot.head
        morph = str(spanroot.morph)
        if (
            head.dep_ in ("ROOT", "ccomp")
            and head.pos_ in _VERBAL_POS
            and spanroot.pos_ in _VERBAL_POS
        ):
            category = _verb_agreement_label(spanroot)
        elif morph == _PRES_PARTICIPLE:
            category = "Gerund"
        elif "VerbForm=Fin" in morph or "VerbForm=Inf" in morph:
            category = "Dependent verb 2"
        elif spanroot.dep_ in ("csubj", "csubjpass"):
            category = "Dependent verb (csubj)"

    # Appositive phrases
    if spanroot.dep_ == "appos":
        if "nummod" in c_dep:
            category = "Apposition"
        # The part-of-speech rules run even after the nummod rule and may
        # overwrite its label.
        category = _appositive_head_category(spanroot, category)

    if spanroot.dep_ in ("appos", "dep", "attr"):
        if not subjless and spanroot.pos_ in _VERBAL_POS:
            category = "Main verb (likely parsing error)"

    # sometimes the dep are on the conjunctions
    if spanroot.dep_ in ("dep", "mark"):
        if spanroot.tag_ in ("RB", "IN", "CC"):
            category = "Conjunction"

    if spanroot.dep_ == "intj":
        category = "Introjection"

    # sometimes the extra-clausal links are not accurate
    if category is None and spanroot.dep_ in (
        "aux",
        "auxpass",
        "oprd",
        "appos",
        "xcomp",
        "attr",
        "dep",
        "meta",
        "prt",
    ):
        if spanroot.head.dep_ == "ROOT":
            category = "Main verb"
        else:
            category = "dependent verb 5"

    if span.label_ == "CITATION":
        category = _citation_category(span_dep, span_tag, category)

    if category is None:
        category = spanroot.dep_
    return category


def const_table(
    doc: Union[spacy.tokens.Doc, Dict[str, str]],
    spans_key: str = "sc",
    attrs: List[str] = SPAN_ATTRS,
):
    """Build a detailed table of the spans stored under ``doc.spans[spans_key]``.

    Each row holds the requested span attributes, the score, the index of
    the containing sentence, the label from ``construction_classifier2`` and
    several dependency / tag / morphology descriptors of the span and its
    root.

    Args:
        doc: Document whose span group carries a ``"scores"`` entry in its
            ``attrs`` mapping, aligned with the spans.
        spans_key: Key of the span group to read.
        attrs: Span attribute names to emit first, one column each.

    Returns:
        ``(data, columns)``: the rows and the matching column names.
    """
    columns = attrs + [
        "Conf. score",
        "sent no.",
        "grammatical realization",
        "span dep",
        "ner",
        "POS",
        "span dep seq",
        "TAG sequence",
        "POS sequence",
        "head",
        "head dep",
        "children",
        "morphology",
        "sent",
    ]
    data = []
    # Map each sentence span to its position in the document.
    sentence_index = {sent: i for i, sent in enumerate(doc.sents)}
    spans = doc.spans[spans_key]

    for span, score in zip(spans, spans.attrs["scores"]):
        root = span.root
        row = [str(getattr(span, attr)) for attr in attrs]
        row += [
            score,
            int(sentence_index[span.sent]),
            construction_classifier2(doc, span),
            root.dep_,
            root.ent_type_,
            root.tag_,
            "_".join([t.dep_ for t in span]),
            "_".join([t.tag_ for t in span]),
            "_".join([t.pos_ for t in span]),
            root.head.norm_,
            root.head.dep_,
            "_".join([c.dep_ for c in root.children]),
            str(root.morph),
            span.sent.text.strip(),
        ]
        data.append(row)

    return data, columns


def ngrammar(seq: list, n=2, concat=False, sep="-"):
    """Return the contiguous n-grams of *seq*.

    Args:
        seq: Input sequence (items must be strings when *concat* is true).
        n: Window length.
        concat: Join each window with *sep* instead of returning slices.
        sep: Separator used when *concat* is true.

    Returns:
        A list of windows (slices of *seq*, or joined strings), in order.
        Empty when *seq* is shorter than *n*.
    """
    n_item = len(seq)
    windows = [seq[idx : idx + n] for idx in range(n_item) if idx + n <= n_item]
    if concat:
        return [sep.join(window) for window in windows]
    return windows


def diversity_values(count_vec: list):
    """Compute alpha-diversity indices for a vector of category counts.

    An empty vector is replaced by ten zero counts before the indices are
    computed.

    Args:
        count_vec: Counts per category.

    Returns:
        Dict with keys ``"shannon"`` (base 2), ``"brillouin_d"``,
        ``"simpson_d"`` (``1 - dv.alpha.simpson(counts)``) and ``"simpson_e"``.
    """
    counts = list(count_vec)
    if len(counts) == 0:
        counts = [0] * 10

    return {
        "shannon": dv.alpha.shannon(counts, base=2),
        "brillouin_d": dv.alpha.brillouin_d(counts),
        "simpson_d": 1 - dv.alpha.simpson(counts),
        "simpson_e": dv.alpha.simpson_e(counts),
    }