Spaces:
Runtime error
Runtime error
| from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable | |
| import pandas as pd | |
| import spacy | |
| from spacy.language import Language | |
| from skbio import diversity as dv | |
# Span attributes read (via getattr) when a span table is built; the order
# here is the column order of that table.
SPAN_ATTRS = [
    "text",
    "label_",
    "start",
    "end",
]

# Span-category labels, listed alphabetically.
CATEGORIES = [
    "ATTRIBUTION", "CITATION", "COUNTER", "DENY", "ENDOPHORIC",
    "ENTERTAIN", "JUSTIFYING", "MONOGLOSS", "PROCLAIM", "SOURCES",
]
def simple_table(
    doc: Union[spacy.tokens.Doc, Dict[str, str]],
    spans_key: str = "sc",
    attrs: List[str] = SPAN_ATTRS,
):
    """Build the rows and column names of a table describing a span group.

    Args:
        doc: Object whose ``spans[spans_key]`` holds the spans to tabulate.
            The span group must expose ``attrs["scores"]``, one score per span.
        spans_key: Key of the span group inside ``doc.spans``.
        attrs: Names of the span attributes to report, one column each.

    Returns:
        A ``(data, columns)`` tuple. ``data`` has one row per span: the
        stringified attribute values followed by the raw score. ``columns``
        is ``attrs`` plus a trailing ``"Conf. score"`` header.
    """
    header = attrs + ["Conf. score"]

    span_group = doc.spans[spans_key]
    confidences = doc.spans[spans_key].attrs["scores"]

    rows = []
    for span, score in zip(span_group, confidences):
        row = [str(getattr(span, name)) for name in attrs]
        row.append(score)  # the score is kept unformatted
        rows.append(row)
    return rows, header
| # def span_info_aggregator() | |
def construction_classifier(doc, span):
    """Return a label naming the grammatical construction realised by ``span``.

    The label is derived from the dependency parse around ``span.root``: its
    dependency label, part of speech, tag and morphology, plus those of its
    children, subtree and head. Token attributes are read duck-typed
    (``dep_``, ``pos_``, ``tag_``, ``norm_``, ``lemma_``, ``morph``, ``head``,
    ``children``, ``rights``, ``subtree``, ``left_edge``, ``right_edge``, ``i``);
    they are presumably spaCy ``Token``/``Span`` objects.

    Later rules deliberately overwrite earlier ones: a broad category is
    assigned first and then refined by more specific constructions.

    Fixes relative to the previous revision:
      * the stray debug ``print`` is gone;
      * the "lexical verb with a subject" branch is reachable (its condition
        used to test a bool for membership in a list of strings);
      * ``entire_cl`` compares the clause's right edge with the last token of
        the span (``span.end`` is exclusive);
      * the final "inaccurate link" fallback no longer overwrites a category
        that was already assigned, matching ``construction_classifier2``.

    Args:
        doc: Unused; kept for interface compatibility.
        span: The span to classify.

    Returns:
        The category string, or the dependency label of the (conjunct-resolved)
        span root when no rule matched.
    """
    subject_deps = ("nsubj", "nsubjpass", "csubj", "csubjpass")
    argument_deps = subject_deps + (
        "dobj", "ccomp", "xcomp", "dative", "attr", "oprd", "acomp",
    )
    verbal_pos = ("VERB", "AUX")
    prog_participle = "Aspect=Prog|Tense=Pres|VerbForm=Part"
    participle_morphs = (prog_participle, "Aspect=Perf|Tense=Past|VerbForm=Part")

    category = None
    spanroot = span.root

    ## Lexico-grammatical information about the span itself
    span_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in span]
    span_dep = [t.dep_ for t in span]
    span_tag = [t.tag_ for t in span]

    ## ... and about the children of the *original* span root. These lists are
    ## intentionally not recomputed after conjunct resolution below.
    children = list(spanroot.children)
    c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in children]
    c_norm = [t.norm_ for t in children]
    c_dep = [t.dep_ for t in children]
    c_pos = [t.pos_ for t in children]
    c_tag = [t.tag_ for t in children]
    right_dep = [t.dep_ for t in spanroot.rights]

    subjless = all(t.dep_ not in subject_deps for t in children)
    argmentless = all(t.dep_ not in argument_deps for t in children)

    ## Coordinated elements are analysed through the first conjunct.
    while spanroot.dep_ == "conj":
        spanroot = spanroot.head

    # spanroot is fixed from here on, so its attributes can be cached.
    dep = spanroot.dep_
    pos = spanroot.pos_
    tag = spanroot.tag_
    morph = str(spanroot.morph)
    is_verbal = pos in verbal_pos

    ## Conjunctions
    if dep in ("preconj", "cc"):
        category = "Conjunction"

    ## Noun phrases
    if dep == "amod":
        category = "Adjectival modifier"
    if dep == "compound":
        category = "Compound noun"
    if dep in ("pobj", "dobj", "obj", "iobj", "dative"):
        category = "Noun + Complement (Object)" if "acl" in c_dep else "Object"
    if dep in ("nsubj", "nsubjpass"):
        category = "Noun + Complement (Subject)" if "acl" in c_dep else "Subject"

    ## Adjuncts
    if dep in ("prep", "agent"):
        category = "Prepositional phrase"
    if dep in ("advmod", "npadvmod", "nmod", "npmod", "quantmod"):
        category = "Adverbial phrase"

    ## Predication patterns
    if dep in ("acomp", "oprd"):
        category = (
            "Subject predicate to-cl" if "xcomp" in c_dep else "Adjectival complement"
        )

    if dep == "attr":
        # Recomputed on the resolved root; also read by the appos/dep/attr
        # rule further down.
        subjless = all(t.dep_ not in subject_deps for t in spanroot.children)
        head_child_deps = [t.dep_ for t in spanroot.head.children]
        existential = "expl" in head_child_deps
        if existential and "no_det" in span_t_dep_:
            category = "There is/are no NOUN"
        elif existential and (pos == "NOUN" or tag in ("NN", "NNS")):
            category = "There is/are + Noun complement"
        elif pos in ("NOUN", "PRON"):
            category = (
                "Noun + Complement (attr)" if "acl" in c_dep else "Nominal complement"
            )
        elif not subjless and is_verbal:
            category = "Main verb 4"
        elif tag == "NNP":
            category = "Nominal complement"

    ####################################
    ###          clausal            ####
    ####################################
    if dep in ("ROOT", "advcl", "ccomp", "acl", "pcomp", "relcl"):
        subtree = list(spanroot.subtree)
        # infinitival "to" attached to an xcomp
        _check_to = [
            t.dep_
            for t in subtree
            if t.dep_ == "aux" and t.pos_ in ("PART", "SCONJ") and t.head.dep_ == "xcomp"
        ]
        # progressive (-ing) xcomp
        _check_ing = [
            t.dep_ for t in subtree if "Prog" in str(t.morph) and t.dep_ == "xcomp"
        ]
        # one flag per ccomp child: does it follow the root?
        root_before_ccomp = [
            t.i > spanroot.i for t in spanroot.children if t.dep_ == "ccomp"
        ]
        # markers/auxiliaries of an advcl, e.g. "for_mark", "to_aux"
        _check_for_to = [
            "_".join([t.norm_, t.dep_])
            for t in subtree
            if t.head.dep_ == "advcl" and t.dep_ in ("mark", "aux")
        ]
        # span.end is exclusive, so the last covered token is span.end - 1.
        entire_cl = (
            spanroot.left_edge.i == span.start
            and spanroot.right_edge.i == span.end - 1
        )

        ## Broad category first; re-evaluated below for specific constructions.
        if dep in ("advcl", "acl", "pcomp"):
            subjless = all(t.dep_ not in subject_deps for t in spanroot.children)
            has_mark = "mark" in span_dep
            if has_mark and (is_verbal or "aux" in span_dep):
                category = "Finite adverbial clause"
            elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
                if is_verbal:
                    category = "Finite adverbial clause"
                elif subjless:
                    category = "Non-finite adv clause 1"
                elif entire_cl:
                    category = "Finite adverbial clause"
            elif morph in participle_morphs and "aux" not in c_dep:
                # argument-less participles: "frankly speaking", "strictly speaking"
                category = (
                    "Adverbial Phrase" if argmentless else "Non-finite adv clause 2"
                )
            elif not is_verbal and has_mark and subjless:
                category = "Non-finite adv clause 3"
            elif "aux" in c_dep and "TO" in c_tag:
                category = "Adverbial Phrase"
            elif not has_mark and is_verbal:
                category = "Dependent Verb phrase"
            elif not argmentless:
                category = "Adverbial clause"
            elif dep == "advcl":
                category = "Adverbial phrase"

        if dep in ("relcl", "ccomp", "acl"):
            if ";" in [t.norm_ for t in spanroot.head.children]:
                category = "Main verb 3"
            elif "nsubj" not in span_dep:
                category = "Dependent verb 1"
            elif "mark" in span_dep:
                category = "Complement clause"
            elif morph in participle_morphs and "aux" not in c_dep:
                category = "Non-finite complement clause"
            elif dep == "relcl":
                category = "Relative clause"
            elif dep == "ccomp":
                category = "Complement clause"
            else:  # dep == "acl"
                category = "Noun Complement clause"

        ## Specific constructions
        dummy_it = "it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_
        has_subject = "nsubj" in c_dep or "nsubjpass" in c_dep

        # Extraposed that-clauses / to-infinitives with dummy "it"
        if dummy_it and is_verbal:
            if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
                # it seems odd (oprd) that X / it is certain (acomp) that X
                category = "Extraposed that-cl (adj-complement)"
            elif "xcomp" in c_dep or "advcl" in c_dep:
                if "for_mark" in _check_for_to:
                    category = "Extraposed to-cl (explicit subj)"
                elif _check_to:
                    category = "Extraposed to-cl 1"  # it is possible to ...
                elif _check_ing:
                    category = "Extraposed -ing 1"
            elif (
                ("prep" in right_dep or "npadvmod" in right_dep)
                and "ccomp" in right_dep
                and spanroot.lemma_ == "be"
            ):
                category = "Cleft construction"
            elif "attr" in c_dep:
                category = "Extraposed that-cl (copula)"  # it is a wonder that X
            else:
                category = "Extraposed that-cl (VERB)"
        elif dummy_it and "acomp" in c_dep:
            if "xcomp" in c_dep:
                if _check_to:
                    category = "Extraposed to-cl 2"  # it is difficult to decide
                elif _check_ing:
                    category = "Extraposed -ing 2"
            else:
                category = "Extraposed that-cl (adj-complement) 2"
        elif dummy_it and "oprd" in c_dep:
            category = "Extraposed that-cl (adj-complement) 3"  # it seems odd that X
        # copular "be" or passive verb, no dummy "it"
        elif (
            (("nsubj" in c_dep and spanroot.lemma_ == "be") or "nsubjpass" in c_dep)
            and is_verbal
            and "it" not in c_norm
        ):
            # xcomp / ccomp governed by an adjectival complement
            _check_xcomp = [
                t.dep_
                for t in subtree
                if t.dep_ == "xcomp" and t.head.dep_ == "acomp"
            ]
            _check_ccomp = [
                t.dep_
                for t in subtree
                if t.dep_ == "ccomp" and t.head.dep_ == "acomp"
            ]
            predicative = "attr" in c_dep or "acomp" in c_dep
            if predicative and "ccomp" in c_dep:
                category = (
                    "Post-predicate that-cl"
                    if any(root_before_ccomp)
                    else "Comment clause"
                )
            elif predicative and "ccomp" in _check_ccomp:
                category = "Post-predicate that-cl 2"
            elif predicative and "xcomp" in _check_xcomp:
                category = "Post-predicate to-cl"
            elif "xcomp" in c_dep and spanroot.lemma_ == "be" and _check_to:
                category = "Subject predicate to-cl"
            elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
                category = "Subject predicate to-cl (passive)"
            elif "xcomp" in c_dep and spanroot.lemma_ == "be" and _check_ing:
                category = "Subject predicate -ing"
            elif "ccomp" in c_dep:
                category = "Subject predicate that-cl"
            elif "acomp" in c_dep:
                category = "Adjectival predicate"
            elif "mark" in c_dep and has_subject:
                category = "Finite-adverbial clause"
            else:
                category = "Main verb 1"
        # lexical (non-"be") verb with a subject, no dummy "it"
        elif (
            has_subject
            and is_verbal
            and "it" not in c_norm
            and spanroot.lemma_ != "be"
        ):
            _check_wh = [
                t.dep_
                for t in subtree
                if t.dep_ in ("attr", "advmod", "dobj", "nsubj")
                and t.tag_ in ("WP", "WRB", "WDT", "WP$")
                and t.head.dep_ == "ccomp"
            ]
            _check_if = [
                t.dep_
                for t in subtree
                if t.dep_ == "mark"
                and t.norm_ in ("whether", "if")
                and t.head.dep_ == "ccomp"
            ]
            if "ccomp" in c_dep and (_check_wh or _check_if):
                category = "Post-predicate wh-cl"
            elif "ccomp" in c_dep:
                category = (
                    "Post-predicate that-cl"
                    if any(root_before_ccomp)
                    else "Comment clause"
                )
            elif "xcomp" in c_dep:
                if _check_to:
                    category = "Post-predicate to-cl"
                elif _check_ing:
                    category = "Post-predicate -ing"
        # Existential
        elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
            category = "There is/are NOUN"
        elif "ccomp" in c_dep and "it_nsubj" in span_t_dep_ and pos == "AUX":
            category = "Cleft construction"

    if dep == "parataxis":
        if "_".join(span_dep) in (
            "nsubj_parataxis",
            "aux_parataxis",
            "nsubj_aux_parataxis",
        ):
            category = "Comment clause"
        else:
            category = "parataxis (for now)"

    ## External complements
    if dep == "xcomp":
        if spanroot.head.pos_ == "ADJ" and "to_aux" in c_t_dep_:
            category = "Adjective complement to-cl"
        if spanroot.head.pos_ == "VERB" and "to_aux" in c_t_dep_:
            category = "Verb complement to-cl"

    if dep == "pcomp" and morph == prog_participle:
        category = "Participle + that-cl" if "ccomp" in c_dep else "Participle"

    ## Simple classifiers
    if dep == "neg":
        category = "Negative particle"
    if dep in ("aux", "auxpass"):
        category = "Auxiliary"
    if tag == "MD":
        category = "Modal auxiliary"

    if dep in ("dep", "csubj", "csubjpass"):
        head = spanroot.head
        if head.dep_ in ("ROOT", "ccomp") and head.pos_ in verbal_pos and is_verbal:
            # identical morphology to the head is taken as a sign of a main verb
            category = (
                "Main verb 4" if spanroot.morph == head.morph else "Dependent verb 2"
            )
        elif morph == prog_participle:
            category = "Gerund"
        elif head.dep_ in ("conj", "acl", "relcl"):
            category = (
                "Main verb 4" if spanroot.morph == head.morph else "Dependent verb 2"
            )
        elif "VerbForm=Fin" in morph:
            category = "Dependent verb 2"

    # Appositive phrases
    if dep == "appos":
        if "nummod" in c_dep:
            category = "Apposition"
        elif pos == "PROPN":
            category = "Appositive Proper Nouns"
        elif pos == "NOUN":
            category = "Appositive Noun Phrase"
        elif is_verbal:
            if any(t.dep_ in subject_deps for t in spanroot.children):
                category = "Appositive Finite-clause"

    if dep in ("appos", "dep", "attr") and not subjless and is_verbal:
        category = "Main verb 5"

    if dep in ("dep", "mark") and tag in ("RB", "IN", "CC"):
        category = "Conjunction"

    # Sometimes the extra-clausal links are not accurate. This is a fallback
    # only: it must not clobber a category assigned above.
    if dep in ("aux", "auxpass", "oprd", "appos", "xcomp") and category is None:
        category = "Main verb" if spanroot.head.dep_ == "ROOT" else "dependent verb 5"

    if span.label_ == "CITATION":
        if "NNP" in span_tag or "NNPS" in span_tag:
            if span_dep[0] == "punct" and span_dep[-1] == "punct":
                category = "Parenthetical Citation"
            elif span_tag[0] in ("NNP", "NNPS"):
                category = "Narrative Citation"
            else:
                category = "Other Citation"

    if category is None:
        category = dep
    return category
def construction_classifier2(doc, span):
    """Return a label naming the grammatical construction realised by ``span``.

    Revised variant of ``construction_classifier``: possessives, interjections
    and ``csubj`` get their own labels, the simple (non-clausal) rules run
    before the clausal analysis, and the final "inaccurate link" fallback only
    applies when nothing else matched.

    Token attributes are read duck-typed (``dep_``, ``pos_``, ``tag_``,
    ``norm_``, ``lemma_``, ``morph``, ``head``, ``children``, ``rights``,
    ``subtree``, ``i``); they are presumably spaCy ``Token``/``Span`` objects.
    Later rules deliberately overwrite earlier ones.

    Fix relative to the previous revision: the "lexical verb with a subject"
    branch is reachable. Its condition used to test a bool for membership in a
    list of strings, which is always False.

    NOTE(review): the label spellings "Posessive", "Introjection" and the
    trailing space in "Finite adverbial clause " are kept verbatim because
    callers may match on them.

    Args:
        doc: Unused; kept for interface compatibility.
        span: The span to classify.

    Returns:
        The category string, or the dependency label of the (conjunct-resolved)
        span root when no rule matched.
    """
    subject_deps = ("nsubj", "nsubjpass", "csubj", "csubjpass")
    argument_deps = subject_deps + (
        "dobj", "ccomp", "xcomp", "dative", "attr", "oprd", "acomp",
    )
    object_deps = ("pobj", "dobj", "obj", "iobj", "dative")
    verbal_pos = ("VERB", "AUX")
    prog_participle = "Aspect=Prog|Tense=Pres|VerbForm=Part"
    participle_morphs = (prog_participle, "Aspect=Perf|Tense=Past|VerbForm=Part")

    category = None
    spanroot = span.root

    ## Lexico-grammatical information about the span itself
    span_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in span]
    span_dep = [t.dep_ for t in span]
    span_tag = [t.tag_ for t in span]

    ## ... and about the children of the *original* span root. These lists are
    ## intentionally not recomputed after conjunct resolution below.
    children = list(spanroot.children)
    c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in children]
    c_norm = [t.norm_ for t in children]
    c_dep = [t.dep_ for t in children]
    c_pos = [t.pos_ for t in children]
    c_tag = [t.tag_ for t in children]
    right_dep = [t.dep_ for t in spanroot.rights]

    subjless = all(t.dep_ not in subject_deps for t in children)
    argmentless = all(t.dep_ not in argument_deps for t in children)

    ## Coordinated elements are analysed through the first conjunct.
    while spanroot.dep_ == "conj":
        spanroot = spanroot.head

    # spanroot is fixed from here on, so its attributes can be cached.
    dep = spanroot.dep_
    pos = spanroot.pos_
    tag = spanroot.tag_
    morph = str(spanroot.morph)
    is_verbal = pos in verbal_pos

    ## Possessives are labelled by the function of the noun they modify.
    if dep == "poss":
        possessed_dep = spanroot.head.dep_
        if possessed_dep in object_deps:
            category = "Posessive Noun (Object)"
        elif possessed_dep in ("nsubj", "nsubjpass"):
            category = "Posessive Noun (Subject)"
        else:
            category = "Posessive Noun (Other)"

    ## Conjunctions
    if dep in ("preconj", "cc"):
        category = "Conjunction"

    ## Noun phrases
    if dep == "amod":
        category = "Adjectival modifier"
    if dep == "compound":
        category = "Compound noun"
    if dep in object_deps:
        category = "Noun + Complement (Object)" if "acl" in c_dep else "Object"
    if dep in ("nsubj", "nsubjpass"):
        category = "Noun + Complement (Subject)" if "acl" in c_dep else "Subject"

    ## Adjuncts
    if dep in ("prep", "agent"):
        category = "Prepositional phrase"
    if dep in ("advmod", "npadvmod", "nmod", "npmod", "quantmod", "nummod"):
        category = "Adverbial phrase"

    ## Predication patterns
    if dep in ("acomp", "oprd"):
        category = (
            "Subject predicate to-cl" if "xcomp" in c_dep else "Adjectival complement"
        )

    if dep == "attr":
        # Recomputed on the resolved root; also read by the appos/dep/attr
        # rule further down.
        subjless = all(t.dep_ not in subject_deps for t in spanroot.children)
        head_child_deps = [t.dep_ for t in spanroot.head.children]
        existential = "expl" in head_child_deps
        if existential and "no_det" in span_t_dep_:
            category = "There is/are no NOUN"
        elif existential and (pos == "NOUN" or tag in ("NN", "NNS")):
            category = "There is/are + Noun complement"
        elif pos in ("NOUN", "PRON"):
            category = (
                "Noun + Complement (attr)" if "acl" in c_dep else "Nominal complement"
            )
        elif not subjless and is_verbal:
            category = "Main verb 4"
        elif tag == "NNP":
            category = "Nominal complement"

    ## External complements
    if dep == "xcomp":
        if spanroot.head.pos_ == "ADJ" and "to_aux" in c_t_dep_:
            category = "Adjective complement to-cl"
        if spanroot.head.pos_ == "VERB" and "to_aux" in c_t_dep_:
            category = "Verb complement to-cl"

    if dep == "pcomp" and morph == prog_participle:
        category = "Participle + that-cl" if "ccomp" in c_dep else "Participle"

    ## Simple classifiers
    if dep == "neg":
        category = "Negative particle"
    if dep in ("aux", "auxpass"):
        category = "Auxiliary"
    if tag == "MD":
        category = "Modal auxiliary"

    ####################################
    ###          clausal            ####
    ####################################
    if dep in ("ROOT", "advcl", "ccomp", "acl", "pcomp", "relcl", "punct"):
        subtree = list(spanroot.subtree)
        # infinitival "to" attached to an xcomp
        _check_to = [
            t.dep_
            for t in subtree
            if t.dep_ == "aux" and t.pos_ in ("PART", "SCONJ") and t.head.dep_ == "xcomp"
        ]
        # progressive (-ing) xcomp
        _check_ing = [
            t.dep_ for t in subtree if "Prog" in str(t.morph) and t.dep_ == "xcomp"
        ]
        # one flag per ccomp child: does it follow the root?
        root_before_ccomp = [
            t.i > spanroot.i for t in spanroot.children if t.dep_ == "ccomp"
        ]
        # markers/auxiliaries of an advcl, e.g. "for_mark", "to_aux"
        _check_for_to = [
            "_".join([t.norm_, t.dep_])
            for t in subtree
            if t.head.dep_ == "advcl" and t.dep_ in ("mark", "aux")
        ]

        ## Broad category first; re-evaluated below for specific constructions.
        if dep in ("advcl", "acl", "punct", "pcomp"):
            subjless = all(t.dep_ not in subject_deps for t in spanroot.children)
            has_mark = "mark" in span_dep
            ### finite adverbial clauses
            if has_mark and (is_verbal or "aux" in span_dep):
                category = "Finite adverbial clause"
            elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
                if is_verbal:
                    category = "Finite adverbial clause"
                elif subjless:
                    category = "Non-finite adv clause 1"
                elif not argmentless:
                    category = "Finite adverbial clause"
            ### non-finite
            elif morph in participle_morphs and "aux" not in c_dep:
                # argument-less participles: "frankly speaking", "strictly speaking"
                category = (
                    "Adverbial Phrase" if argmentless else "Non-finite adv clause 2"
                )
            elif not is_verbal and has_mark and subjless:
                category = "Non-finite adv clause 3"
            elif "aux" in c_dep and "TO" in c_tag:
                category = "Adverbial Phrase"
            elif not has_mark and is_verbal:
                category = "Dependent Verb phrase"
            elif not argmentless:
                category = "Adverbial clause"
            elif dep == "advcl":
                category = "Adverbial phrase"
            else:
                category = "Finite adverbial clause "

        if dep in ("relcl", "ccomp", "acl", "punct", "pcomp"):
            if ";" in [t.norm_ for t in spanroot.head.children]:
                category = "Main verb 3"
            elif "nsubj" not in span_dep:
                category = "Dependent verb 1"
            elif "mark" in span_dep:
                category = "Complement clause"
            elif morph in participle_morphs and "aux" not in c_dep:
                category = "Non-finite complement clause"
            elif dep == "relcl":
                category = "Relative clause"
            elif dep == "ccomp":
                category = "Complement clause"
            elif dep == "acl":
                category = "Noun Complement clause"

        ## Specific constructions
        dummy_it = "it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_
        has_subject = "nsubj" in c_dep or "nsubjpass" in c_dep

        # Extraposed that-clauses / to-infinitives with dummy "it"
        if dummy_it and is_verbal:
            if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
                # it seems odd (oprd) that X / it is certain (acomp) that X
                category = "Extraposed that-cl (adj-complement)"
            elif "xcomp" in c_dep or "advcl" in c_dep:
                if "for_mark" in _check_for_to:
                    category = "Extraposed to-cl (explicit subj)"
                elif _check_to:
                    category = "Extraposed to-cl 1"  # it is possible to ...
                elif _check_ing:
                    category = "Extraposed -ing 1"
            elif (
                ("prep" in right_dep or "npadvmod" in right_dep)
                and "ccomp" in right_dep
                and spanroot.lemma_ == "be"
            ):
                category = "Cleft construction"
            elif "attr" in c_dep:
                category = "Extraposed that-cl (copula)"  # it is a wonder that X
            else:
                category = "Extraposed that-cl (VERB)"
        elif dummy_it and "acomp" in c_dep:
            if "xcomp" in c_dep:
                if _check_to:
                    category = "Extraposed to-cl 2"  # it is difficult to decide
                elif _check_ing:
                    category = "Extraposed -ing 2"
            else:
                category = "Extraposed that-cl (adj-complement) 2"
        elif dummy_it and "oprd" in c_dep:
            category = "Extraposed that-cl (adj-complement) 3"  # it seems odd that X
        # copular "be" or passive verb, no dummy "it"
        elif (
            (("nsubj" in c_dep and spanroot.lemma_ == "be") or "nsubjpass" in c_dep)
            and is_verbal
            and "it" not in c_norm
        ):
            # xcomp / ccomp governed by an adjectival complement
            _check_xcomp = [
                t.dep_
                for t in subtree
                if t.dep_ == "xcomp" and t.head.dep_ == "acomp"
            ]
            _check_ccomp = [
                t.dep_
                for t in subtree
                if t.dep_ == "ccomp" and t.head.dep_ == "acomp"
            ]
            predicative = "attr" in c_dep or "acomp" in c_dep
            if predicative and "ccomp" in c_dep:
                category = (
                    "Post-predicate that-cl"
                    if any(root_before_ccomp)
                    else "Comment clause"
                )
            elif predicative and "ccomp" in _check_ccomp:
                category = "Post-predicate that-cl 2"
            elif predicative and "xcomp" in _check_xcomp:
                category = "Post-predicate to-cl"
            elif "xcomp" in c_dep and spanroot.lemma_ == "be" and _check_to:
                category = "Subject predicate to-cl"
            elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
                category = "Subject predicate to-cl (passive)"
            elif "xcomp" in c_dep and spanroot.lemma_ == "be" and _check_ing:
                category = "Subject predicate -ing"
            elif "ccomp" in c_dep:
                category = "Subject predicate that-cl"
            elif "acomp" in c_dep:
                category = "Adjectival predicate"
            elif "mark" in c_dep and has_subject:
                category = "Finite-adverbial clause"
            elif not argmentless and "SCONJ" in c_pos:
                category = "Finite-adverbial clause"
            else:
                category = "Main verb 1"
        # lexical (non-"be") verb with a subject, no dummy "it"
        elif (
            has_subject
            and is_verbal
            and "it" not in c_norm
            and spanroot.lemma_ != "be"
        ):
            _check_wh = [
                t.dep_
                for t in subtree
                if t.dep_ in ("attr", "advmod", "dobj", "nsubj")
                and t.tag_ in ("WP", "WRB", "WDT", "WP$")
                and t.head.dep_ == "ccomp"
            ]
            _check_if = [
                t.dep_
                for t in subtree
                if t.dep_ == "mark"
                and t.norm_ in ("whether", "if")
                and t.head.dep_ == "ccomp"
            ]
            if "ccomp" in c_dep and (_check_wh or _check_if):
                category = "Post-predicate wh-cl"
            elif "ccomp" in c_dep:
                category = (
                    "Post-predicate that-cl"
                    if any(root_before_ccomp)
                    else "Comment clause"
                )
            elif "xcomp" in c_dep:
                if _check_to:
                    category = "Post-predicate to-cl"
                elif _check_ing:
                    category = "Post-predicate -ing"
        # Existential
        elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
            category = "There is/are NOUN"
        elif "ccomp" in c_dep and "it_nsubj" in span_t_dep_ and pos == "AUX":
            category = "Cleft construction"

    ### End of clausal analysis

    if dep == "parataxis":
        if "_".join(span_dep) in (
            "nsubj_parataxis",
            "aux_parataxis",
            "nsubj_aux_parataxis",
        ):
            category = "Comment clause"
        else:
            category = "Parataxis"

    if dep in ("dep", "csubj", "csubjpass"):
        head = spanroot.head
        if head.dep_ in ("ROOT", "ccomp") and head.pos_ in verbal_pos and is_verbal:
            # identical morphology to the head is taken as a sign of a main verb
            category = (
                "Main verb 4" if spanroot.morph == head.morph else "Dependent verb 2"
            )
        elif morph == prog_participle:
            category = "Gerund"
        elif "VerbForm=Fin" in morph or "VerbForm=Inf" in morph:
            category = "Dependent verb 2"
        elif dep in ("csubj", "csubjpass"):
            category = "Dependent verb (csubj)"

    # Appositive phrases
    if dep == "appos":
        if "nummod" in c_dep:
            category = "Apposition"
        # The part-of-speech rules below take precedence over the nummod rule.
        if pos == "PROPN":
            category = "Appositive Proper Nouns"
        elif pos == "NOUN":
            category = "Appositive Noun Phrase"
        elif is_verbal:
            if any(t.dep_ in subject_deps for t in spanroot.children):
                category = "Appositive Finite-clause"

    if dep in ("appos", "dep", "attr") and not subjless and is_verbal:
        category = "Main verb (likely parsing error)"

    # Sometimes the dep/mark label lands on a conjunction.
    if dep in ("dep", "mark") and tag in ("RB", "IN", "CC"):
        category = "Conjunction"

    if dep == "intj":
        category = "Introjection"

    # Sometimes the extra-clausal links are not accurate; fallback only.
    if (
        dep in ("aux", "auxpass", "oprd", "appos", "xcomp", "attr", "dep", "meta", "prt")
        and category is None
    ):
        category = "Main verb" if spanroot.head.dep_ == "ROOT" else "dependent verb 5"

    if span.label_ == "CITATION":
        if "NNP" in span_tag or "NNPS" in span_tag:
            if span_dep[0] == "punct" and span_dep[-1] == "punct":
                category = "Parenthetical Citation"
            elif span_tag[0] in ("NNP", "NNPS"):
                category = "Narrative Citation"
            else:
                category = "Other Citation"

    if category is None:
        category = dep
    return category
def const_table(
    doc: Union[spacy.tokens.Doc, Dict[str, str]],
    spans_key: str = "sc",
    attrs: List[str] = SPAN_ATTRS,
):
    """Build a table describing every span stored under ``doc.spans[spans_key]``.

    Each row starts with the stringified span attributes named in ``attrs``,
    followed by the span's score, the index of its sentence, the label
    returned by ``construction_classifier2`` and several dependency, tag and
    morphology features read from the span and its root token.

    Args:
        doc: Parsed document. The body reads ``doc.sents`` and ``doc.spans``,
            so although the annotation also admits a dict, a spaCy ``Doc``
            appears to be required -- TODO confirm with callers.
        spans_key: Key of the span group to tabulate.
        attrs: Span attribute names emitted (as strings) in the leading
            columns.

    Returns:
        A ``(data, columns)`` tuple: ``data`` is a list of rows (lists) and
        ``columns`` the matching list of column headers.
    """
    feature_headers = [
        "Conf. score",
        "sent no.",
        "grammatical realization",
        "span dep",
        "ner",
        "POS",
        "span dep seq",
        "TAG sequence",
        "POS sequence",
        "head",
        "head dep",
        "children",
        "morphology",
        "sent",
    ]
    columns = attrs + feature_headers

    # Position of every sentence in the document, keyed by the sentence span.
    sent_index = {sent: pos for pos, sent in enumerate(doc.sents)}

    group = doc.spans[spans_key]
    data = []
    # Scores are stored on the span group, parallel to its spans.
    for span, score in zip(group, group.attrs["scores"]):
        row = [str(getattr(span, name)) for name in attrs]
        sent = span.sent
        root = span.root
        row += [
            score,
            int(sent_index[sent]),
            construction_classifier2(doc, span),
            root.dep_,
            root.ent_type_,
            root.tag_,
            "_".join([tok.dep_ for tok in span]),
            "_".join([tok.tag_ for tok in span]),
            "_".join([tok.pos_ for tok in span]),
            root.head.norm_,
            root.head.dep_,
            "_".join([child.dep_ for child in root.children]),
            str(root.morph),
            sent.text.strip(),
        ]
        data.append(row)
    return data, columns
def ngrammar(seq: list, n=2, concat=False, sep="-"):
    """Return the contiguous n-grams of ``seq`` in order of appearance.

    Args:
        seq: Sliceable sequence to window over.
        n: Window length.
        concat: When true, each window is joined into one string with ``sep``;
            otherwise each window is returned as a slice of ``seq``.
        sep: Separator used when ``concat`` is true.

    Returns:
        A list with one entry per start index ``i`` for which
        ``i + n <= len(seq)``.
    """
    total = len(seq)
    # Keep only the start positions whose window fits inside the sequence.
    windows = (
        seq[start : start + n] for start in range(total) if start + n <= total
    )
    if concat:
        return [sep.join(window) for window in windows]
    return list(windows)
def diversity_values(count_vec: list):
    """Compute a fixed set of alpha-diversity measures for a count vector.

    Args:
        count_vec: Sequence of counts. An empty sequence is replaced by a
            vector of ten zeros before the measures are computed.

    Returns:
        Dict with the keys ``"shannon"`` (computed with ``base=2``),
        ``"brillouin_d"``, ``"simpson_d"`` (one minus the value returned by
        ``dv.alpha.simpson``) and ``"simpson_e"``.
    """
    # Explicit length check (not truthiness) so array-like inputs still work.
    counts = [0] * 10 if len(count_vec) == 0 else list(count_vec)
    return {
        "shannon": dv.alpha.shannon(counts, base=2),
        "brillouin_d": dv.alpha.brillouin_d(counts),
        "simpson_d": 1 - dv.alpha.simpson(counts),
        "simpson_e": dv.alpha.simpson_e(counts),
    }