# File: engagement-analyzer-demo5/pipeline/post_processors.py
# Page residue kept for provenance: egumasa, commit message "push", revision 0146ef9
from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable
import pandas as pd
import spacy
from spacy.language import Language
from skbio import diversity as dv
# Span attributes that the table builders in this module read with getattr()
# and stringify, in column order.
SPAN_ATTRS = [
    "text",
    "label_",
    "start",
    "end",
]

# Span label names. "CITATION" is the only one tested explicitly in this
# module; presumably this is the full label set of the span categorizer --
# TODO confirm against the model configuration.
CATEGORIES = [
    "ATTRIBUTION",
    "CITATION",
    "COUNTER",
    "DENY",
    "ENDOPHORIC",
    "ENTERTAIN",
    "JUSTIFYING",
    "MONOGLOSS",
    "PROCLAIM",
    "SOURCES",
]
def simple_table(
    doc: Union[spacy.tokens.Doc, Dict[str, str]],
    spans_key: str = "sc",
    attrs: List[str] = SPAN_ATTRS,
):
    """Build a flat table of the spans stored under ``doc.spans[spans_key]``.

    Args:
        doc: Object exposing ``spans[spans_key]``; the span group must carry
            a ``"scores"`` entry in its ``attrs`` mapping.
        spans_key: Key of the span group to tabulate.
        attrs: Span attribute names to read; each value is stringified.

    Returns:
        ``(data, columns)`` where ``data`` has one row per span (the
        stringified attributes followed by the raw score) and ``columns``
        is ``attrs`` plus ``"Conf. score"``.
    """
    columns = attrs + ["Conf. score"]
    span_group = doc.spans[spans_key]
    rows = []
    # Spans and scores are paired positionally.
    for span, score in zip(span_group, span_group.attrs["scores"]):
        row = [str(getattr(span, name)) for name in attrs]
        row.append(score)  # kept unformatted (not f"{score:.5f}")
        rows.append(row)
    return rows, columns
# def span_info_aggregator()
def construction_classifier(doc, span):
    """Label a span with a coarse grammatical-construction category.

    The label is derived from the dependency relation, POS/tag, lemma and
    morphology of the span's syntactic root, from the root's children, and
    from the tokens inside the span. Rules run top to bottom and later
    rules overwrite earlier ones, so statement order is significant.

    Args:
        doc: Unused; kept so the signature stays compatible with callers.
        span: Span-like object exposing ``root``, ``label_``, ``start``,
            ``end`` and iteration over its tokens.

    Returns:
        The category string, or the ``dep_`` of the (conj-resolved) root
        when no rule fires.

    NOTE(review): the nesting of this function was reconstructed from a
    copy whose indentation had been lost; the few places where more than
    one nesting is syntactically possible are marked ``NOTE(review)``.
    """
    subject_deps = ["nsubj", "nsubjpass", "csubj", "csubjpass"]
    argument_deps = subject_deps + [
        "dobj",
        "ccomp",
        "xcomp",
        "dative",
        "attr",
        "oprd",
        "acomp",
    ]
    pres_participle = "Aspect=Prog|Tense=Pres|VerbForm=Part"
    participle_morphs = [pres_participle, "Aspect=Perf|Tense=Past|VerbForm=Part"]

    category = None
    spanroot = span.root

    ## Grabbing lexico-grammatical information
    # Child-based features are read from the span's own root, i.e. before a
    # conjunct root is replaced by its head further down.
    span_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in span]
    span_dep = [t.dep_ for t in span]
    span_tag = [t.tag_ for t in span]
    c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in spanroot.children]
    c_norm = [c.norm_ for c in spanroot.children]
    c_dep = [c.dep_ for c in spanroot.children]
    c_pos = [c.pos_ for c in spanroot.children]
    c_tag = [c.tag_ for c in spanroot.children]
    right_dep = [c.dep_ for c in spanroot.rights]

    # conditionals
    subjless = all(c.dep_ not in subject_deps for c in spanroot.children)
    argmentless = all(c.dep_ not in argument_deps for c in spanroot.children)
    has_dummy_it = "it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_

    ## nesting classifiers
    # A conjunct is classified by the relation of the first conjunct.
    while spanroot.dep_ == "conj":
        spanroot = spanroot.head

    ## Conjunctions
    if spanroot.dep_ in ["preconj", "cc"]:
        category = "Conjunction"

    ## NOUN PHRASES
    if spanroot.dep_ in ["amod"]:
        category = "Adjectival modifier"
    if spanroot.dep_ in ["compound"]:
        category = "Compound noun"

    ## Nominal category
    if spanroot.dep_ in ["pobj", "dobj", "obj", "iobj", "dative"]:
        if "acl" in c_dep:
            category = "Noun + Complement (Object)"
        else:
            category = "Object"
    if spanroot.dep_ in ["nsubj", "nsubjpass"]:
        if "acl" in c_dep:
            category = "Noun + Complement (Subject)"
        else:
            category = "Subject"

    ## ADJUNCTS
    if spanroot.dep_ in ["prep", "agent"]:
        category = "Prepositional phrase"
    if spanroot.dep_ in ["advmod", "npadvmod", "nmod", "npmod", "quantmod"]:
        category = "Adverbial phrase"

    ## Predication patterns
    if spanroot.dep_ in ["acomp", "oprd"]:
        if "xcomp" in c_dep:
            category = "Subject predicate to-cl"
        else:
            category = "Adjectival complement"

    if spanroot.dep_ in ["attr"]:
        # Recomputed on the (possibly conj-resolved) root; the value is also
        # read by the appos/dep/attr rule near the end.
        subjless = all(c.dep_ not in subject_deps for c in spanroot.children)
        has_expl = "expl" in [c.dep_ for c in spanroot.head.children]
        if has_expl and "no_det" in span_t_dep_:
            category = "There is/are no NOUN"
        elif has_expl and (
            spanroot.pos_ in ["NOUN"] or spanroot.tag_ in ["NN", "NNS"]
        ):
            category = "There is/are + Noun complement"
        elif spanroot.pos_ in ["NOUN", "PRON"]:
            if "acl" in c_dep:
                category = "Noun + Complement (attr)"
            else:
                category = "Nominal complement"
        elif not subjless and spanroot.pos_ in ["VERB", "AUX"]:
            category = "Main verb 4"
        elif spanroot.tag_ in ["NNP"]:
            category = "Nominal complement"

    ####################################
    ### clausal                      ####
    ####################################
    if spanroot.dep_ in ["ROOT", "advcl", "ccomp", "acl", "pcomp", "relcl"]:
        # "to" introducing an xcomp anywhere below the root
        _check_to = [
            c.dep_
            for c in spanroot.subtree
            if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"])
            and c.head.dep_ == "xcomp"
        ]
        # progressive-form xcomp anywhere below the root
        _check_ing = [
            c.dep_
            for c in spanroot.subtree
            if "Prog" in str(c.morph) and c.dep_ == "xcomp"
        ]
        # one flag per ccomp child: does it follow the root?
        root_before_ccomp = [
            c.i > spanroot.i for c in spanroot.children if c.dep_ == "ccomp"
        ]
        # markers / auxiliaries of advcl tokens, e.g. "for_mark"
        _check_for_to = [
            "_".join([c.norm_, c.dep_])
            for c in spanroot.subtree
            if c.head.dep_ == "advcl" and (c.dep_ == "mark" or c.dep_ == "aux")
        ]

        ## Start with broad category, which is then re-evaluated for
        ## specific constructions.
        if spanroot.dep_ in ["advcl", "acl", "pcomp"]:
            ## Adverbial clauses
            subjless = all(c.dep_ not in subject_deps for c in spanroot.children)
            # NOTE(review): span.end is an exclusive offset in spaCy while
            # right_edge.i is the index of a token, so this is only true
            # when the subtree reaches one token past the span -- confirm
            # whether ``span.end - 1`` was intended.
            entire_cl = (
                spanroot.left_edge.i == span.start
                and spanroot.right_edge.i == span.end
            )
            root_is_verbal = spanroot.pos_ in ["VERB", "AUX"]

            ### Finite-adverbial clauses
            if "mark" in span_dep and (root_is_verbal or "aux" in span_dep):
                category = "Finite adverbial clause"
            elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
                # NOTE(review): the last two arms are assumed to belong to
                # this inner chain -- confirm the nesting.
                if root_is_verbal:
                    category = "Finite adverbial clause"
                elif subjless:
                    category = "Non-finite adv clause 1"
                elif entire_cl:
                    category = "Finite adverbial clause"
            ### Non-finite adverbial clauses
            elif str(spanroot.morph) in participle_morphs and "aux" not in c_dep:
                # he doing his job
                if argmentless:
                    # e.g., frankly speaking, strictly speaking
                    category = "Adverbial Phrase"
                else:
                    category = "Non-finite adv clause 2"
            elif not root_is_verbal and "mark" in span_dep and subjless:
                category = "Non-finite adv clause 3"
            elif "aux" in c_dep and "TO" in c_tag:
                category = "Adverbial Phrase"
            elif "mark" not in span_dep and root_is_verbal:
                category = "Dependent Verb phrase"
            elif not argmentless:
                category = "Adverbial clause"
            elif spanroot.dep_ == "advcl":
                category = "Adverbial phrase"

        if spanroot.dep_ in ["relcl", "ccomp", "acl"]:
            head = spanroot.head
            if ";" in [t.norm_ for t in head.children]:
                category = "Main verb 3"
            elif "nsubj" not in span_dep:
                category = "Dependent verb 1"
            elif "mark" in span_dep:
                category = "Complement clause"
            elif str(spanroot.morph) in participle_morphs and "aux" not in c_dep:
                category = "Non-finite complement clause"
            elif spanroot.dep_ in ["relcl"]:
                category = "Relative clause"
            elif spanroot.dep_ in ["ccomp"]:
                category = "Complement clause"
            else:
                # only "acl" is left at this point
                category = "Noun Complement clause"

        ## Specific constructions
        # Extraposed that-clause or to-infinitives
        if has_dummy_it and spanroot.pos_ in ["VERB", "AUX"]:
            if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
                # eg it seems odd (oprd) that X.
                # eg it is certain (acomp) that X.
                category = "Extraposed that-cl (adj-complement)"
            elif "xcomp" in c_dep or "advcl" in c_dep:
                if "for_mark" in _check_for_to:
                    category = "Extraposed to-cl (explicit subj)"
                elif _check_to:
                    category = "Extraposed to-cl 1"  # eg It is possible to .
                elif _check_ing:
                    category = "Extraposed -ing 1"
            elif (
                ("prep" in right_dep or "npadvmod" in right_dep)
                and "ccomp" in right_dep
                and spanroot.lemma_ == "be"
            ):
                category = "Cleft construction"
            elif "attr" in c_dep:
                category = "Extraposed that-cl (copula)"  # eg It is a wonder that X.
            else:
                category = "Extraposed that-cl (VERB)"  # eg it has been shown that X.
        elif has_dummy_it and "acomp" in c_dep:
            # NOTE(review): the ``else`` is assumed to pair with the xcomp
            # test, not with the to/-ing test -- confirm the nesting.
            if "xcomp" in c_dep:
                if _check_to:
                    category = "Extraposed to-cl 2"  # eg it is difficult to decide.
                elif _check_ing:
                    category = "Extraposed -ing 2"
            else:
                category = "Extraposed that-cl (adj-complement) 2"
        elif has_dummy_it and "oprd" in c_dep:
            category = "Extraposed that-cl (adj-complement) 3"  # eg it seems odd that X.
        # something without dummy subject "it"
        elif (
            (("nsubj" in c_dep and spanroot.lemma_ in ["be"]) or "nsubjpass" in c_dep)
            and spanroot.pos_ in ["AUX", "VERB"]
            and "it" not in c_norm
        ):
            # xcomp / ccomp tokens whose head is an adjectival complement
            _check_xcomp = [
                c.dep_
                for c in spanroot.subtree
                if c.dep_ in ["xcomp"] and c.head.dep_ == "acomp"
            ]
            _check_ccomp = [
                c.dep_
                for c in spanroot.subtree
                if c.dep_ in ["ccomp"] and c.head.dep_ == "acomp"
            ]
            has_predicative = "attr" in c_dep or "acomp" in c_dep
            if has_predicative and "ccomp" in c_dep:
                if any(root_before_ccomp):
                    category = "Post-predicate that-cl"
                else:
                    category = "Comment clause"
            elif has_predicative and "ccomp" in _check_ccomp:
                category = "Post-predicate that-cl 2"
            elif has_predicative and "xcomp" in _check_xcomp:
                category = "Post-predicate to-cl"
            elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_to:
                category = "Subject predicate to-cl"
            elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
                category = "Subject predicate to-cl (passive)"
            elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_ing:
                category = "Subject predicate -ing"
            elif "ccomp" in c_dep:
                category = "Subject predicate that-cl"
            elif "acomp" in c_dep:
                category = "Adjectival predicate"
            elif "mark" in c_dep and ("nsubj" in c_dep or "nsubjpass" in c_dep):
                category = "Finite-adverbial clause"
            else:
                category = "Main verb 1"
        ## without dummy subject it, and lexical verbs
        # The subject test used to be wrapped in a stray ``... in c_dep``,
        # which looked a bool up in a list of strings and so never matched.
        elif (
            ("nsubj" in c_dep or "nsubjpass" in c_dep)
            and spanroot.pos_ in ["AUX", "VERB"]
            and "it" not in c_norm
            and spanroot.lemma_ not in ["be"]
        ):
            # wh-word inside a ccomp
            _check_wh = [
                c.dep_
                for c in spanroot.subtree
                if (
                    c.dep_ in ["attr", "advmod", "dobj", "nsubj"]
                    and c.tag_ in ["WP", "WRB", "WDT", "WP$"]
                )
                and c.head.dep_ == "ccomp"
            ]
            # "whether" / "if" marking a ccomp
            _check_if = [
                c.dep_
                for c in spanroot.subtree
                if (c.dep_ in ["mark"] and c.norm_ in ["whether", "if"])
                and c.head.dep_ == "ccomp"
            ]
            if "ccomp" in c_dep and (_check_wh or _check_if):
                category = "Post-predicate wh-cl"
            elif "ccomp" in c_dep:
                if any(root_before_ccomp):
                    category = "Post-predicate that-cl"
                else:
                    category = "Comment clause"
            elif "xcomp" in c_dep:
                if _check_to:
                    category = "Post-predicate to-cl"
                elif _check_ing:
                    category = "Post-predicate -ing"
        # Existential
        elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
            category = "There is/are NOUN"
        elif (
            "ccomp" in c_dep
            and "it_nsubj" in span_t_dep_
            and spanroot.pos_ in ["AUX"]
        ):
            category = "Cleft construction"

    if spanroot.dep_ in ["parataxis"]:
        if "_".join(span_dep) in [
            "nsubj_parataxis",
            "aux_parataxis",
            "nsubj_aux_parataxis",
        ]:
            category = "Comment clause"
        else:
            category = "parataxis (for now)"

    ## External comp
    if spanroot.dep_ in ["xcomp"]:
        if spanroot.head.pos_ == "ADJ" and "to_aux" in c_t_dep_:
            category = "Adjective complement to-cl"
        if spanroot.head.pos_ == "VERB" and "to_aux" in c_t_dep_:
            category = "Verb complement to-cl"

    if spanroot.dep_ in ["pcomp"]:
        if str(spanroot.morph) == pres_participle:
            if "ccomp" in c_dep:
                category = "Participle + that-cl"
            else:
                category = "Participle"

    ## Simple classifier
    if spanroot.dep_ in ["neg"]:
        category = "Negative particle"
    if spanroot.dep_ in ["aux", "auxpass"]:
        category = "Auxiliary"
    # Modal verbs
    if spanroot.tag_ == "MD":
        category = "Modal auxiliary"

    if spanroot.dep_ in ["dep", "csubj", "csubjpass"]:
        if (
            spanroot.head.dep_ in ["ROOT", "ccomp"]
            and spanroot.head.pos_ in ["AUX", "VERB"]
            and spanroot.pos_ in ["AUX", "VERB"]
        ):
            if spanroot.morph == spanroot.head.morph:
                category = "Main verb 4"
            else:
                category = "Dependent verb 2"
        elif str(spanroot.morph) == pres_participle:
            category = "Gerund"
        elif spanroot.head.dep_ in ["conj", "acl", "relcl"]:
            if spanroot.morph == spanroot.head.morph:
                category = "Main verb 4"
            else:
                category = "Dependent verb 2"
        elif "VerbForm=Fin" in str(spanroot.morph):
            category = "Dependent verb 2"

    # Appositive phrases
    if spanroot.dep_ in ["appos"]:
        if "nummod" in c_dep:
            category = "Apposition"
        elif spanroot.pos_ in ["PROPN"]:
            category = "Appositive Proper Nouns"
        elif spanroot.pos_ in ["NOUN"]:
            category = "Appositive Noun Phrase"
        elif spanroot.pos_ in ["VERB", "AUX"]:
            if any(c.dep_ in subject_deps for c in spanroot.children):
                category = "Appositive Finite-clause"

    if spanroot.dep_ in ["appos", "dep", "attr"]:
        if not subjless and spanroot.pos_ in ["VERB", "AUX"]:
            category = "Main verb 5"

    if spanroot.dep_ in ["dep", "mark"]:
        if spanroot.tag_ in ["RB", "IN", "CC"]:
            category = "Conjunction"

    # sometimes the extra-clausal links are not accurate
    # This rule is unconditional, so it replaces any label chosen above for
    # roots with one of these relations.
    if spanroot.dep_ in ["aux", "auxpass", "oprd", "appos", "xcomp"]:
        if spanroot.head.dep_ == "ROOT":
            category = "Main verb"
        else:
            category = "dependent verb 5"

    if span.label_ == "CITATION":
        # NOTE(review): "Other Citation" is assumed to be the fallback of
        # the innermost chain -- confirm the nesting.
        if "NNP" in span_tag or "NNPS" in span_tag:
            if span_dep[0] == "punct" and span_dep[-1] == "punct":
                category = "Parenthetical Citation"
            elif span_tag[0] in ["NNP", "NNPS"]:
                category = "Narrative Citation"
            else:
                category = "Other Citation"

    # Fall back to the raw dependency label when no rule matched.
    if category is None:
        category = spanroot.dep_
    return category
def construction_classifier2(doc, span):
    """Label a span with a coarse grammatical-construction category.

    The label is derived from the dependency relation, POS/tag, lemma and
    morphology of the span's syntactic root, from the root's children, and
    from the tokens inside the span. Rules run top to bottom and later
    rules overwrite earlier ones, so statement order is significant.

    Args:
        doc: Unused; kept so the signature stays compatible with callers.
        span: Span-like object exposing ``root``, ``label_`` and iteration
            over its tokens.

    Returns:
        The category string, or the ``dep_`` of the (conj-resolved) root
        when no rule fires.

    NOTE(review): the nesting of this function was reconstructed from a
    copy whose indentation had been lost; the few places where more than
    one nesting is syntactically possible are marked ``NOTE(review)``.
    """
    subject_deps = ["nsubj", "nsubjpass", "csubj", "csubjpass"]
    argument_deps = subject_deps + [
        "dobj",
        "ccomp",
        "xcomp",
        "dative",
        "attr",
        "oprd",
        "acomp",
    ]
    pres_participle = "Aspect=Prog|Tense=Pres|VerbForm=Part"
    participle_morphs = [pres_participle, "Aspect=Perf|Tense=Past|VerbForm=Part"]

    category = None
    spanroot = span.root

    ## Grabbing lexico-grammatical information
    # Child-based features are read from the span's own root, i.e. before a
    # conjunct root is replaced by its head further down.
    span_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in span]
    span_dep = [t.dep_ for t in span]
    span_tag = [t.tag_ for t in span]
    c_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in spanroot.children]
    c_norm = [c.norm_ for c in spanroot.children]
    c_dep = [c.dep_ for c in spanroot.children]
    c_pos = [c.pos_ for c in spanroot.children]
    c_tag = [c.tag_ for c in spanroot.children]
    right_dep = [c.dep_ for c in spanroot.rights]

    # conditionals
    subjless = all(c.dep_ not in subject_deps for c in spanroot.children)
    argmentless = all(c.dep_ not in argument_deps for c in spanroot.children)
    has_dummy_it = "it_nsubjpass" in c_t_dep_ or "it_nsubj" in c_t_dep_

    ## nesting classifiers
    # A conjunct is classified by the relation of the first conjunct.
    while spanroot.dep_ == "conj":
        spanroot = spanroot.head

    if spanroot.dep_ == "poss":
        head = spanroot.head
        if head.dep_ in ["pobj", "dobj", "obj", "iobj", "dative"]:
            category = "Posessive Noun (Object)"
        elif head.dep_ in ["nsubj", "nsubjpass"]:
            category = "Posessive Noun (Subject)"
        else:
            category = "Posessive Noun (Other)"

    ## Conjunctions
    if spanroot.dep_ in ["preconj", "cc"]:
        category = "Conjunction"

    ## NOUN PHRASES
    if spanroot.dep_ in ["amod"]:
        category = "Adjectival modifier"
    if spanroot.dep_ in ["compound"]:
        category = "Compound noun"

    ## Nominal category
    if spanroot.dep_ in ["pobj", "dobj", "obj", "iobj", "dative"]:
        if "acl" in c_dep:
            category = "Noun + Complement (Object)"
        else:
            category = "Object"
    if spanroot.dep_ in ["nsubj", "nsubjpass"]:
        if "acl" in c_dep:
            category = "Noun + Complement (Subject)"
        else:
            category = "Subject"

    ## ADJUNCTS
    if spanroot.dep_ in ["prep", "agent"]:
        category = "Prepositional phrase"
    if spanroot.dep_ in ["advmod", "npadvmod", "nmod", "npmod", "quantmod", "nummod"]:
        category = "Adverbial phrase"

    ## Predication patterns
    if spanroot.dep_ in ["acomp", "oprd"]:
        if "xcomp" in c_dep:
            category = "Subject predicate to-cl"
        else:
            category = "Adjectival complement"

    if spanroot.dep_ in ["attr"]:
        # Recomputed on the (possibly conj-resolved) root; the value is also
        # read by the appos/dep/attr rule near the end.
        subjless = all(c.dep_ not in subject_deps for c in spanroot.children)
        has_expl = "expl" in [c.dep_ for c in spanroot.head.children]
        if has_expl and "no_det" in span_t_dep_:
            category = "There is/are no NOUN"
        elif has_expl and (
            spanroot.pos_ in ["NOUN"] or spanroot.tag_ in ["NN", "NNS"]
        ):
            category = "There is/are + Noun complement"
        elif spanroot.pos_ in ["NOUN", "PRON"]:
            if "acl" in c_dep:
                category = "Noun + Complement (attr)"
            else:
                category = "Nominal complement"
        elif not subjless and spanroot.pos_ in ["VERB", "AUX"]:
            category = "Main verb 4"
        elif spanroot.tag_ in ["NNP"]:
            category = "Nominal complement"

    ## External comp
    if spanroot.dep_ in ["xcomp"]:
        if spanroot.head.pos_ == "ADJ" and "to_aux" in c_t_dep_:
            category = "Adjective complement to-cl"
        if spanroot.head.pos_ == "VERB" and "to_aux" in c_t_dep_:
            category = "Verb complement to-cl"

    if spanroot.dep_ in ["pcomp"]:
        if str(spanroot.morph) == pres_participle:
            if "ccomp" in c_dep:
                category = "Participle + that-cl"
            else:
                category = "Participle"

    ## Simple classifier
    if spanroot.dep_ in ["neg"]:
        category = "Negative particle"
    if spanroot.dep_ in ["aux", "auxpass"]:
        category = "Auxiliary"
    # Modal verbs
    if spanroot.tag_ == "MD":
        category = "Modal auxiliary"

    ####################################
    ### clausal                      ####
    ####################################
    if spanroot.dep_ in ["ROOT", "advcl", "ccomp", "acl", "pcomp", "relcl", "punct"]:
        # "to" introducing an xcomp anywhere below the root
        _check_to = [
            c.dep_
            for c in spanroot.subtree
            if (c.dep_ in ["aux"] and c.pos_ in ["PART", "SCONJ"])
            and c.head.dep_ == "xcomp"
        ]
        # progressive-form xcomp anywhere below the root
        _check_ing = [
            c.dep_
            for c in spanroot.subtree
            if "Prog" in str(c.morph) and c.dep_ == "xcomp"
        ]
        # one flag per ccomp child: does it follow the root?
        root_before_ccomp = [
            c.i > spanroot.i for c in spanroot.children if c.dep_ == "ccomp"
        ]
        # markers / auxiliaries of advcl tokens, e.g. "for_mark"
        _check_for_to = [
            "_".join([c.norm_, c.dep_])
            for c in spanroot.subtree
            if c.head.dep_ == "advcl" and (c.dep_ == "mark" or c.dep_ == "aux")
        ]

        ## Start with broad category, which is then re-evaluated for
        ## specific constructions.
        if spanroot.dep_ in ["advcl", "acl", "punct", "pcomp"]:
            ## Adverbial clauses
            subjless = all(c.dep_ not in subject_deps for c in spanroot.children)
            root_is_verbal = spanroot.pos_ in ["VERB", "AUX"]

            ### Finite-adverbial clauses
            if "mark" in span_dep and (root_is_verbal or "aux" in span_dep):
                category = "Finite adverbial clause"
            elif "advmod" in span_dep and ("WRB" in span_tag or "WDT" in span_tag):
                # NOTE(review): the last two arms are assumed to belong to
                # this inner chain -- confirm the nesting.
                if root_is_verbal:
                    category = "Finite adverbial clause"
                elif subjless:
                    category = "Non-finite adv clause 1"
                elif not argmentless:
                    category = "Finite adverbial clause"
            ## non-finite
            elif str(spanroot.morph) in participle_morphs and "aux" not in c_dep:
                # he doing his job
                if argmentless:
                    # e.g., frankly speaking, strictly speaking
                    category = "Adverbial Phrase"
                else:
                    category = "Non-finite adv clause 2"
            elif not root_is_verbal and "mark" in span_dep and subjless:
                category = "Non-finite adv clause 3"
            elif "aux" in c_dep and "TO" in c_tag:
                category = "Adverbial Phrase"
            elif "mark" not in span_dep and root_is_verbal:
                category = "Dependent Verb phrase"
            elif not argmentless:
                category = "Adverbial clause"
            elif spanroot.dep_ == "advcl":
                category = "Adverbial phrase"
            else:
                # The trailing space is part of the existing label.
                category = "Finite adverbial clause "

        if spanroot.dep_ in ["relcl", "ccomp", "acl", "punct", "pcomp"]:
            head = spanroot.head
            if ";" in [t.norm_ for t in head.children]:
                category = "Main verb 3"
            elif "nsubj" not in span_dep:
                category = "Dependent verb 1"
            elif "mark" in span_dep:
                category = "Complement clause"
            elif str(spanroot.morph) in participle_morphs and "aux" not in c_dep:
                category = "Non-finite complement clause"
            elif spanroot.dep_ in ["relcl"]:
                category = "Relative clause"
            elif spanroot.dep_ in ["ccomp"]:
                category = "Complement clause"
            elif spanroot.dep_ in ["acl"]:
                category = "Noun Complement clause"

        ## Specific constructions
        # Extraposed that-clause or to-infinitives
        if has_dummy_it and spanroot.pos_ in ["VERB", "AUX"]:
            if ("acomp" in c_dep or "oprd" in c_dep) and "ccomp" in c_dep:
                # eg it seems odd (oprd) that X.
                # eg it is certain (acomp) that X.
                category = "Extraposed that-cl (adj-complement)"
            elif "xcomp" in c_dep or "advcl" in c_dep:
                if "for_mark" in _check_for_to:
                    category = "Extraposed to-cl (explicit subj)"
                elif _check_to:
                    category = "Extraposed to-cl 1"  # eg It is possible to .
                elif _check_ing:
                    category = "Extraposed -ing 1"
            elif (
                ("prep" in right_dep or "npadvmod" in right_dep)
                and "ccomp" in right_dep
                and spanroot.lemma_ == "be"
            ):
                category = "Cleft construction"
            elif "attr" in c_dep:
                category = "Extraposed that-cl (copula)"  # eg It is a wonder that X.
            else:
                category = "Extraposed that-cl (VERB)"  # eg it has been shown that X.
        elif has_dummy_it and "acomp" in c_dep:
            # NOTE(review): the ``else`` is assumed to pair with the xcomp
            # test, not with the to/-ing test -- confirm the nesting.
            if "xcomp" in c_dep:
                if _check_to:
                    category = "Extraposed to-cl 2"  # eg it is difficult to decide.
                elif _check_ing:
                    category = "Extraposed -ing 2"
            else:
                category = "Extraposed that-cl (adj-complement) 2"
        elif has_dummy_it and "oprd" in c_dep:
            category = "Extraposed that-cl (adj-complement) 3"  # eg it seems odd that X.
        # something without dummy subject "it"
        elif (
            (("nsubj" in c_dep and spanroot.lemma_ in ["be"]) or "nsubjpass" in c_dep)
            and spanroot.pos_ in ["AUX", "VERB"]
            and "it" not in c_norm
        ):
            # xcomp / ccomp tokens whose head is an adjectival complement
            _check_xcomp = [
                c.dep_
                for c in spanroot.subtree
                if c.dep_ in ["xcomp"] and c.head.dep_ == "acomp"
            ]
            _check_ccomp = [
                c.dep_
                for c in spanroot.subtree
                if c.dep_ in ["ccomp"] and c.head.dep_ == "acomp"
            ]
            has_predicative = "attr" in c_dep or "acomp" in c_dep
            if has_predicative and "ccomp" in c_dep:
                if any(root_before_ccomp):
                    category = "Post-predicate that-cl"
                else:
                    category = "Comment clause"
            elif has_predicative and "ccomp" in _check_ccomp:
                category = "Post-predicate that-cl 2"
            elif has_predicative and "xcomp" in _check_xcomp:
                category = "Post-predicate to-cl"
            elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_to:
                category = "Subject predicate to-cl"
            elif "xcomp" in c_dep and "auxpass" in c_dep and _check_to:
                category = "Subject predicate to-cl (passive)"
            elif "xcomp" in c_dep and spanroot.lemma_ in ["be"] and _check_ing:
                category = "Subject predicate -ing"
            elif "ccomp" in c_dep:
                category = "Subject predicate that-cl"
            elif "acomp" in c_dep:
                category = "Adjectival predicate"
            elif "mark" in c_dep and ("nsubj" in c_dep or "nsubjpass" in c_dep):
                category = "Finite-adverbial clause"
            elif not argmentless and "SCONJ" in c_pos:
                category = "Finite-adverbial clause"
            else:
                category = "Main verb 1"
        ## without dummy subject it, and lexical verbs
        # The subject test used to be wrapped in a stray ``... in c_dep``,
        # which looked a bool up in a list of strings and so never matched.
        elif (
            ("nsubj" in c_dep or "nsubjpass" in c_dep)
            and spanroot.pos_ in ["AUX", "VERB"]
            and "it" not in c_norm
            and spanroot.lemma_ not in ["be"]
        ):
            # wh-word inside a ccomp
            _check_wh = [
                c.dep_
                for c in spanroot.subtree
                if (
                    c.dep_ in ["attr", "advmod", "dobj", "nsubj"]
                    and c.tag_ in ["WP", "WRB", "WDT", "WP$"]
                )
                and c.head.dep_ == "ccomp"
            ]
            # "whether" / "if" marking a ccomp
            _check_if = [
                c.dep_
                for c in spanroot.subtree
                if (c.dep_ in ["mark"] and c.norm_ in ["whether", "if"])
                and c.head.dep_ == "ccomp"
            ]
            if "ccomp" in c_dep and (_check_wh or _check_if):
                category = "Post-predicate wh-cl"
            elif "ccomp" in c_dep:
                if any(root_before_ccomp):
                    category = "Post-predicate that-cl"
                else:
                    category = "Comment clause"
            elif "xcomp" in c_dep:
                if _check_to:
                    category = "Post-predicate to-cl"
                elif _check_ing:
                    category = "Post-predicate -ing"
        # Existential
        elif "expl" in c_dep and "NOUN" in c_pos and "mark" not in c_dep:
            category = "There is/are NOUN"
        elif (
            "ccomp" in c_dep
            and "it_nsubj" in span_t_dep_
            and spanroot.pos_ in ["AUX"]
        ):
            category = "Cleft construction"

    ### The end of clausal analysis
    if spanroot.dep_ in ["parataxis"]:
        if "_".join(span_dep) in [
            "nsubj_parataxis",
            "aux_parataxis",
            "nsubj_aux_parataxis",
        ]:
            category = "Comment clause"
        else:
            category = "Parataxis"

    if spanroot.dep_ in ["dep", "csubj", "csubjpass"]:
        if (
            spanroot.head.dep_ in ["ROOT", "ccomp"]
            and spanroot.head.pos_ in ["AUX", "VERB"]
            and spanroot.pos_ in ["AUX", "VERB"]
        ):
            if spanroot.morph == spanroot.head.morph:
                category = "Main verb 4"
            else:
                category = "Dependent verb 2"
        elif str(spanroot.morph) == pres_participle:
            category = "Gerund"
        elif "VerbForm=Fin" in str(spanroot.morph) or "VerbForm=Inf" in str(
            spanroot.morph
        ):
            category = "Dependent verb 2"
        elif spanroot.dep_ in ["csubj", "csubjpass"]:
            category = "Dependent verb (csubj)"

    # Appositive phrases
    if spanroot.dep_ in ["appos"]:
        if "nummod" in c_dep:
            category = "Apposition"
        # Separate chain: a POS-based label replaces "Apposition".
        if spanroot.pos_ in ["PROPN"]:
            category = "Appositive Proper Nouns"
        elif spanroot.pos_ in ["NOUN"]:
            category = "Appositive Noun Phrase"
        elif spanroot.pos_ in ["VERB", "AUX"]:
            if any(c.dep_ in subject_deps for c in spanroot.children):
                category = "Appositive Finite-clause"

    if spanroot.dep_ in ["appos", "dep", "attr"]:
        if not subjless and spanroot.pos_ in ["VERB", "AUX"]:
            category = "Main verb (likely parsing error)"

    # sometimes the dep are on the conjunctions
    if spanroot.dep_ in ["dep", "mark"]:
        if spanroot.tag_ in ["RB", "IN", "CC"]:
            category = "Conjunction"

    if spanroot.dep_ in ["intj"]:
        category = "Introjection"

    # sometimes the extra-clausal links are not accurate
    # Only applied when nothing above has produced a label.
    if (
        spanroot.dep_
        in ["aux", "auxpass", "oprd", "appos", "xcomp", "attr", "dep", "meta", "prt"]
        and category is None
    ):
        if spanroot.head.dep_ == "ROOT":
            category = "Main verb"
        else:
            category = "dependent verb 5"

    if span.label_ == "CITATION":
        # NOTE(review): "Other Citation" is assumed to be the fallback of
        # the innermost chain -- confirm the nesting.
        if "NNP" in span_tag or "NNPS" in span_tag:
            if span_dep[0] == "punct" and span_dep[-1] == "punct":
                category = "Parenthetical Citation"
            elif span_tag[0] in ["NNP", "NNPS"]:
                category = "Narrative Citation"
            else:
                category = "Other Citation"

    # Fall back to the raw dependency label when no rule matched.
    if category is None:
        category = spanroot.dep_
    return category
def const_table(
    doc: Union[spacy.tokens.Doc, Dict[str, str]],
    spans_key: str = "sc",
    attrs: List[str] = SPAN_ATTRS,
):
    """Build a detailed table of the spans stored under ``doc.spans[spans_key]``.

    Each row holds the stringified ``attrs`` of a span, its score, the
    index of its sentence, the label returned by
    ``construction_classifier2`` and several features of the span and of
    its syntactic root.

    Args:
        doc: Object exposing ``sents`` and ``spans[spans_key]``; the span
            group must carry a ``"scores"`` entry in its ``attrs`` mapping.
        spans_key: Key of the span group to tabulate.
        attrs: Span attribute names to read; each value is stringified.

    Returns:
        ``(data, columns)``: a list of rows and the matching column names.
    """
    extra_columns = [
        "Conf. score",
        "sent no.",
        "grammatical realization",
        "span dep",
        "ner",
        "POS",
        "span dep seq",
        "TAG sequence",
        "POS sequence",
        "head",
        "head dep",
        "children",
        "morphology",
        "sent",
    ]
    columns = attrs + extra_columns

    # Sentence -> running index, used for the "sent no." column.
    sentence_index = {sent: idx for idx, sent in enumerate(doc.sents)}

    span_group = doc.spans[spans_key]
    rows = []
    # Spans and scores are paired positionally.
    for span, score in zip(span_group, span_group.attrs["scores"]):
        row = [str(getattr(span, name)) for name in attrs]
        row += [
            score,
            int(sentence_index[span.sent]),
            construction_classifier2(doc, span),
            span.root.dep_,
            span.root.ent_type_,
            span.root.tag_,  # the "POS" column is filled from the fine-grained tag
            "_".join([tok.dep_ for tok in span]),
            "_".join([tok.tag_ for tok in span]),
            "_".join([tok.pos_ for tok in span]),
            span.root.head.norm_,
            span.root.head.dep_,
            "_".join([child.dep_ for child in span.root.children]),
            str(span.root.morph),
            span.sent.text.strip(),
        ]
        rows.append(row)
    return rows, columns
def ngrammar(seq: list, n=2, concat=False, sep="-"):
    """Return every contiguous window of ``n`` items of ``seq``, in order.

    Args:
        seq: Sliceable sequence to cut into windows.
        n: Window length.
        concat: When true, each window is joined into one string with
            ``sep`` (the items must then be strings).
        sep: Separator used when ``concat`` is true.

    Returns:
        A list of slices of ``seq``, or of joined strings when ``concat``
        is true; empty when ``seq`` has fewer than ``n`` items.
    """
    total = len(seq)
    # A start position qualifies only if a full window fits from there.
    windows = [
        seq[start : start + n] for start in range(total) if start + n <= total
    ]
    if concat:
        return [sep.join(window) for window in windows]
    return windows
def diversity_values(count_vec: list):
    """Compute alpha-diversity measures for a vector of counts.

    Args:
        count_vec: Sized collection of counts. An empty collection is
            replaced by ten zeros (presumably one per label category --
            TODO confirm).

    Returns:
        Dict with the keys ``"shannon"`` (base 2), ``"brillouin_d"``,
        ``"simpson_d"`` (one minus the value returned by
        ``dv.alpha.simpson``) and ``"simpson_e"``.
    """
    if len(count_vec) == 0:
        count_vec = [0] * 10
    # Every measure receives its own list copy of the counts.
    return {
        "shannon": dv.alpha.shannon(list(count_vec), base=2),
        "brillouin_d": dv.alpha.brillouin_d(list(count_vec)),
        "simpson_d": 1 - dv.alpha.simpson(list(count_vec)),
        "simpson_e": dv.alpha.simpson_e(list(count_vec)),
    }