|
|
|
from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable |
|
import pandas as pd |
|
import spacy |
|
from spacy.language import Language |
|
|
|
# Default span attributes read for every table row; the table builders below
# use this list as the leading columns when no ``attrs`` argument is given.
SPAN_ATTRS = [
    "text",
    "label_",
    "start",
    "end",
]
|
|
|
|
|
def simple_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
                 spans_key: str = "sc",
                 attrs: List[str] = SPAN_ATTRS):
    """Build a plain table (rows plus header) for one span group of ``doc``.

    Args:
        doc: Object exposing ``doc.spans[spans_key]``.
            NOTE(review): the annotation also allows a dict, but the body
            reads ``doc.spans`` -- confirm whether dict input is really meant
            to be supported.
        spans_key: Key of the span group to tabulate.
        attrs: Names of span attributes to export; each value is converted
            with ``str()``.

    Returns:
        A ``(data, columns)`` tuple. ``columns`` is ``attrs`` followed by
        ``"Conf. score"``. ``data`` holds one list per span: the stringified
        attributes followed by the matching entry of the span group's
        ``attrs['scores']``. Spans and scores are paired with ``zip``, so the
        shorter of the two decides the number of rows.
    """
    header = attrs + ["Conf. score"]

    rows = []
    for span, score in zip(doc.spans[spans_key],
                           doc.spans[spans_key].attrs['scores']):
        cells = [str(getattr(span, name)) for name in attrs]
        cells.append(score)
        rows.append(cells)

    return rows, header
|
|
|
|
|
|
|
|
|
def construction_classifier(doc, span):
    """Name the grammatical construction that ``span`` realises.

    The label starts out as the dependency relation of ``span.root`` and is
    then overwritten by a sequence of rules. Later rules win over earlier
    ones, so the order of the stages below matters.

    Args:
        doc: Not used by the rules; kept so the call signature stays the same.
        span: Iterable of tokens exposing ``root`` and ``label_``. Tokens are
            read through ``norm_``, ``dep_``, ``tag_``, ``pos_``, ``morph``,
            ``head`` and ``children``.

    Returns:
        A descriptive label such as ``"Subject"`` or ``"Main verb"``, or a raw
        dependency label when no rule applies.
    """
    pres_part_morph = "Aspect=Prog|Tense=Pres|VerbForm=Part"
    verbal_pos = ("AUX", "VERB")

    anchor = span.root
    label = anchor.dep_  # fallback: dependency label of the span's own root

    # Token-level views of the span.
    norm_dep_pairs = [f"{tok.norm_}_{tok.dep_}" for tok in span]
    dep_seq = [tok.dep_ for tok in span]
    norm_seq = [tok.norm_ for tok in span]
    tag_seq = [tok.tag_ for tok in span]

    # Children are collected from the span's own root, i.e. BEFORE the anchor
    # is moved up the tree below.
    child_deps = []
    child_pos = []
    for child in anchor.children:
        child_deps.append(child.dep_)
        child_pos.append(child.pos_)

    # Climb out of coordination first, then out of possessive modifiers, so
    # the rules look at the function of the governing token.
    for transparent_dep in ("conj", "poss"):
        while anchor.dep_ == transparent_dep:
            anchor = anchor.head

    dep = anchor.dep_
    has_it_subject = ("it_nsubjpass" in norm_dep_pairs
                      or "it_nsubj" in norm_dep_pairs)

    # --- Stage 1: rules evaluated before the modal check -------------------
    if dep == "pcomp":
        if str(anchor.morph) == pres_part_morph:
            label = "Gerund"
    elif dep in ("pobj", "dobj", "obj", "iobj"):
        label = "Object"
    elif dep in ("nsubj", "nsubjpass"):
        label = "Subject"
    elif dep == "cc":
        label = "Coordinating conjunction"
    elif dep in ("ROOT", "advcl"):
        passive_clause = "ccomp" in child_deps and "auxpass" in child_deps
        predicative = "nsubj" in child_deps and (
            "acomp" in child_deps or "oprd" in child_deps)
        if has_it_subject and (passive_clause or predicative):
            label = "It is X that-clause"
        elif ("nsubj" in child_deps and "it" in norm_seq
                and anchor.pos_ == "VERB"):
            label = "It VERB that-clause"
        elif "expl" in child_deps and "NOUN" in child_pos:
            label = "There is/are NOUN"
        elif anchor.pos_ in verbal_pos:
            label = "Main verb"
        else:
            label = dep
    elif dep == "attr":
        governor_child_deps = [c.dep_ for c in anchor.head.children]
        if "expl" in governor_child_deps and "no_det" in norm_dep_pairs:
            label = "There is/are no NOUN"

    # --- Stage 2: a modal tag overrides everything decided so far ----------
    if anchor.tag_ == "MD":
        label = "Modal auxiliary"

    # --- Stage 3: rules that in turn override the modal label --------------
    direct_labels = {
        "prep": 'Prepositional Phrase',
        "advmod": "Adverbial modifier",
        "acomp": "Adjectival complement",
        "neg": "Negative particle",
        "preconj": "Conjunction",
    }
    if dep in direct_labels:
        label = direct_labels[dep]
    elif dep in ("advcl", "mark", "acl"):
        if "mark" in dep_seq:
            label = "Finite adverbial clause"
        # Checked independently: it replaces the finite label when both hold.
        if str(anchor.morph) == pres_part_morph and "aux" not in child_deps:
            label = "Non-finite adv clause"
    elif dep in ("relcl", "ccomp"):
        governor = anchor.head
        if any(sibling.norm_ == ";" for sibling in governor.children):
            label = "Main verb"
        elif "nsubj" not in dep_seq:
            label = "Dependent verb"
    elif dep == "dep":
        governor = anchor.head
        if (governor.dep_ in ("ROOT", "ccomp")
                and governor.pos_ in verbal_pos
                and anchor.pos_ in verbal_pos):
            if anchor.morph == governor.morph:
                label = "Main verb"
            else:
                label = "Dependent verb"

    # --- Stage 4: citation spans containing a proper noun ------------------
    if span.label_ == "CITATION" and ("NNP" in tag_seq or "NNPS" in tag_seq):
        if dep_seq[0] == "punct" and dep_seq[-1] == "punct":
            label = "Parenthetical Citation"
        elif tag_seq[0] in ("NNP", "NNPS"):
            label = "Narrative Citation"
        else:
            label = "Other Citation"

    return label
|
|
|
|
|
def const_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
                spans_key: str = "sc",
                attrs: List[str] = SPAN_ATTRS):
    """Build a detailed table describing each span of one span group.

    Every row starts with the stringified ``attrs`` of the span and its score
    from the span group's ``attrs['scores']``, followed by grammatical
    details taken from the span and its root token.

    Args:
        doc: Object exposing ``doc.spans[spans_key]`` and ``doc.sents``.
            NOTE(review): the annotation also allows a dict, but the body
            reads ``doc.spans`` and ``doc.sents`` -- confirm.
        spans_key: Key of the span group to tabulate.
        attrs: Names of span attributes exported as the leading columns.

    Returns:
        A ``(data, columns)`` tuple, where ``data`` is a list of rows (lists)
        and ``columns`` is the matching list of column names.
    """
    columns = attrs + [
        "Conf. score",
        "sent no.",
        "grammatical realization",
        'span dep',
        "ner",
        "POS",
        'span dep seq',
        "POS sequence",
        "head",
        "children",
        "morphology",
    ]

    # Look-up from a sentence to its position in ``doc.sents``.
    sentence_index = {}
    for position, sentence in enumerate(doc.sents):
        sentence_index[sentence] = position

    data = []
    for span, score in zip(doc.spans[spans_key],
                           doc.spans[spans_key].attrs['scores']):
        row = [str(getattr(span, name)) for name in attrs]
        row.append(score)
        row.append(sentence_index[span.sent])
        row.append(construction_classifier(doc, span))

        root = span.root
        row += [
            root.dep_,
            root.ent_type_,
            root.tag_,
            "_".join([tok.dep_ for tok in span]),
            "_".join([tok.tag_ for tok in span]),
            root.head.norm_,
            "_".join([child.dep_ for child in root.children]),
            root.morph,  # stored as-is, not converted to str
        ]
        data.append(row)

    return data, columns
|
|
|
|
|
def ngrammar(seq: list, n=2):
    """Return every contiguous window of ``n`` items from ``seq``.

    Windows are produced left to right and each one is ``seq[i:i + n]``, so
    the element type follows the input (lists for a list, substrings for a
    string).

    Args:
        seq: Sequence to slide over; it must support ``len()`` and slicing.
        n: Window size; must be at least 1.

    Returns:
        A list with ``len(seq) - n + 1`` windows, or an empty list when
        ``seq`` holds fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1. Such values used to be
            accepted silently and produced empty or negative-index slices
            instead of n-grams.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    # A negative range bound yields no iterations, which covers short inputs.
    window_count = len(seq) - n + 1
    return [seq[start:start + n] for start in range(window_count)]
|
|