from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable
import pandas as pd
import spacy
from spacy.language import Language
# Span attribute names read (via getattr) for every table row when the
# caller does not pass its own ``attrs`` list.
SPAN_ATTRS = [
    "text",
    "label_",
    "start",
    "end",
]
def simple_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
                 spans_key: str = "sc",
                 attrs: List[str] = SPAN_ATTRS) -> Tuple[List[list], List[str]]:
    """Build a flat table with one row per span stored under ``spans_key``.

    Args:
        doc: Object exposing ``doc.spans[spans_key]``; that span group must
            carry a ``'scores'`` entry in its ``attrs`` mapping.
        spans_key: Key of the span group to tabulate.
        attrs: Names of span attributes to read for each row.

    Returns:
        ``(data, columns)``: ``data`` is a list of rows, each holding the
        stringified span attributes followed by the span's score;
        ``columns`` is ``attrs`` plus ``"Conf. score"``.
    """
    # Copy so the shared default list is never aliased and any sequence
    # (not only a list) is accepted.
    attrs = list(attrs)
    columns = [*attrs, "Conf. score"]

    span_group = doc.spans[spans_key]
    data = [
        # The score is kept as the raw value rather than a formatted string.
        [str(getattr(span, attr)) for attr in attrs] + [score]
        for span, score in zip(span_group, span_group.attrs['scores'])
    ]
    return data, columns
# def span_info_aggregator()
def construction_classifier(doc, span):
    """Assign a coarse grammatical-construction label to ``span``.

    The label starts as the dependency relation of the span's root and is
    then refined by a sequence of independent rules. The rules run in order
    and a later matching rule overrides an earlier one.

    Args:
        doc: Unused; kept so existing callers can keep passing it.
        span: Iterable of tokens exposing ``root`` and ``label_``. Tokens are
            read through ``norm_``, ``dep_``, ``tag_``, ``pos_``, ``morph``,
            ``head`` and ``children``.

    Returns:
        A human-readable category string, or the root's raw dependency label
        when no rule applies.
    """
    prog_participle = "Aspect=Prog|Tense=Pres|VerbForm=Part"

    spanroot = span.root
    # Fallback label: dependency relation of the span's own root.
    category = spanroot.dep_

    span_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in span]
    span_dep = [t.dep_ for t in span]
    span_token = [t.norm_ for t in span]
    span_tag = [t.tag_ for t in span]

    # Children are taken from the span's own root, i.e. before the root is
    # replaced by an ancestor below.
    c_dep = [c.dep_ for c in spanroot.children]
    c_pos = [c.pos_ for c in spanroot.children]

    has_it_subject = ("it_nsubjpass" in span_t_dep_
                      or "it_nsubj" in span_t_dep_)

    ## nesting classifiers
    # Conjuncts and possessors are classified by the element they attach to.
    while spanroot.dep_ == "conj":
        spanroot = spanroot.head
    while spanroot.dep_ == "poss":
        spanroot = spanroot.head

    ## Simple classifier
    if spanroot.dep_ == "pcomp":
        if str(spanroot.morph) == prog_participle:
            category = "Gerund"

    if spanroot.dep_ in ("pobj", "dobj", "obj", "iobj"):
        category = "Object"

    if spanroot.dep_ in ("nsubj", "nsubjpass"):
        category = "Subject"

    if spanroot.dep_ == "cc":
        category = "Coordinating conjunction"

    if spanroot.dep_ in ("ROOT", "advcl"):
        if "ccomp" in c_dep and "auxpass" in c_dep and has_it_subject:
            category = "It is X that-clause"
        elif "nsubj" in c_dep and "acomp" in c_dep and has_it_subject:
            category = "It is X that-clause"
        elif "nsubj" in c_dep and "oprd" in c_dep and has_it_subject:
            category = "It is X that-clause"
        elif "nsubj" in c_dep and "it" in span_token and spanroot.pos_ == "VERB":
            category = "It VERB that-clause"
        elif "expl" in c_dep and "NOUN" in c_pos:
            category = "There is/are NOUN"
        elif spanroot.pos_ in ("AUX", "VERB"):
            category = "Main verb"
        else:
            # Fall back to the raw dependency label only when no pattern
            # matched, so the labels assigned above are not overwritten.
            category = spanroot.dep_

    if spanroot.dep_ == "attr":
        c_head = [c.dep_ for c in spanroot.head.children]
        if "expl" in c_head and "no_det" in span_t_dep_:
            category = "There is/are no NOUN"

    # Modal verbs
    if spanroot.tag_ == "MD":
        category = "Modal auxiliary"

    # Prepositional phrases
    if spanroot.dep_ == "prep":
        category = 'Prepositional Phrase'

    # Adverbial modifiers
    if spanroot.dep_ == "advmod":
        category = "Adverbial modifier"

    # Adjectival complements
    if spanroot.dep_ == "acomp":
        category = "Adjectival complement"

    if spanroot.dep_ == "neg":
        category = "Negative particle"

    # Preconjunctions
    if spanroot.dep_ == "preconj":
        category = "Conjunction"

    # Adverbial clauses
    ## Check the status of the adverbial clauses carefully
    if spanroot.dep_ in ("advcl", "mark", "acl"):
        if "mark" in span_dep:
            category = "Finite adverbial clause"
        # Checked independently of the branch above, so a participial
        # clause without an auxiliary takes precedence over "Finite".
        if str(spanroot.morph) == prog_participle and "aux" not in c_dep:
            category = "Non-finite adv clause"

    if spanroot.dep_ in ("relcl", "ccomp"):
        head = spanroot.head
        if ";" in [t.norm_ for t in head.children]:
            category = "Main verb"
        elif "nsubj" not in span_dep:
            category = "Dependent verb"

    if spanroot.dep_ == "dep":
        head = spanroot.head
        if (head.dep_ in ("ROOT", "ccomp")
                and head.pos_ in ("AUX", "VERB")
                and spanroot.pos_ in ("AUX", "VERB")):
            if spanroot.morph == head.morph:
                category = "Main verb"
            else:
                category = "Dependent verb"

    # Citation spans get a citation-specific label when they contain a
    # proper noun; otherwise the label computed so far is kept.
    if span.label_ == "CITATION":
        if "NNP" in span_tag or "NNPS" in span_tag:
            if span_dep[0] == 'punct' and span_dep[-1] == 'punct':
                category = "Parenthetical Citation"
            elif span_tag[0] in ("NNP", "NNPS"):
                category = "Narrative Citation"
            else:
                category = "Other Citation"

    return category
def const_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
                spans_key: str = "sc",
                attrs: List[str] = SPAN_ATTRS) -> Tuple[List[list], List[str]]:
    """Build a per-span table with grammatical detail for ``spans_key``.

    Each row holds the stringified ``attrs`` of a span followed by one cell
    per extra column listed in ``columns``, in the same order.

    Args:
        doc: Object exposing ``doc.spans[spans_key]`` (whose ``attrs``
            mapping holds ``'scores'``) and ``doc.sents``.
        spans_key: Key of the span group to tabulate.
        attrs: Names of span attributes to read for each row.

    Returns:
        ``(data, columns)``: ``data`` is a list of rows and ``columns`` the
        matching header list.
    """
    attrs = list(attrs)
    columns = attrs + ["Conf. score", "sent no.", "grammatical realization", 'span dep', "ner",
                       "POS", 'span dep seq', "POS sequence", "head", "children", "morphology", ]

    # Index sentences by their start-token offset, so looking up a span's
    # sentence does not depend on how span objects hash or compare.
    sentence_no = {sent.start: i for i, sent in enumerate(doc.sents)}

    span_group = doc.spans[spans_key]
    data = []
    for span, score in zip(span_group, span_group.attrs['scores']):
        root = span.root

        span_info = [str(getattr(span, attr)) for attr in attrs]
        span_info.append(score)                               # "Conf. score"
        span_info.append(sentence_no[span.sent.start])        # "sent no."
        span_info.append(construction_classifier(doc, span))  # "grammatical realization"
        # NOTE(review): the cells for "span dep", "ner", "POS", "head" and
        # "morphology" are inferred from the column names -- confirm that
        # these are the intended token attributes.
        span_info.append(root.dep_)                           # "span dep"
        span_info.append(root.ent_type_)                      # "ner"
        span_info.append(root.tag_)                           # "POS"
        span_info.append("_".join([t.dep_ for t in span]))    # "span dep seq"
        span_info.append("_".join([t.tag_ for t in span]))    # "POS sequence"
        span_info.append(root.head.norm_)                     # "head"
        span_info.append("_".join([c.dep_ for c in root.children]))  # "children"
        span_info.append(str(root.morph))                     # "morphology"

        # Keep the finished row; without this the table is always empty.
        data.append(span_info)

    return data, columns
def ngrammar(seq: Sequence, n: int = 2) -> list:
    """Return every contiguous window of length ``n`` from ``seq``.

    Args:
        seq: Any sliceable sequence with a length (list, tuple, str, ...).
        n: Window size.

    Returns:
        A list of slices of ``seq`` in left-to-right order. The list is
        empty when ``seq`` is shorter than ``n`` or when ``n`` is not
        positive.
    """
    # A non-positive window size has no meaningful n-grams.
    if n < 1:
        return []
    last_start = len(seq) - n
    return [seq[start:start + n] for start in range(last_start + 1)]