engagement-analyzer-demo / pipeline /post_processors.py
egumasa's picture
updated model
7cf7080
raw
history blame
No virus
6.34 kB
from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable
import pandas as pd
import spacy
from spacy.language import Language
SPAN_ATTRS = ["text", "label_", "start", "end"]
def simple_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
spans_key: str = "sc",
attrs: List[str] = SPAN_ATTRS):
columns = attrs + ["Conf. score"]
data = [
[str(getattr(span, attr))
for attr in attrs] + [score] # [f'{score:.5f}']
for span, score in zip(doc.spans[spans_key], doc.spans[spans_key].attrs['scores'])
]
return data, columns
# def span_info_aggregator()
def construction_classifier(doc, span):
category = span.root.dep_
spanroot = span.root
##
span_t_dep_ = ["_".join([t.norm_, t.dep_]) for t in span]
span_dep = [t.dep_ for t in span]
span_token = [t.norm_ for t in span]
span_tag = [t.tag_ for t in span]
c_dep = [c.dep_ for c in spanroot.children]
c_pos = [c.pos_ for c in spanroot.children]
c_tag = [c.tag_ for c in spanroot.children]
## nesting classifiers
if spanroot.dep_ == "conj":
while spanroot.dep_ == 'conj':
spanroot = spanroot.head
if spanroot.dep_ == "poss":
while spanroot.dep_ == 'poss':
spanroot = spanroot.head
## Simple classifier
if spanroot.dep_ in ['pcomp']:
if str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"]:
category = "Gerund"
if spanroot.dep_ in ["pobj", "dobj", "obj", "iobj"]:
category = "Object"
if spanroot.dep_ in ["nsubj", "nsubjpass"]:
category = "Subject"
if spanroot.dep_ in ["cc"]:
category = "Coordinating conjunction"
if spanroot.dep_ in ["ROOT", "advcl"]:
if "ccomp" in c_dep and "auxpass" in c_dep and ("it_nsubjpass" in span_t_dep_ or "it_nsubj" in span_t_dep_):
category = "It is X that-clause"
elif "nsubj" in c_dep and "acomp" in c_dep and ("it_nsubjpass" in span_t_dep_ or "it_nsubj" in span_t_dep_):
category = "It is X that-clause"
elif "nsubj" in c_dep and "oprd" in c_dep and ("it_nsubjpass" in span_t_dep_ or "it_nsubj" in span_t_dep_):
category = "It is X that-clause"
elif "nsubj" in c_dep and "it" in span_token and spanroot.pos_ == "VERB":
category = "It VERB that-clause"
elif "expl" in c_dep and "NOUN" in c_pos:
category = "There is/are NOUN"
elif spanroot.pos_ in ["AUX", 'VERB']:
category = "Main verb"
else:
category = spanroot.dep_
if spanroot.dep_ in ['attr']:
c_head = [c.dep_ for c in spanroot.head.children]
if "expl" in c_head and "no_det" in span_t_dep_:
category = "There is/are no NOUN"
# Modal verbs
if spanroot.tag_ == "MD":
category = "Modal auxiliary"
# prep phrases
if spanroot.dep_ in ['prep']:
category = 'Prepositional Phrase'
# adverbial phrases
if spanroot.dep_ in ['advmod']:
category = "Adverbial modifier"
# adverbial phrases
if spanroot.dep_ in ['acomp']:
category = "Adjectival complement"
if spanroot.dep_ in ['neg']:
category = "Negative particle"
# Preconjunctions
if spanroot.dep_ in ['preconj']:
category = "Conjunction"
# Adverbial clauses
## Check the status of the adverbial clauses carefully
if spanroot.dep_ in ['advcl', 'mark', 'acl']:
if "mark" in span_dep:
category = "Finite adverbial clause"
if str(spanroot.morph) in ["Aspect=Prog|Tense=Pres|VerbForm=Part"] and "aux" not in c_dep:
category = "Non-finite adv clause"
# Check whether it has a subject or not
# elif "nsubj" in [c.dep_ for c in spanroot.children]:
# category = "Adverbial clauses"
# else:
# category = "Other advcl"
if spanroot.dep_ in ['relcl', 'ccomp']:
head = spanroot.head
if ";" in [t.norm_ for t in head.children]:
category = "Main verb"
elif "nsubj" not in span_dep:
category = "Dependent verb"
if spanroot.dep_ in ['dep']:
if spanroot.head.dep_ in ['ROOT', 'ccomp'] and spanroot.head.pos_ in ['AUX', 'VERB'] and spanroot.pos_ in ['AUX', 'VERB']:
if spanroot.morph == spanroot.head.morph:
category = "Main verb"
else:
category = "Dependent verb"
if span.label_ == "CITATION":
if "NNP" in span_tag or "NNPS" in span_tag:
if span_dep[0] == 'punct' and span_dep[-1] == 'punct':
category = "Parenthetical Citation"
elif span_tag[0] in ["NNP", "NNPS"]:
category = "Narrative Citation"
else:
category = "Other Citation"
return category
def const_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
spans_key: str = "sc",
attrs: List[str] = SPAN_ATTRS):
columns = attrs + ["Conf. score", "sent no.", "grammatical realization", 'span dep', "ner",
"POS", 'span dep seq', "POS sequence", "head", "children", "morphology", ]
data = []
# data = span_info_aggregator(doc, columns)
sentences = {s: i for i, s in enumerate(doc.sents)}
for span, score in zip(doc.spans[spans_key], doc.spans[spans_key].attrs['scores']):
span_info = []
span_info.extend([str(getattr(span, attr)) for attr in attrs])
span_info.append(score)
span_info.append(sentences[span.sent])
span_info.append(construction_classifier(doc, span))
span_info.append(span.root.dep_)
span_info.append(span.root.ent_type_)
span_info.append(span.root.tag_)
span_info.append("_".join([t.dep_ for t in span]))
span_info.append("_".join([t.tag_ for t in span]))
span_info.append(span.root.head.norm_)
span_info.append("_".join([c.dep_ for c in span.root.children]))
span_info.append(span.root.morph)
data.append(span_info)
return data, columns
def ngrammar(seq: list, n=2):
result = []
n_item = len(seq)
for idx, item in enumerate(seq):
if idx + n <= n_item:
result.append(seq[idx: idx + n])
return result