""" Triples ------- :mod:`textacy.extract.triples`: Extract structured triples from a document or sentence through rule-based pattern-matching of the annotated tokens. """ from __future__ import annotations import collections from operator import attrgetter from typing import Iterable, List, Tuple from spacy.symbols import ( AUX, VERB, agent, attr, aux, auxpass, csubj, csubjpass, dobj, neg, nsubj, nsubjpass, obj, pobj, xcomp, ) from spacy.tokens import Span, Token from textacy import types _NOMINAL_SUBJ_DEPS = {nsubj, nsubjpass} _CLAUSAL_SUBJ_DEPS = {csubj, csubjpass} _ACTIVE_SUBJ_DEPS = {csubj, nsubj} _VERB_MODIFIER_DEPS = {aux, auxpass, neg} SVOTriple: Tuple[List[Token], List[Token], List[Token]] = collections.namedtuple( "SVOTriple", ["subject", "verb", "object"] ) def subject_verb_object_triples(doclike: types.DocLike) -> Iterable[SVOTriple]: """ Extract an ordered sequence of subject-verb-object triples from a document or sentence. Args: doclike Yields: Next SVO triple as (subject, verb, object), in approximate order of appearance. """ if isinstance(doclike, Span): sents = [doclike] else: sents = doclike.sents for sent in sents: # connect subjects/objects to direct verb heads # and expand them to include conjuncts, compound nouns, ... verb_sos = collections.defaultdict(lambda: collections.defaultdict(set)) for tok in sent: head = tok.head # ensure entry for all verbs, even if empty # to catch conjugate verbs without direct subject/object deps if tok.pos == VERB: _ = verb_sos[tok] # nominal subject of active or passive verb if tok.dep in _NOMINAL_SUBJ_DEPS: if head.pos == VERB: verb_sos[head]["subjects"].update(expand_noun(tok)) # clausal subject of active or passive verb elif tok.dep in _CLAUSAL_SUBJ_DEPS: if head.pos == VERB: verb_sos[head]["subjects"].update(tok.subtree) # nominal direct object of transitive verb elif tok.dep == obj: if head.pos == VERB: verb_sos[head]["objects"].update(expand_noun(tok)) # prepositional object acting as agent of passive verb elif tok.dep == pobj: if head.dep == agent and head.head.pos == VERB: verb_sos[head.head]["objects"].update(expand_noun(tok)) # open clausal complement, but not as a secondary predicate elif tok.dep == xcomp: if ( head.pos == VERB and not any(child.dep == obj for child in head.children) ): # TODO: just the verb, or the whole tree? # verb_sos[verb]["objects"].update(expand_verb(tok)) verb_sos[head]["objects"].update(tok.subtree) # fill in any indirect relationships connected via verb conjuncts for verb, so_dict in verb_sos.items(): conjuncts = verb.conjuncts if so_dict.get("subjects"): for conj in conjuncts: conj_so_dict = verb_sos.get(conj) if conj_so_dict and not conj_so_dict.get("subjects"): conj_so_dict["subjects"].update(so_dict["subjects"]) if not so_dict.get("objects"): so_dict["objects"].update( obj for conj in conjuncts for obj in verb_sos.get(conj, {}).get("objects", []) ) # expand verbs and restructure into svo triples for verb, so_dict in verb_sos.items(): if so_dict["subjects"] and so_dict["objects"]: yield SVOTriple( subject=sorted(so_dict["subjects"], key=attrgetter("i")), verb=sorted(expand_verb(verb), key=attrgetter("i")), object=sorted(so_dict["objects"], key=attrgetter("i")), ) def expand_noun(tok: Token) -> List[Token]: """Expand a noun token to include all associated conjunct and compound nouns.""" tok_and_conjuncts = [tok] + list(tok.conjuncts) compounds = [ child for tc in tok_and_conjuncts for child in tc.children # TODO: why doesn't compound import from spacy.symbols? if child.dep_ == "compound" ] return tok_and_conjuncts + compounds def expand_verb(tok: Token) -> List[Token]: """Expand a verb token to include all associated auxiliary and negation tokens.""" verb_modifiers = [ child for child in tok.children if child.dep in _VERB_MODIFIER_DEPS ] return [tok] + verb_modifiers