Szabó Gergő committed on
Commit
2ecc574
1 Parent(s): 19cfb7e

triples package

Browse files
Files changed (2) hide show
  1. examples/relation.py +12 -12
  2. resources/triples.py +125 -0
examples/relation.py CHANGED
@@ -1,12 +1,9 @@
1
  import gradio as gr
2
 
3
  import spacy
4
- import sys
5
  import pandas as pd
6
- from spacy import displacy
7
 
8
- sys.path.append('/home/gszabo/PycharmProjects/textacy/textacy/src/textacy/extract')
9
- import triples
10
 
11
  nlp = spacy.load("hu_core_news_lg")
12
 
@@ -41,14 +38,17 @@ def process(text: str) -> pd.DataFrame:
41
 
42
  return pd.DataFrame(relation_list, columns=['Subject', 'Verb', 'Object'])
43
 
44
- EXAMPLES = ["Vespucci 1450-es években született Firenzében, és 1497 és 1504 között legalább két felfedező úton vett részt.",
45
- "Einstein megmutatta, ha feltételezi, hogy a fény valóban csak diszkrét csomagokban terjed, akkor meg tudja magyarázni a fényelektromos jelenség furcsa tulajdonságait.",
46
- "Einstein megállapította, hogy hasonló energiaeloszlás lehet érvényes az atomokra is.",
47
- "Hawking úgy nyilatkozott, hogy a felfedezései az élete legizgalmasabb eseményei voltak."]
48
-
49
- # displacy checker
50
- # text = nlp(EXAMPLES[3])
51
- # displacy.serve(text, style="dep")
 
 
 
52
 
53
  demo = gr.Interface(
54
  fn=process,
 
1
  import gradio as gr
2
 
3
  import spacy
 
4
  import pandas as pd
 
5
 
6
+ from resources import triples
 
7
 
8
  nlp = spacy.load("hu_core_news_lg")
9
 
 
38
 
39
  return pd.DataFrame(relation_list, columns=['Subject', 'Verb', 'Object'])
40
 
41
+ EXAMPLES = ["Anna éppen most házat épít magának.",
42
+ "András főzni fog, ha haza ért.",
43
+ "Jéghideg narancslevet fogok kortyolni Mallorca homokos partján.",
44
+ "Júliska fagyit fog árulni.",
45
+ "Einstein megmutatta, hogy hogyan kell házat építeni.",
46
+ "Vespucci 1497 és 1504 között legalább két felfedező úton vett részt.",
47
+ "Einstein megállapította, hogy az atomokra hasonló energiaeloszlás lehet érvényes.",
48
+ "Hawking úgy nyilatkozott, hogy a felfedezései az élete legizgalmasabb eseményei voltak.",
49
+ "Einstein megmutatta, ha feltételezi, hogy a fény valóban csak diszkrét csomagokban terjed, akkor meg tudja magyarázni a fényelektromos jelenség furcsa tulajdonságait."]
50
+
51
+ # process(EXAMPLES[4])
52
 
53
  demo = gr.Interface(
54
  fn=process,
resources/triples.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Triples
3
+ -------
4
+
5
+ :mod:`textacy.extract.triples`: Extract structured triples from a document or sentence
6
+ through rule-based pattern-matching of the annotated tokens.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import collections
11
+ from operator import attrgetter
12
+ from typing import Iterable, List, Tuple
13
+
14
+ from spacy.symbols import (
15
+ AUX, VERB,
16
+ agent, attr, aux, auxpass, csubj, csubjpass, dobj, neg, nsubj, nsubjpass, obj, pobj, xcomp,
17
+ )
18
+ from spacy.tokens import Span, Token
19
+
20
+ from textacy import types
21
+
22
+
23
# Dependency-label sets used to classify tokens when matching SVO patterns.
# Values are spacy.symbols integer IDs, compared against ``Token.dep``.
_NOMINAL_SUBJ_DEPS = {nsubj, nsubjpass}
_CLAUSAL_SUBJ_DEPS = {csubj, csubjpass}
_ACTIVE_SUBJ_DEPS = {csubj, nsubj}
_VERB_MODIFIER_DEPS = {aux, auxpass, neg}

# An extracted subject-verb-object triple; each field holds a list of spaCy
# ``Token`` objects sorted by document position.
# NOTE: the previous ``SVOTriple: Tuple[List[Token], ...] = ...`` annotation
# was incorrect -- it annotated the namedtuple *class* itself as a tuple
# instance -- so the misleading annotation has been dropped.
SVOTriple = collections.namedtuple("SVOTriple", ["subject", "verb", "object"])
31
+
32
+
33
def subject_verb_object_triples(doclike: types.DocLike) -> Iterable[SVOTriple]:
    """
    Extract an ordered sequence of subject-verb-object triples from a document
    or sentence through rule-based matching of dependency annotations.

    Args:
        doclike: spaCy ``Doc`` or ``Span`` to extract triples from.

    Yields:
        Next SVO triple as (subject, verb, object), in approximate order of
        appearance; each element is a list of Tokens sorted by position.
    """
    if isinstance(doclike, Span):
        sents = [doclike]
    else:
        sents = doclike.sents

    for sent in sents:
        # connect subjects/objects to direct verb heads
        # and expand them to include conjuncts, compound nouns, ...
        verb_sos = collections.defaultdict(lambda: collections.defaultdict(set))
        for tok in sent:
            head = tok.head
            # ensure an entry for every verb, even if empty,
            # to catch conjunct verbs without direct subject/object deps
            if tok.pos == VERB:
                _ = verb_sos[tok]
            # nominal subject of active or passive verb
            if tok.dep in _NOMINAL_SUBJ_DEPS:
                if head.pos == VERB:
                    verb_sos[head]["subjects"].update(expand_noun(tok))
            # clausal subject of active or passive verb
            elif tok.dep in _CLAUSAL_SUBJ_DEPS:
                if head.pos == VERB:
                    verb_sos[head]["subjects"].update(tok.subtree)
            # nominal direct object of transitive verb
            elif tok.dep == obj:
                if head.pos == VERB:
                    verb_sos[head]["objects"].update(expand_noun(tok))
            # prepositional object acting as agent of passive verb
            elif tok.dep == pobj:
                if head.dep == agent and head.head.pos == VERB:
                    verb_sos[head.head]["objects"].update(expand_noun(tok))
            # open clausal complement, but not as a secondary predicate
            elif tok.dep == xcomp:
                if (
                    head.pos == VERB
                    and not any(child.dep == obj for child in head.children)
                ):
                    # TODO: just the verb, or the whole tree?
                    # verb_sos[verb]["objects"].update(expand_verb(tok))
                    verb_sos[head]["objects"].update(tok.subtree)
        # fill in any indirect relationships connected via verb conjuncts
        for verb, so_dict in verb_sos.items():
            conjuncts = verb.conjuncts
            if so_dict.get("subjects"):
                for conj in conjuncts:
                    conj_so_dict = verb_sos.get(conj)
                    if conj_so_dict and not conj_so_dict.get("subjects"):
                        conj_so_dict["subjects"].update(so_dict["subjects"])
            if not so_dict.get("objects"):
                # loop variable renamed from ``obj`` to ``obj_tok`` so it no
                # longer shadows the imported ``spacy.symbols.obj`` dep label
                # compared against above
                so_dict["objects"].update(
                    obj_tok
                    for conj in conjuncts
                    for obj_tok in verb_sos.get(conj, {}).get("objects", [])
                )
        # expand verbs and restructure into svo triples
        for verb, so_dict in verb_sos.items():
            if so_dict["subjects"] and so_dict["objects"]:
                yield SVOTriple(
                    subject=sorted(so_dict["subjects"], key=attrgetter("i")),
                    verb=sorted(expand_verb(verb), key=attrgetter("i")),
                    object=sorted(so_dict["objects"], key=attrgetter("i")),
                )
106
+
107
def expand_noun(tok: Token) -> List[Token]:
    """Expand a noun token to include all associated conjunct and compound nouns."""
    nouns = [tok]
    nouns.extend(tok.conjuncts)
    compounds = []
    for noun in nouns:
        for child in noun.children:
            # TODO: why doesn't compound import from spacy.symbols?
            if child.dep_ == "compound":
                compounds.append(child)
    return nouns + compounds
118
+
119
+
120
def expand_verb(tok: Token) -> List[Token]:
    """Expand a verb token to include all associated auxiliary and negation tokens."""
    expanded = [tok]
    expanded.extend(
        child for child in tok.children if child.dep in _VERB_MODIFIER_DEPS
    )
    return expanded