# NOTE: stray non-Python header lines ("Spaces:" / "Sleeping") from the
# original paste, commented out so the module remains valid Python.
# Modified version of nltk framenet | |
# Natural Language Toolkit: Framenet Corpus Reader | |
# | |
# Copyright (C) 2001-2020 NLTK Project | |
# Authors: Chuck Wooters <wooters@icsi.berkeley.edu>, | |
# Nathan Schneider <nathan.schneider@georgetown.edu> | |
# URL: <http://nltk.org/> | |
# For license information, see LICENSE.TXT | |
""" | |
Corpus reader for the FrameNet 1.7 lexicon and corpus. | |
""" | |
import os | |
import re | |
import textwrap | |
import itertools | |
import sys | |
import types | |
from collections import defaultdict, OrderedDict | |
from operator import itemgetter | |
from itertools import zip_longest | |
from pprint import pprint | |
from nltk.corpus.reader import XMLCorpusReader, XMLCorpusView | |
from nltk.corpus.util import LazyCorpusLoader | |
from nltk.util import LazyConcatenation, LazyMap, LazyIteratorList | |
__docformat__ = "epytext en" | |
def mimic_wrap(lines, wrap_at=65, **kwargs):
    """
    Wrap the first of 'lines' with textwrap and the remaining lines at exactly the same
    positions as the first.

    Yields one list of wrapped segments per input line; segment boundaries for
    lines after the first are copied from the first line's wrapping.
    """
    first_wrapped = textwrap.fill(lines[0], wrap_at, drop_whitespace=False).split("\n")
    yield first_wrapped

    def _mimic(text):
        # Cut 'text' at the same offsets as all but the last wrapped segment
        # of the first line.
        seg = 0
        while text and seg < len(first_wrapped) - 1:
            width = len(first_wrapped[seg])
            yield text[:width]
            text = text[width:]
            seg += 1
        if text:
            # Leftover runs past the end of the mimicked line: wrap it normally.
            yield from textwrap.fill(text, wrap_at, drop_whitespace=False).split("\n")

    for other in lines[1:]:
        yield list(_mimic(other))
def _pretty_longstring(defstr, prefix="", wrap_at=65): | |
""" | |
Helper function for pretty-printing a long string. | |
:param defstr: The string to be printed. | |
:type defstr: str | |
:return: A nicely formated string representation of the long string. | |
:rtype: str | |
""" | |
outstr = "" | |
for line in textwrap.fill(defstr, wrap_at).split("\n"): | |
outstr += prefix + line + "\n" | |
return outstr | |
def _pretty_any(obj): | |
""" | |
Helper function for pretty-printing any AttrDict object. | |
:param obj: The obj to be printed. | |
:type obj: AttrDict | |
:return: A nicely formated string representation of the AttrDict object. | |
:rtype: str | |
""" | |
outstr = "" | |
for k in obj: | |
if isinstance(obj[k], str) and len(obj[k]) > 65: | |
outstr += "[{0}]\n".format(k) | |
outstr += "{0}".format(_pretty_longstring(obj[k], prefix=" ")) | |
outstr += "\n" | |
else: | |
outstr += "[{0}] {1}\n".format(k, obj[k]) | |
return outstr | |
def _pretty_semtype(st): | |
""" | |
Helper function for pretty-printing a semantic type. | |
:param st: The semantic type to be printed. | |
:type st: AttrDict | |
:return: A nicely formated string representation of the semantic type. | |
:rtype: str | |
""" | |
semkeys = st.keys() | |
if len(semkeys) == 1: | |
return "<None>" | |
outstr = "" | |
outstr += "semantic type ({0.ID}): {0.name}\n".format(st) | |
if "abbrev" in semkeys: | |
outstr += "[abbrev] {0}\n".format(st.abbrev) | |
if "definition" in semkeys: | |
outstr += "[definition]\n" | |
outstr += _pretty_longstring(st.definition, " ") | |
outstr += "[rootType] {0}({1})\n".format(st.rootType.name, st.rootType.ID) | |
if st.superType is None: | |
outstr += "[superType] <None>\n" | |
else: | |
outstr += "[superType] {0}({1})\n".format(st.superType.name, st.superType.ID) | |
outstr += "[subTypes] {0} subtypes\n".format(len(st.subTypes)) | |
outstr += ( | |
" " | |
+ ", ".join("{0}({1})".format(x.name, x.ID) for x in st.subTypes) | |
+ "\n" * (len(st.subTypes) > 0) | |
) | |
return outstr | |
def _pretty_frame_relation_type(freltyp): | |
""" | |
Helper function for pretty-printing a frame relation type. | |
:param freltyp: The frame relation type to be printed. | |
:type freltyp: AttrDict | |
:return: A nicely formated string representation of the frame relation type. | |
:rtype: str | |
""" | |
outstr = "<frame relation type ({0.ID}): {0.superFrameName} -- {0.name} -> {0.subFrameName}>".format( | |
freltyp | |
) | |
return outstr | |
def _pretty_frame_relation(frel): | |
""" | |
Helper function for pretty-printing a frame relation. | |
:param frel: The frame relation to be printed. | |
:type frel: AttrDict | |
:return: A nicely formated string representation of the frame relation. | |
:rtype: str | |
""" | |
outstr = "<{0.type.superFrameName}={0.superFrameName} -- {0.type.name} -> {0.type.subFrameName}={0.subFrameName}>".format( | |
frel | |
) | |
return outstr | |
def _pretty_fe_relation(ferel): | |
""" | |
Helper function for pretty-printing an FE relation. | |
:param ferel: The FE relation to be printed. | |
:type ferel: AttrDict | |
:return: A nicely formated string representation of the FE relation. | |
:rtype: str | |
""" | |
outstr = "<{0.type.superFrameName}={0.frameRelation.superFrameName}.{0.superFEName} -- {0.type.name} -> {0.type.subFrameName}={0.frameRelation.subFrameName}.{0.subFEName}>".format( | |
ferel | |
) | |
return outstr | |
def _pretty_lu(lu): | |
""" | |
Helper function for pretty-printing a lexical unit. | |
:param lu: The lu to be printed. | |
:type lu: AttrDict | |
:return: A nicely formated string representation of the lexical unit. | |
:rtype: str | |
""" | |
lukeys = lu.keys() | |
outstr = "" | |
outstr += "lexical unit ({0.ID}): {0.name}\n\n".format(lu) | |
if "definition" in lukeys: | |
outstr += "[definition]\n" | |
outstr += _pretty_longstring(lu.definition, " ") | |
if "frame" in lukeys: | |
outstr += "\n[frame] {0}({1})\n".format(lu.frame.name, lu.frame.ID) | |
if "incorporatedFE" in lukeys: | |
outstr += "\n[incorporatedFE] {0}\n".format(lu.incorporatedFE) | |
if "POS" in lukeys: | |
outstr += "\n[POS] {0}\n".format(lu.POS) | |
if "status" in lukeys: | |
outstr += "\n[status] {0}\n".format(lu.status) | |
if "totalAnnotated" in lukeys: | |
outstr += "\n[totalAnnotated] {0} annotated examples\n".format( | |
lu.totalAnnotated | |
) | |
if "lexemes" in lukeys: | |
outstr += "\n[lexemes] {0}\n".format( | |
" ".join("{0}/{1}".format(lex.name, lex.POS) for lex in lu.lexemes) | |
) | |
if "semTypes" in lukeys: | |
outstr += "\n[semTypes] {0} semantic types\n".format(len(lu.semTypes)) | |
outstr += ( | |
" " * (len(lu.semTypes) > 0) | |
+ ", ".join("{0}({1})".format(x.name, x.ID) for x in lu.semTypes) | |
+ "\n" * (len(lu.semTypes) > 0) | |
) | |
if "URL" in lukeys: | |
outstr += "\n[URL] {0}\n".format(lu.URL) | |
if "subCorpus" in lukeys: | |
subc = [x.name for x in lu.subCorpus] | |
outstr += "\n[subCorpus] {0} subcorpora\n".format(len(lu.subCorpus)) | |
for line in textwrap.fill(", ".join(sorted(subc)), 60).split("\n"): | |
outstr += " {0}\n".format(line) | |
if "exemplars" in lukeys: | |
outstr += "\n[exemplars] {0} sentences across all subcorpora\n".format( | |
len(lu.exemplars) | |
) | |
return outstr | |
def _pretty_exemplars(exemplars, lu): | |
""" | |
Helper function for pretty-printing a list of exemplar sentences for a lexical unit. | |
:param sent: The list of exemplar sentences to be printed. | |
:type sent: list(AttrDict) | |
:return: An index of the text of the exemplar sentences. | |
:rtype: str | |
""" | |
outstr = "" | |
outstr += "exemplar sentences for {0.name} in {0.frame.name}:\n\n".format(lu) | |
for i, sent in enumerate(exemplars): | |
outstr += "[{0}] {1}\n".format(i, sent.text) | |
outstr += "\n" | |
return outstr | |
def _pretty_fulltext_sentences(sents): | |
""" | |
Helper function for pretty-printing a list of annotated sentences for a full-text document. | |
:param sent: The list of sentences to be printed. | |
:type sent: list(AttrDict) | |
:return: An index of the text of the sentences. | |
:rtype: str | |
""" | |
outstr = "" | |
outstr += "full-text document ({0.ID}) {0.name}:\n\n".format(sents) | |
outstr += "[corpid] {0.corpid}\n[corpname] {0.corpname}\n[description] {0.description}\n[URL] {0.URL}\n\n".format( | |
sents | |
) | |
outstr += "[sentence]\n".format(sents) | |
for i, sent in enumerate(sents.sentence): | |
outstr += "[{0}] {1}\n".format(i, sent.text) | |
outstr += "\n" | |
return outstr | |
def _pretty_fulltext_sentence(sent): | |
""" | |
Helper function for pretty-printing an annotated sentence from a full-text document. | |
:param sent: The sentence to be printed. | |
:type sent: list(AttrDict) | |
:return: The text of the sentence with annotation set indices on frame targets. | |
:rtype: str | |
""" | |
outstr = "" | |
outstr += "full-text sentence ({0.ID}) in {1}:\n\n".format( | |
sent, sent.doc.get("name", sent.doc.description) | |
) | |
outstr += "\n[POS] {0} tags\n".format(len(sent.POS)) | |
outstr += "\n[POS_tagset] {0}\n\n".format(sent.POS_tagset) | |
outstr += "[text] + [annotationSet]\n\n" | |
outstr += sent._ascii() # -> _annotation_ascii() | |
outstr += "\n" | |
return outstr | |
def _pretty_pos(aset):
    """
    Helper function for pretty-printing a sentence with its POS tags.

    :param aset: The POS annotation set of the sentence to be printed.
    :type aset: AttrDict
    :return: The text of the sentence and its POS tags.
    :rtype: str
    """
    outstr = ""
    outstr += "POS annotation set ({0.ID}) {0.POS_tagset} in sentence {0.sent.ID}:\n\n".format(
        aset
    )

    # list the target spans and their associated aset index
    overt = sorted(aset.POS)

    sent = aset.sent
    s0 = sent.text  # row 1: the sentence itself (padded below where labels overflow)
    s1 = ""  # row 2: '-' underline marking each tagged span
    s2 = ""  # row 3: the POS labels, left-justified within each span
    i = 0  # character offset where the previous span ended
    adjust = 0  # total padding inserted into s0/s1 so far (shifts later spans)
    for j, k, lbl in overt:
        assert j >= i, ("Overlapping targets?", (j, k, lbl))
        s1 += " " * (j - i) + "-" * (k - j)
        if len(lbl) > (k - j):
            # add space in the sentence to make room for the annotation index
            amt = len(lbl) - (k - j)
            s0 = (
                s0[: k + adjust] + "~" * amt + s0[k + adjust :]
            )  # '~' to prevent line wrapping
            s1 = s1[: k + adjust] + " " * amt + s1[k + adjust :]
            adjust += amt
        s2 += " " * (j - i) + lbl.ljust(k - j)
        i = k

    long_lines = [s0, s1, s2]

    # Wrap all three rows at the same positions so they stay vertically
    # aligned, then restore the '~' padding to spaces.
    outstr += "\n\n".join(
        map("\n".join, zip_longest(*mimic_wrap(long_lines), fillvalue=" "))
    ).replace("~", " ")
    outstr += "\n"
    return outstr
def _pretty_annotation(sent, aset_level=False):
    """
    Helper function for pretty-printing an exemplar sentence for a lexical unit.

    :param sent: An annotation set or exemplar sentence to be printed.
    :param aset_level: If True, 'sent' is actually an annotation set within a sentence.
    :type sent: AttrDict
    :return: A nicely formated string representation of the exemplar sentence
        with its target, frame, and FE annotations.
    :rtype: str
    """
    sentkeys = sent.keys()
    outstr = "annotation set" if aset_level else "exemplar sentence"
    outstr += " ({0.ID}):\n".format(sent)
    if aset_level:  # TODO: any UNANN exemplars?
        outstr += "\n[status] {0}\n".format(sent.status)
    # Provenance fields, shown only when present on this record.
    for k in ("corpID", "docID", "paragNo", "sentNo", "aPos"):
        if k in sentkeys:
            outstr += "[{0}] {1}\n".format(k, sent[k])
    outstr += (
        "\n[LU] ({0.ID}) {0.name} in {0.frame.name}\n".format(sent.LU)
        if sent.LU
        else "\n[LU] Not found!"
    )
    outstr += "\n[frame] ({0.ID}) {0.name}\n".format(
        sent.frame
    )  # redundant with above, but .frame is convenient
    if not aset_level:
        # Sentence-level summaries (not meaningful for a single annotation set).
        outstr += "\n[annotationSet] {0} annotation sets\n".format(
            len(sent.annotationSet)
        )
        outstr += "\n[POS] {0} tags\n".format(len(sent.POS))
        outstr += "\n[POS_tagset] {0}\n".format(sent.POS_tagset)
    outstr += "\n[GF] {0} relation{1}\n".format(
        len(sent.GF), "s" if len(sent.GF) != 1 else ""
    )
    outstr += "\n[PT] {0} phrase{1}\n".format(
        len(sent.PT), "s" if len(sent.PT) != 1 else ""
    )
    # The following bare string literal is intentional: it documents the
    # special annotation layers for maintainers and is a no-op at runtime.
    """
    Special Layers
    --------------

    The 'NER' layer contains, for some of the data, named entity labels.

    The 'WSL' (word status layer) contains, for some of the data,
    spans which should not in principle be considered targets (NT).

    The 'Other' layer records relative clause constructions (Rel=relativizer, Ant=antecedent),
    pleonastic 'it' (Null), and existential 'there' (Exist).
    On occasion they are duplicated by accident (e.g., annotationSet 1467275 in lu6700.xml).

    The 'Sent' layer appears to contain labels that the annotator has flagged the
    sentence with for their convenience: values include
    'sense1', 'sense2', 'sense3', etc.;
    'Blend', 'Canonical', 'Idiom', 'Metaphor', 'Special-Sent',
    'keepS', 'deleteS', 'reexamine'
    (sometimes they are duplicated for no apparent reason).

    The POS-specific layers may contain the following kinds of spans:
    Asp (aspectual particle), Non-Asp (non-aspectual particle),
    Cop (copula), Supp (support), Ctrlr (controller),
    Gov (governor), X. Gov and X always cooccur.

    >>> from nltk.corpus import framenet as fn
    >>> def f(luRE, lyr, ignore=set()):
    ...   for i,ex in enumerate(fn.exemplars(luRE)):
    ...     if lyr in ex and ex[lyr] and set(zip(*ex[lyr])[2]) - ignore:
    ...       print(i,ex[lyr])

    - Verb: Asp, Non-Asp
    - Noun: Cop, Supp, Ctrlr, Gov, X
    - Adj: Cop, Supp, Ctrlr, Gov, X
    - Prep: Cop, Supp, Ctrlr
    - Adv: Ctrlr
    - Scon: (none)
    - Art: (none)
    """
    for lyr in ("NER", "WSL", "Other", "Sent"):
        if lyr in sent and sent[lyr]:
            outstr += "\n[{0}] {1} entr{2}\n".format(
                lyr, len(sent[lyr]), "ies" if len(sent[lyr]) != 1 else "y"
            )
    outstr += "\n[text] + [Target] + [FE]"
    # POS-specific layers: syntactically important words that are neither the target
    # nor the FEs. Include these along with the first FE layer but with '^' underlining.
    for lyr in ("Verb", "Noun", "Adj", "Adv", "Prep", "Scon", "Art"):
        if lyr in sent and sent[lyr]:
            outstr += " + [{0}]".format(lyr)
    if "FE2" in sentkeys:
        outstr += " + [FE2]"
        if "FE3" in sentkeys:
            outstr += " + [FE3]"
    outstr += "\n\n"
    outstr += sent._ascii()  # -> _annotation_ascii()
    outstr += "\n"

    return outstr
def _annotation_ascii(sent):
    """
    Given a sentence or FE annotation set, construct the width-limited string showing
    an ASCII visualization of the sentence's annotations, calling either
    _annotation_ascii_frames() or _annotation_ascii_FEs() as appropriate.
    This will be attached as a method to appropriate AttrDict instances
    and called in the full pretty-printing of the instance.
    """
    multiple_targets = "annotationSet" in sent and len(sent.annotationSet) > 2
    if sent._type == "fulltext_sentence" or multiple_targets:
        # Full-text sentence, or an LU sentence with several targets
        # (>2 annotation sets means >1 target, since the first set is POS).
        return _annotation_ascii_frames(sent)
    # Otherwise: an FE annotation set, or an LU sentence with one target.
    return _annotation_ascii_FEs(sent)
def _annotation_ascii_frames(sent):
    """
    ASCII string rendering of the sentence along with its targets and frame names.
    Called for all full-text sentences, as well as the few LU sentences with multiple
    targets (e.g., fn.lu(6412).exemplars[82] has two want.v targets).
    Line-wrapped to limit the display width.

    :param sent: A full-text sentence (or multi-target LU sentence) AttrDict.
    :return: Rows showing the sentence text, '*' target underlines, frame
        names, and annotation-set indices, wrapped in lockstep.
    :rtype: str
    """
    # list the target spans and their associated aset index
    overt = []
    for a, aset in enumerate(sent.annotationSet[1:]):
        for j, k in aset.Target:
            indexS = "[{0}]".format(a + 1)
            if aset.status == "UNANN" or aset.LU.status == "Problem":
                indexS += " "
                if aset.status == "UNANN":
                    indexS += (
                        "!"
                    )  # warning indicator that there is a frame annotation but no FE annotation
                if aset.LU.status == "Problem":
                    indexS += (
                        "?"
                    )  # warning indicator that there is a missing LU definition (because the LU has Problem status)
            overt.append((j, k, aset.LU.frame.name, indexS))
    overt = sorted(overt)

    duplicates = set()
    for o, (j, k, fname, asetIndex) in enumerate(overt):
        if o > 0 and j <= overt[o - 1][1]:
            # multiple annotation sets on the same target
            # (e.g. due to a coordination construction or multiple annotators)
            if (
                overt[o - 1][:2] == (j, k) and overt[o - 1][2] == fname
            ):  # same target, same frame
                # splice indices together
                combinedIndex = (
                    overt[o - 1][3] + asetIndex
                )  # e.g., '[1][2]', '[1]! [2]'
                # move warning markers after the spliced indices
                combinedIndex = combinedIndex.replace(" !", "! ").replace(" ?", "? ")
                overt[o - 1] = overt[o - 1][:3] + (combinedIndex,)
                duplicates.add(o)
            else:  # different frames, same or overlapping targets
                # Can't draw these inline: fall back to a one-target-per-line
                # listing of spans and frame names.
                s = sent.text
                for j, k, fname, asetIndex in overt:
                    s += "\n" + asetIndex + " " + sent.text[j:k] + " :: " + fname
                s += "\n(Unable to display sentence with targets marked inline due to overlap)"
                return s
    for o in reversed(sorted(duplicates)):
        del overt[o]

    s0 = sent.text  # row 1: sentence text (padded below where labels overflow)
    s1 = ""  # row 2: '*' underline marking each target span
    s11 = ""  # row 3: frame name (abbreviated to fit the span if needed)
    s2 = ""  # row 4: annotation-set index
    i = 0  # offset where the previous span ended
    adjust = 0  # total padding inserted into s0/s1/s11 so far
    fAbbrevs = OrderedDict()  # abbreviation -> full frame name legend
    for j, k, fname, asetIndex in overt:
        if not j >= i:
            # NOTE(review): guard + assert is redundant (equivalent to a bare
            # assert); kept as-is to preserve the original control flow.
            assert j >= i, (
                "Overlapping targets?"
                + (
                    " UNANN"
                    if any(aset.status == "UNANN" for aset in sent.annotationSet[1:])
                    else ""
                ),
                (j, k, asetIndex),
            )
        s1 += " " * (j - i) + "*" * (k - j)
        short = fname[: k - j]
        if (k - j) < len(fname):
            # Frame name longer than its span: find a unique abbreviation,
            # numbering clashes (the while/else registers only new entries).
            r = 0
            while short in fAbbrevs:
                if fAbbrevs[short] == fname:
                    break
                r += 1
                short = fname[: k - j - 1] + str(r)
            else:  # short not in fAbbrevs
                fAbbrevs[short] = fname
        s11 += " " * (j - i) + short.ljust(k - j)
        if len(asetIndex) > (k - j):
            # add space in the sentence to make room for the annotation index
            amt = len(asetIndex) - (k - j)
            s0 = (
                s0[: k + adjust] + "~" * amt + s0[k + adjust :]
            )  # '~' to prevent line wrapping
            s1 = s1[: k + adjust] + " " * amt + s1[k + adjust :]
            s11 = s11[: k + adjust] + " " * amt + s11[k + adjust :]
            adjust += amt
        s2 += " " * (j - i) + asetIndex.ljust(k - j)
        i = k

    long_lines = [s0, s1, s11, s2]

    # Wrap all rows at the same positions so columns stay aligned, then
    # restore the '~' padding to spaces.
    outstr = "\n\n".join(
        map("\n".join, zip_longest(*mimic_wrap(long_lines), fillvalue=" "))
    ).replace("~", " ")
    outstr += "\n"
    if fAbbrevs:
        outstr += " (" + ", ".join("=".join(pair) for pair in fAbbrevs.items()) + ")"
        assert len(fAbbrevs) == len(dict(fAbbrevs)), "Abbreviation clash"

    return outstr
def _annotation_ascii_FE_layer(overt, ni, feAbbrevs): | |
"""Helper for _annotation_ascii_FEs().""" | |
s1 = "" | |
s2 = "" | |
i = 0 | |
for j, k, fename in overt: | |
s1 += " " * (j - i) + ("^" if fename.islower() else "-") * (k - j) | |
short = fename[: k - j] | |
if len(fename) > len(short): | |
r = 0 | |
while short in feAbbrevs: | |
if feAbbrevs[short] == fename: | |
break | |
r += 1 | |
short = fename[: k - j - 1] + str(r) | |
else: # short not in feAbbrevs | |
feAbbrevs[short] = fename | |
s2 += " " * (j - i) + short.ljust(k - j) | |
i = k | |
sNI = "" | |
if ni: | |
sNI += " [" + ", ".join(":".join(x) for x in sorted(ni.items())) + "]" | |
return [s1, s2, sNI] | |
def _annotation_ascii_FEs(sent):
    """
    ASCII string rendering of the sentence along with a single target and its FEs.
    Secondary and tertiary FE layers are included if present.
    'sent' can be an FE annotation set or an LU sentence with a single target.
    Line-wrapped to limit the display width.

    :param sent: FE annotation set or single-target LU sentence (AttrDict).
    :return: Multi-line ASCII rendering with FE spans underlined and labeled.
    :rtype: str
    """
    feAbbrevs = OrderedDict()
    posspec = []  # POS-specific layer spans (e.g., Supp[ort], Cop[ula])
    posspec_separate = False
    for lyr in ("Verb", "Noun", "Adj", "Adv", "Prep", "Scon", "Art"):
        if lyr in sent and sent[lyr]:
            for a, b, lbl in sent[lyr]:
                if (
                    lbl == "X"
                ):  # skip this, which covers an entire phrase typically containing the target and all its FEs
                    # (but do display the Gov)
                    continue
                if any(1 for x, y, felbl in sent.FE[0] if x <= a < y or a <= x < b):
                    # overlap between one of the POS-specific layers and first FE layer
                    posspec_separate = (
                        True
                    )  # show POS-specific layers on a separate line
                posspec.append(
                    (a, b, lbl.lower().replace("-", ""))
                )  # lowercase Cop=>cop, Non-Asp=>nonasp, etc. to distinguish from FE names
    if posspec_separate:
        POSSPEC = _annotation_ascii_FE_layer(posspec, {}, feAbbrevs)
    FE1 = _annotation_ascii_FE_layer(
        sorted(sent.FE[0] + (posspec if not posspec_separate else [])),
        sent.FE[1],
        feAbbrevs,
    )
    FE2 = FE3 = None
    if "FE2" in sent:
        FE2 = _annotation_ascii_FE_layer(sent.FE2[0], sent.FE2[1], feAbbrevs)
        if "FE3" in sent:
            FE3 = _annotation_ascii_FE_layer(sent.FE3[0], sent.FE3[1], feAbbrevs)

    for i, j in sent.Target:
        # Mark the target span within the first FE layer's underline row:
        # ' ' becomes '*' (bare target) and '-' becomes '=' (target inside an FE).
        FE1span, FE1name, FE1exp = FE1
        if len(FE1span) < j:
            FE1span += " " * (j - len(FE1span))
        if len(FE1name) < j:
            FE1name += " " * (j - len(FE1name))
            FE1[1] = FE1name
        FE1[0] = (
            FE1span[:i] + FE1span[i:j].replace(" ", "*").replace("-", "=") + FE1span[j:]
        )
    long_lines = [sent.text]
    if posspec_separate:
        long_lines.extend(POSSPEC[:2])
    long_lines.extend([FE1[0], FE1[1] + FE1[2]])  # lines with no length limit
    if FE2:
        long_lines.extend([FE2[0], FE2[1] + FE2[2]])
        if FE3:
            long_lines.extend([FE3[0], FE3[1] + FE3[2]])
    long_lines.append("")
    # Wrap every row at the same positions so the columns stay aligned.
    outstr = "\n".join(
        map("\n".join, zip_longest(*mimic_wrap(long_lines), fillvalue=" "))
    )
    if feAbbrevs:
        outstr += "(" + ", ".join("=".join(pair) for pair in feAbbrevs.items()) + ")"
        assert len(feAbbrevs) == len(dict(feAbbrevs)), "Abbreviation clash"
    outstr += "\n"
    return outstr
def _pretty_fe(fe): | |
""" | |
Helper function for pretty-printing a frame element. | |
:param fe: The frame element to be printed. | |
:type fe: AttrDict | |
:return: A nicely formated string representation of the frame element. | |
:rtype: str | |
""" | |
fekeys = fe.keys() | |
outstr = "" | |
outstr += "frame element ({0.ID}): {0.name}\n of {1.name}({1.ID})\n".format( | |
fe, fe.frame | |
) | |
if "definition" in fekeys: | |
outstr += "[definition]\n" | |
outstr += _pretty_longstring(fe.definition, " ") | |
if "abbrev" in fekeys: | |
outstr += "[abbrev] {0}\n".format(fe.abbrev) | |
if "coreType" in fekeys: | |
outstr += "[coreType] {0}\n".format(fe.coreType) | |
if "requiresFE" in fekeys: | |
outstr += "[requiresFE] " | |
if fe.requiresFE is None: | |
outstr += "<None>\n" | |
else: | |
outstr += "{0}({1})\n".format(fe.requiresFE.name, fe.requiresFE.ID) | |
if "excludesFE" in fekeys: | |
outstr += "[excludesFE] " | |
if fe.excludesFE is None: | |
outstr += "<None>\n" | |
else: | |
outstr += "{0}({1})\n".format(fe.excludesFE.name, fe.excludesFE.ID) | |
if "semType" in fekeys: | |
outstr += "[semType] " | |
if fe.semType is None: | |
outstr += "<None>\n" | |
else: | |
outstr += "\n " + "{0}({1})".format(fe.semType.name, fe.semType.ID) + "\n" | |
return outstr | |
def _pretty_frame(frame):
    """
    Helper function for pretty-printing a frame.

    :param frame: The frame to be printed.
    :type frame: AttrDict
    :return: A nicely formated string representation of the frame.
    :rtype: str
    """
    parts = ["frame ({0.ID}): {0.name}\n\n".format(frame)]
    parts.append("[URL] {0}\n\n".format(frame.URL))
    parts.append("[definition]\n")
    parts.append(_pretty_longstring(frame.definition, "  ") + "\n")

    n_sem = len(frame.semTypes)
    parts.append("[semTypes] {0} semantic types\n".format(n_sem))
    parts.append(
        "  " * (n_sem > 0)
        + ", ".join("{0}({1})".format(x.name, x.ID) for x in frame.semTypes)
        + "\n" * (n_sem > 0)
    )

    parts.append(
        "\n[frameRelations] {0} frame relations\n".format(len(frame.frameRelations))
    )
    parts.append(
        "  " + "\n  ".join(repr(frel) for frel in frame.frameRelations) + "\n"
    )

    parts.append("\n[lexUnit] {0} lexical units\n".format(len(frame.lexUnit)))
    lu_descrs = [
        "{0} ({1})".format(name, lu.ID) for name, lu in sorted(frame.lexUnit.items())
    ]
    parts.append("{0}\n".format(_pretty_longstring(", ".join(lu_descrs), prefix="  ")))

    parts.append("\n[FE] {0} frame elements\n".format(len(frame.FE)))
    # Group FE descriptions by coreness category.
    by_coretype = {}
    for name, fe in sorted(frame.FE.items()):
        by_coretype.setdefault(fe.coreType, []).append(
            "{0} ({1})".format(name, fe.ID)
        )
    # Fixed display order for coreness categories.
    order = ["Core", "Core-Unexpressed", "Peripheral", "Extra-Thematic"]
    for ct in sorted(by_coretype, key=order.index):
        parts.append("{0:>16}: {1}\n".format(ct, ", ".join(sorted(by_coretype[ct]))))

    parts.append(
        "\n[FEcoreSets] {0} frame element core sets\n".format(len(frame.FEcoreSets))
    )
    parts.append(
        "  "
        + "\n  ".join(
            ", ".join(x.name for x in core_set) for core_set in frame.FEcoreSets
        )
        + "\n"
    )
    return "".join(parts)
class FramenetError(Exception):
    """Raised for FrameNet-specific failures (e.g. bad lookups or malformed data)."""
class AttrDict(dict):
    """A class that wraps a dict and allows accessing the keys of the
    dict as if they were attributes. Taken from here:
    http://stackoverflow.com/a/14620633/8879

    >>> foo = {'a':1, 'b':2, 'c':3}
    >>> bar = AttrDict(foo)
    >>> pprint(dict(bar))
    {'a': 1, 'b': 2, 'c': 3}
    >>> bar.b
    2
    >>> bar.d = 4
    >>> pprint(dict(bar))
    {'a': 1, 'b': 2, 'c': 3, 'd': 4}
    """

    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        # self.__dict__ = self

    def __setattr__(self, name, value):
        # Every attribute assignment becomes a dict entry.
        self[name] = value

    def __getattr__(self, name):
        # Called only when normal attribute lookup fails; fall back to the
        # dict entry.  NOTE(review): a missing key raises KeyError here, not
        # AttributeError, which breaks hasattr()/copy on these objects --
        # confirm before relying on either.
        if name == "_short_repr":
            return self._short_repr
        return self[name]

    def __getitem__(self, name):
        # Transparently resolve lazily-loaded values: a stored Future is
        # materialized on access.
        v = super(AttrDict, self).__getitem__(name)
        if isinstance(v, Future):
            return v._data()
        return v

    def _short_repr(self):
        # One-line summary used by the Pretty* container classes; relation
        # objects and untyped dicts fall back to the full pretty repr.
        if "_type" in self:
            if self["_type"].endswith("relation"):
                return self.__repr__()
            try:
                return "<{0} ID={1} name={2}>".format(
                    self["_type"], self["ID"], self["name"]
                )
            except KeyError:
                try:  # no ID--e.g., for _type=lusubcorpus
                    return "<{0} name={1}>".format(self["_type"], self["name"])
                except KeyError:  # no name--e.g., for _type=lusentence
                    return "<{0} ID={1}>".format(self["_type"], self["ID"])
        else:
            return self.__repr__()

    def _str(self):
        # Dispatch on the '_type' key to the matching _pretty_* helper.
        outstr = ""

        if "_type" not in self:
            outstr = _pretty_any(self)
        elif self["_type"] == "frame":
            outstr = _pretty_frame(self)
        elif self["_type"] == "fe":
            outstr = _pretty_fe(self)
        elif self["_type"] == "lu":
            outstr = _pretty_lu(self)
        elif self["_type"] == "luexemplars":  # list of ALL exemplars for LU
            outstr = _pretty_exemplars(self, self[0].LU)
        elif (
            self["_type"] == "fulltext_annotation"
        ):  # list of all sentences for full-text doc
            outstr = _pretty_fulltext_sentences(self)
        elif self["_type"] == "lusentence":
            outstr = _pretty_annotation(self)
        elif self["_type"] == "fulltext_sentence":
            outstr = _pretty_fulltext_sentence(self)
        elif self["_type"] in ("luannotationset", "fulltext_annotationset"):
            outstr = _pretty_annotation(self, aset_level=True)
        elif self["_type"] == "posannotationset":
            outstr = _pretty_pos(self)
        elif self["_type"] == "semtype":
            outstr = _pretty_semtype(self)
        elif self["_type"] == "framerelationtype":
            outstr = _pretty_frame_relation_type(self)
        elif self["_type"] == "framerelation":
            outstr = _pretty_frame_relation(self)
        elif self["_type"] == "ferelation":
            outstr = _pretty_fe_relation(self)
        else:
            outstr = _pretty_any(self)

        # ensure result is unicode string prior to applying the
        # decorator (because non-ASCII characters
        # could in principle occur in the data and would trigger an encoding error when
        # passed as arguments to str.format()).
        # assert isinstance(outstr, unicode) # not in Python 3.2

        return outstr

    def __str__(self):
        return self._str()

    def __repr__(self):
        return self.__str__()
class SpecialList(list):
    """
    A list subclass which adds a '_type' attribute for special printing
    (similar to an AttrDict, though this is NOT an AttrDict subclass).
    """

    def __init__(self, typ, *args, **kwargs):
        super(SpecialList, self).__init__(*args, **kwargs)
        # Tag used by _str() to choose a pretty-printer.
        self._type = typ

    def _str(self):
        assert self._type
        if not self:
            return "[]"
        if self._type == "luexemplars":  # list of ALL exemplars for LU
            return _pretty_exemplars(self, self[0].LU)
        assert False, self._type  # unknown tag: nothing knows how to print it

    def __str__(self):
        return self._str()

    def __repr__(self):
        return self.__str__()
class Future(object):
    """
    Wraps and acts as a proxy for a value to be loaded lazily (on demand).
    Adapted from https://gist.github.com/sergey-miryanov/2935416

    Indexing, attribute access, truth value, length, and str/repr are all
    forwarded to the lazily loaded value.
    """

    def __init__(self, loader, *args, **kwargs):
        """
        :param loader: when called with no arguments, returns the value to be stored
        :type loader: callable
        """
        super(Future, self).__init__(*args, **kwargs)
        self._loader = loader
        self._d = None

    def _data(self):
        # Load on first access; afterwards '_loader' is None and '_d' caches
        # the value.
        if callable(self._loader):
            self._d = self._loader()
            self._loader = None  # the data is now cached
        return self._d

    def __bool__(self):
        # Bug fix: the original defined only __nonzero__, the Python 2 truth
        # hook, which Python 3 never calls (truth testing fell back to
        # __len__, failing for values without a length).
        return bool(self._data())

    __nonzero__ = __bool__  # Python 2 compatibility alias

    def __len__(self):
        return len(self._data())

    def __setitem__(self, key, value):
        return self._data().__setitem__(key, value)

    def __getitem__(self, key):
        return self._data().__getitem__(key)

    def __getattr__(self, key):
        # Bug fix: delegate via getattr() rather than calling __getattr__
        # directly -- most objects (list, dict, ...) define only
        # __getattribute__, so the original raised AttributeError for them.
        return getattr(self._data(), key)

    def __str__(self):
        return self._data().__str__()

    def __repr__(self):
        return self._data().__repr__()
class PrettyDict(AttrDict):
    """
    Displays an abbreviated repr of values where possible.
    Inherits from AttrDict, so a callable value will
    be lazily converted to an actual value.
    """

    def __init__(self, *args, **kwargs):
        break_lines = kwargs.pop("breakLines", False)
        super(PrettyDict, self).__init__(*args, **kwargs)
        # Bypass AttrDict.__setattr__ so '_BREAK_LINES' becomes a real
        # attribute instead of a dict entry.
        dict.__setattr__(self, "_BREAK_LINES", break_lines)

    def __repr__(self):
        sep = ",\n " if self._BREAK_LINES else ", "
        rendered = []
        for key, val in sorted(self.items()):
            try:
                body = val._short_repr()  # abbreviated form when available
            except AttributeError:
                body = repr(val)
            rendered.append("{0}: {1}".format(repr(key), body))
        return "{" + sep.join(rendered) + "}"
class PrettyList(list):
    """
    Displays an abbreviated repr of only the first several elements, not the whole list.
    """

    # from nltk.util
    def __init__(self, *args, **kwargs):
        self._MAX_REPR_SIZE = kwargs.pop("maxReprSize", 60)
        self._BREAK_LINES = kwargs.pop("breakLines", False)
        super(PrettyList, self).__init__(*args, **kwargs)

    def __repr__(self):
        """
        Return a string representation for this corpus view that is
        similar to a list's representation; but if it would be more
        than 60 characters long, it is truncated.
        """
        sep = ",\n " if self._BREAK_LINES else ", "
        shown = []
        budget = 5
        for item in self:
            # key difference from the inherited version: call _short_repr()
            shown.append(item._short_repr())
            budget += len(shown[-1]) + 2
            if (
                self._MAX_REPR_SIZE
                and budget > self._MAX_REPR_SIZE
                and len(shown) > 2
            ):
                return "[%s, ...]" % sep.join(shown[:-1])
        return "[%s]" % sep.join(shown)
class PrettyLazyMap(LazyMap):
    """
    Displays an abbreviated repr of only the first several elements, not the whole list.
    """

    # from nltk.util
    _MAX_REPR_SIZE = 60

    def __repr__(self):
        """
        Return a string representation for this corpus view that is
        similar to a list's representation; but if it would be more
        than 60 characters long, it is truncated.
        """
        shown = []
        budget = 5
        for element in self:
            # key difference from the inherited version: call _short_repr()
            shown.append(element._short_repr())
            budget += len(shown[-1]) + 2
            if budget > self._MAX_REPR_SIZE and len(shown) > 2:
                return "[%s, ...]" % ", ".join(shown[:-1])
        return "[%s]" % ", ".join(shown)
class PrettyLazyIteratorList(LazyIteratorList):
    """
    A LazyIteratorList whose repr renders only enough leading elements to
    fill a fixed size budget, then abbreviates with an ellipsis.
    """

    # from nltk.util
    _MAX_REPR_SIZE = 60

    def __repr__(self):
        """
        Return a string representation for this corpus view that is
        similar to a list's representation; but if it would be more
        than 60 characters long, it is truncated.
        """
        shown = []
        width = 5
        for item in self:
            # key difference from the inherited version: call to _short_repr()
            shown.append(item._short_repr())
            width += len(shown[-1]) + 2
            if width > self._MAX_REPR_SIZE and len(shown) > 2:
                return "[%s, ...]" % ", ".join(shown[:-1])
        return "[%s]" % ", ".join(shown)
class PrettyLazyConcatenation(LazyConcatenation):
    """
    A LazyConcatenation whose repr renders only enough leading elements to
    fill a fixed size budget, then abbreviates with an ellipsis.
    """

    # from nltk.util
    _MAX_REPR_SIZE = 60

    def __repr__(self):
        """
        Return a string representation for this corpus view that is
        similar to a list's representation; but if it would be more
        than 60 characters long, it is truncated.
        """
        shown = []
        width = 5
        for item in self:
            # key difference from the inherited version: call to _short_repr()
            shown.append(item._short_repr())
            width += len(shown[-1]) + 2
            if width > self._MAX_REPR_SIZE and len(shown) > 2:
                return "[%s, ...]" % ", ".join(shown[:-1])
        return "[%s]" % ", ".join(shown)

    def __add__(self, other):
        """Return a list concatenating self with other."""
        return PrettyLazyIteratorList(itertools.chain(self, other))

    def __radd__(self, other):
        """Return a list concatenating other with self."""
        return PrettyLazyIteratorList(itertools.chain(other, self))
class FramenetCorpusReader(XMLCorpusReader): | |
"""A corpus reader for the Framenet Corpus. | |
>>> from nltk.corpus import framenet as fn | |
>>> fn.lu(3238).frame.lexUnit['glint.v'] is fn.lu(3238) | |
True | |
>>> fn.frame_by_name('Replacing') is fn.lus('replace.v')[0].frame | |
True | |
>>> fn.lus('prejudice.n')[0].frame.frameRelations == fn.frame_relations('Partiality') | |
True | |
""" | |
_bad_statuses = ["Problem"] | |
""" | |
When loading LUs for a frame, those whose status is in this list will be ignored. | |
Due to caching, if user code modifies this, it should do so before loading any data. | |
'Problem' should always be listed for FrameNet 1.5, as these LUs are not included | |
in the XML index. | |
""" | |
_warnings = False | |
def warnings(self, v):
    """Enable or disable warnings of data integrity issues as they are encountered.
    If v is truthy, warnings will be enabled.

    Warnings are emitted to stderr by the internal ``_warn()`` helper.

    (This is a function rather than just an attribute/property to ensure that if
    enabling warnings is the first action taken, the corpus reader is instantiated first.)

    :param v: truthy to enable warnings, falsy to disable
    """
    self._warnings = v
def __init__(self, root, fileids):
    """Initialize the reader over the FrameNet distribution at ``root``.

    :param root: path to the root of the FrameNet data
    :param fileids: fileids, passed through to XMLCorpusReader
    """
    XMLCorpusReader.__init__(self, root, fileids)
    # framenet corpus sub dirs
    # sub dir containing the xml files for frames
    self._frame_dir = "frame"
    # sub dir containing the xml files for lexical units
    self._lu_dir = "lu"
    # sub dir containing the xml files for fulltext annotation files
    self._fulltext_dir = "fulltext"
    # location of latest development version of FrameNet
    self._fnweb_url = "https://framenet2.icsi.berkeley.edu/fnReports/data"
    # Indexes used for faster look-ups.  All start as None and are built
    # lazily on first use (see the _build*index / _loadsemtypes methods).
    self._frame_idx = None
    self._cached_frames = {}  # name -> ID
    self._lu_idx = None
    self._fulltext_idx = None
    self._semtypes = None
    self._freltyp_idx = None  # frame relation types (Inheritance, Using, etc.)
    self._frel_idx = None  # frame-to-frame relation instances
    self._ferel_idx = None  # FE-to-FE relation instances
    self._frel_f_idx = None  # frame-to-frame relations associated with each frame
def help(self, attrname=None):
    """Display help information summarizing the main methods.

    :param attrname: if given, show Python ``help()`` for that attribute
        of the reader instead of printing the general summary.
    """
    if attrname is not None:
        return help(self.__getattribute__(attrname))
    # No need to mention frame_by_name() or frame_by_id(),
    # as it's easier to just call frame().
    # Also not mentioning lu_basic().
    msg = """
Citation: Nathan Schneider and Chuck Wooters (2017),
"The NLTK FrameNet API: Designing for Discoverability with a Rich Linguistic Resource".
Proceedings of EMNLP: System Demonstrations. https://arxiv.org/abs/1703.07438
Use the following methods to access data in FrameNet.
Provide a method name to `help()` for more information.
FRAMES
======
frame() to look up a frame by its exact name or ID
frames() to get frames matching a name pattern
frames_by_lemma() to get frames containing an LU matching a name pattern
frame_ids_and_names() to get a mapping from frame IDs to names
FRAME ELEMENTS
==============
fes() to get frame elements (a.k.a. roles) matching a name pattern, optionally constrained
by a frame name pattern
LEXICAL UNITS
=============
lu() to look up an LU by its ID
lus() to get lexical units matching a name pattern, optionally constrained by frame
lu_ids_and_names() to get a mapping from LU IDs to names
RELATIONS
=========
frame_relation_types() to get the different kinds of frame-to-frame relations
(Inheritance, Subframe, Using, etc.).
frame_relations() to get the relation instances, optionally constrained by
frame(s) or relation type
fe_relations() to get the frame element pairs belonging to a frame-to-frame relation
SEMANTIC TYPES
==============
semtypes() to get the different kinds of semantic types that can be applied to
FEs, LUs, and entire frames
semtype() to look up a particular semtype by name, ID, or abbreviation
semtype_inherits() to check whether two semantic types have a subtype-supertype
relationship in the semtype hierarchy
propagate_semtypes() to apply inference rules that distribute semtypes over relations
between FEs
ANNOTATIONS
===========
annotations() to get annotation sets, in which a token in a sentence is annotated
with a lexical unit in a frame, along with its frame elements and their syntactic properties;
can be constrained by LU name pattern and limited to lexicographic exemplars or full-text.
Sentences of full-text annotation can have multiple annotation sets.
sents() to get annotated sentences illustrating one or more lexical units
exemplars() to get sentences of lexicographic annotation, most of which have
just 1 annotation set; can be constrained by LU name pattern, frame, and overt FE(s)
doc() to look up a document of full-text annotation by its ID
docs() to get documents of full-text annotation that match a name pattern
docs_metadata() to get metadata about all full-text documents without loading them
ft_sents() to iterate over sentences of full-text annotation
UTILITIES
=========
buildindexes() loads metadata about all frames, LUs, etc. into memory to avoid
delay when one is accessed for the first time. It does not load annotations.
readme() gives the text of the FrameNet README file
warnings(True) to display corpus consistency warnings when loading data
"""
    print(msg)
def _buildframeindex(self):
    """Build the frame-ID -> frame-index-entry mapping from frameIndex.xml."""
    # The total number of Frames in Framenet is fairly small (~1200) so
    # this index should not be very large.
    if not self._frel_idx:
        # Always load frame relations before frames, otherwise weird
        # ordering effects might result in incomplete information.
        self._buildrelationindex()
    self._frame_idx = {}
    view = XMLCorpusView(
        self.abspath("frameIndex.xml"), "frameIndex/frame", self._handle_elt
    )
    self._frame_idx.update((finfo["ID"], finfo) for finfo in view)
def _buildcorpusindex(self):
    """Build the document-ID -> document-metadata mapping from fulltextIndex.xml."""
    # The total number of fulltext annotated documents in Framenet
    # is fairly small (~90) so this index should not be very large.
    self._fulltext_idx = {}
    view = XMLCorpusView(
        self.abspath("fulltextIndex.xml"),
        "fulltextIndex/corpus",
        self._handle_fulltextindex_elt,
    )
    for docs in view:
        for docinfo in docs:
            self._fulltext_idx[docinfo.ID] = docinfo
def _buildluindex(self):
    """Build the LU-ID -> LU-index-entry mapping from luIndex.xml."""
    # The number of LUs in Framenet is about 13,000 so this index
    # should not be very large.
    self._lu_idx = {}
    view = XMLCorpusView(
        self.abspath("luIndex.xml"), "luIndex/lu", self._handle_elt
    )
    for luinfo in view:
        # Populate with bare LU index entries; if any of these are
        # looked up they will be replaced by full LU objects.
        self._lu_idx[luinfo["ID"]] = luinfo
def _buildrelationindex(self):
    """Build indexes over frame-relation types, frame-to-frame relations,
    and FE-to-FE relations, parsed from frRelation.xml."""
    # print('building relation index...', file=sys.stderr)
    freltypes = PrettyList(
        x
        for x in XMLCorpusView(
            self.abspath("frRelation.xml"),
            "frameRelations/frameRelationType",
            self._handle_framerelationtype_elt,
        )
    )
    self._freltyp_idx = {}  # relation type ID -> relation type
    self._frel_idx = {}  # frame-to-frame relation ID -> relation
    self._frel_f_idx = defaultdict(set)  # frame ID -> IDs of relations it participates in
    self._ferel_idx = {}  # FE-to-FE relation ID -> relation
    for freltyp in freltypes:
        self._freltyp_idx[freltyp.ID] = freltyp
        for frel in freltyp.frameRelations:
            # The frames themselves are resolved lazily via Future.  The
            # inner lambda is wrapped in an immediately-applied outer
            # lambda so the frame ID is bound NOW, avoiding the
            # late-binding-closure pitfall inside this loop.
            supF = frel.superFrame = frel[freltyp.superFrameName] = Future(
                (lambda fID: lambda: self.frame_by_id(fID))(frel.supID)
            )
            subF = frel.subFrame = frel[freltyp.subFrameName] = Future(
                (lambda fID: lambda: self.frame_by_id(fID))(frel.subID)
            )
            self._frel_idx[frel.ID] = frel
            # Register the relation under both participating frames.
            self._frel_f_idx[frel.supID].add(frel.ID)
            self._frel_f_idx[frel.subID].add(frel.ID)
            for ferel in frel.feRelations:
                ferel.superFrame = supF
                ferel.subFrame = subF
                # FE lookups are likewise deferred until the frame is loaded.
                ferel.superFE = Future(
                    (lambda fer: lambda: fer.superFrame.FE[fer.superFEName])(ferel)
                )
                ferel.subFE = Future(
                    (lambda fer: lambda: fer.subFrame.FE[fer.subFEName])(ferel)
                )
                self._ferel_idx[ferel.ID] = ferel
    # print('...done building relation index', file=sys.stderr)
def _warn(self, *message, **kwargs):
    """Print a data-integrity warning (to stderr unless a file= kwarg is
    given), but only when warnings have been enabled via warnings()."""
    if not self._warnings:
        return
    kwargs.setdefault("file", sys.stderr)
    print(*message, **kwargs)
def readme(self):
    """
    Return the contents of the corpus README.txt (or README) file.
    """
    try:
        return self.open("README.txt").read()
    except IOError:
        # Fall back to the alternate filename; if this one is missing
        # too, the IOError propagates to the caller.
        pass
    return self.open("README").read()
def buildindexes(self):
    """
    Build the internal indexes to make look-ups faster.
    """
    self._buildframeindex()  # frames (loads the relation index first if needed)
    self._buildluindex()  # lexical units
    self._buildcorpusindex()  # fulltext annotation corpora index
    self._buildrelationindex()  # frame and FE relations
def doc(self, fn_docid):
    """
    Returns the annotated document whose id number is
    ``fn_docid``. This id number can be obtained by calling the
    Documents() function.
    The dict that is returned from this function will contain the
    following keys:
    - '_type' : 'fulltextannotation'
    - 'sentence' : a list of sentences in the document
    - Each item in the list is a dict containing the following keys:
    - 'ID' : the ID number of the sentence
    - '_type' : 'sentence'
    - 'text' : the text of the sentence
    - 'paragNo' : the paragraph number
    - 'sentNo' : the sentence number
    - 'docID' : the document ID number
    - 'corpID' : the corpus ID number
    - 'aPos' : the annotation position
    - 'annotationSet' : a list of annotation layers for the sentence
    - Each item in the list is a dict containing the following keys:
    - 'ID' : the ID number of the annotation set
    - '_type' : 'annotationset'
    - 'status' : either 'MANUAL' or 'UNANN'
    - 'luName' : (only if status is 'MANUAL')
    - 'luID' : (only if status is 'MANUAL')
    - 'frameID' : (only if status is 'MANUAL')
    - 'frameName': (only if status is 'MANUAL')
    - 'layer' : a list of labels for the layer
    - Each item in the layer is a dict containing the
    following keys:
    - '_type': 'layer'
    - 'rank'
    - 'name'
    - 'label' : a list of labels in the layer
    - Each item is a dict containing the following keys:
    - 'start'
    - 'end'
    - 'name'
    - 'feID' (optional)
    :param fn_docid: The Framenet id number of the document
    :type fn_docid: int
    :return: Information about the annotated document
    :rtype: dict
    """
    try:
        xmlfname = self._fulltext_idx[fn_docid].filename
    except TypeError:  # happens when self._fulltext_idx == None
        # build the index, then retry the lookup
        self._buildcorpusindex()
        xmlfname = self._fulltext_idx[fn_docid].filename
    except KeyError:  # probably means that fn_docid was not in the index
        raise FramenetError("Unknown document id: {0}".format(fn_docid))
    # construct the path name for the xml file containing the document info
    locpath = os.path.join("{0}".format(self._root), self._fulltext_dir, xmlfname)
    # Grab the top-level xml element containing the fulltext annotation
    elt = XMLCorpusView(locpath, "fullTextAnnotation")[0]
    info = self._handle_fulltextannotation_elt(elt)
    # add metadata from the index entry (e.g. the filename)
    for k, v in self._fulltext_idx[fn_docid].items():
        info[k] = v
    return info
def frame_by_id(self, fn_fid, ignorekeys=[]):
    """
    Get the details for the specified Frame using the frame's id
    number.
    Usage examples:
    >>> from nltk.corpus import framenet as fn
    >>> f = fn.frame_by_id(256)
    >>> f.ID
    256
    >>> f.name
    'Medical_specialties'
    >>> f.definition
    "This frame includes words that name ..."
    :param fn_fid: The Framenet id number of the frame
    :type fn_fid: int
    :param ignorekeys: The keys to ignore. These keys will not be
    included in the output. (optional)
    :type ignorekeys: list(str)
    :return: Information about a frame
    :rtype: dict
    Also see the ``frame()`` function for details about what is
    contained in the dict that is returned.
    """
    # get the name of the frame with this id number
    try:
        fentry = self._frame_idx[fn_fid]
        if "_type" in fentry:
            return fentry  # full frame object is cached
        name = fentry["name"]
    except TypeError:
        # index not built yet (self._frame_idx is None)
        self._buildframeindex()
        name = self._frame_idx[fn_fid]["name"]
    except KeyError:
        raise FramenetError("Unknown frame id: {0}".format(fn_fid))
    # check_cache=False: the index entry we found is not a full frame
    # object, so force a fresh load from the frame XML file.
    return self.frame_by_name(name, ignorekeys, check_cache=False)
def frame_by_name(self, fn_fname, ignorekeys=[], check_cache=True):
    """
    Get the details for the specified Frame using the frame's name.
    Usage examples:
    >>> from nltk.corpus import framenet as fn
    >>> f = fn.frame_by_name('Medical_specialties')
    >>> f.ID
    256
    >>> f.name
    'Medical_specialties'
    >>> f.definition
    "This frame includes words that name ..."
    :param fn_fname: The name of the frame
    :type fn_fname: str
    :param ignorekeys: The keys to ignore. These keys will not be
    included in the output. (optional)
    :type ignorekeys: list(str)
    :param check_cache: If True (default), return the cached full frame
    object when one exists; pass False to force a fresh parse of the
    frame's XML file.
    :return: Information about a frame
    :rtype: dict
    Also see the ``frame()`` function for details about what is
    contained in the dict that is returned.
    """
    if check_cache and fn_fname in self._cached_frames:
        return self._frame_idx[self._cached_frames[fn_fname]]
    elif not self._frame_idx:
        self._buildframeindex()
    # construct the path name for the xml file containing the Frame info
    locpath = os.path.join(
        "{0}".format(self._root), self._frame_dir, fn_fname + ".xml"
    )
    # print(locpath, file=sys.stderr)
    # Grab the xml for the frame
    try:
        elt = XMLCorpusView(locpath, "frame")[0]
    except IOError:
        raise FramenetError("Unknown frame: {0}".format(fn_fname))
    fentry = self._handle_frame_elt(elt, ignorekeys)
    assert fentry
    fentry.URL = self._fnweb_url + "/" + self._frame_dir + "/" + fn_fname + ".xml"
    # INFERENCE RULE: propagate lexical semtypes from the frame to all its LUs
    for st in fentry.semTypes:
        if st.rootType.name == "Lexical_type":
            for lu in fentry.lexUnit.values():
                if not any(
                    x is st for x in lu.semTypes
                ):  # identity containment check
                    lu.semTypes.append(st)
    # Cache the fully-loaded frame under both its ID and its name.
    self._frame_idx[fentry.ID] = fentry
    self._cached_frames[fentry.name] = fentry.ID
    """
    # now set up callables to resolve the LU pointers lazily.
    # (could also do this here--caching avoids infinite recursion.)
    for luName,luinfo in fentry.lexUnit.items():
        fentry.lexUnit[luName] = (lambda luID: Future(lambda: self.lu(luID)))(luinfo.ID)
    """
    return fentry
def frame(self, fn_fid_or_fname, ignorekeys=[]):
    """
    Get the details for the specified Frame using the frame's name
    or id number.
    Usage examples:
    >>> from nltk.corpus import framenet as fn
    >>> f = fn.frame(256)
    >>> f.name
    'Medical_specialties'
    >>> f = fn.frame('Medical_specialties')
    >>> f.ID
    256
    >>> # ensure non-ASCII character in definition doesn't trigger an encoding error:
    >>> fn.frame('Imposing_obligation')
    frame (1494): Imposing_obligation...
    The dict that is returned from this function will contain the
    following information about the Frame:
    - 'name' : the name of the Frame (e.g. 'Birth', 'Apply_heat', etc.)
    - 'definition' : textual definition of the Frame
    - 'ID' : the internal ID number of the Frame
    - 'semTypes' : a list of semantic types for this frame
    - Each item in the list is a dict containing the following keys:
    - 'name' : can be used with the semtype() function
    - 'ID' : can be used with the semtype() function
    - 'lexUnit' : a dict containing all of the LUs for this frame.
    The keys in this dict are the names of the LUs and
    the value for each key is itself a dict containing
    info about the LU (see the lu() function for more info.)
    - 'FE' : a dict containing the Frame Elements that are part of this frame
    The keys in this dict are the names of the FEs (e.g. 'Body_system')
    and the values are dicts containing the following keys
    - 'definition' : The definition of the FE
    - 'name' : The name of the FE e.g. 'Body_system'
    - 'ID' : The id number
    - '_type' : 'fe'
    - 'abbrev' : Abbreviation e.g. 'bod'
    - 'coreType' : one of "Core", "Peripheral", or "Extra-Thematic"
    - 'semType' : if not None, a dict with the following two keys:
    - 'name' : name of the semantic type. can be used with
    the semtype() function
    - 'ID' : id number of the semantic type. can be used with
    the semtype() function
    - 'requiresFE' : if not None, a dict with the following two keys:
    - 'name' : the name of another FE in this frame
    - 'ID' : the id of the other FE in this frame
    - 'excludesFE' : if not None, a dict with the following two keys:
    - 'name' : the name of another FE in this frame
    - 'ID' : the id of the other FE in this frame
    - 'frameRelation' : a list of objects describing frame relations
    - 'FEcoreSets' : a list of Frame Element core sets for this frame
    - Each item in the list is a list of FE objects
    :param fn_fid_or_fname: The Framenet name or id number of the frame
    :type fn_fid_or_fname: int or str
    :param ignorekeys: The keys to ignore. These keys will not be
    included in the output. (optional)
    :type ignorekeys: list(str)
    :return: Information about a frame
    :rtype: dict
    """
    # Dispatch to the name- or id-based lookup depending on the key type.
    lookup = (
        self.frame_by_name
        if isinstance(fn_fid_or_fname, str)
        else self.frame_by_id
    )
    return lookup(fn_fid_or_fname, ignorekeys)
def frames_by_lemma(self, pat):
    """
    Returns a list of all frames that contain LUs in which the
    ``name`` attribute of the LU matchs the given regular expression
    ``pat``. Note that LU names are composed of "lemma.POS", where
    the "lemma" part can be made up of either a single lexeme
    (e.g. 'run') or multiple lexemes (e.g. 'a little').
    Note: if you are going to be doing a lot of this type of
    searching, you'd want to build an index that maps from lemmas to
    frames because each time frames_by_lemma() is called, it has to
    search through ALL of the frame XML files in the db.
    >>> from nltk.corpus import framenet as fn
    >>> from nltk.corpus.reader.framenet import PrettyList
    >>> PrettyList(sorted(fn.frames_by_lemma(r'(?i)a little'), key=itemgetter('ID'))) # doctest: +ELLIPSIS
    [<frame ID=189 name=Quanti...>, <frame ID=2001 name=Degree>]
    :return: A list of frame objects.
    :rtype: list(AttrDict)
    """
    def has_matching_lu(fobj):
        # LU names (keys of lexUnit) are "lemma.POS" strings.
        return any(re.search(pat, luname) for luname in fobj.lexUnit)

    return PrettyList(filter(has_matching_lu, self.frames()))
def lu_basic(self, fn_luid):
    """
    Returns basic information about the LU whose id is
    ``fn_luid``. This is basically just a wrapper around the
    ``lu()`` function with "subCorpus" and "exemplars" info excluded.
    >>> from nltk.corpus import framenet as fn
    >>> lu = PrettyDict(fn.lu_basic(256), breakLines=True)
    >>> # ellipses account for differences between FN 1.5 and 1.7
    >>> lu # doctest: +ELLIPSIS
    {'ID': 256,
    'POS': 'V',
    'URL': 'https://framenet2.icsi.berkeley.edu/fnReports/data/lu/lu256.xml',
    '_type': 'lu',
    'cBy': ...,
    'cDate': '02/08/2001 01:27:50 PST Thu',
    'definition': 'COD: be aware of beforehand; predict.',
    'definitionMarkup': 'COD: be aware of beforehand; predict.',
    'frame': <frame ID=26 name=Expectation>,
    'lemmaID': 15082,
    'lexemes': [{'POS': 'V', 'breakBefore': 'false', 'headword': 'false', 'name': 'foresee', 'order': 1}],
    'name': 'foresee.v',
    'semTypes': [],
    'sentenceCount': {'annotated': ..., 'total': ...},
    'status': 'FN1_Sent'}
    :param fn_luid: The id number of the desired LU
    :type fn_luid: int
    :return: Basic information about the lexical unit
    :rtype: dict
    """
    # Delegate to lu(), dropping the (potentially large) annotation keys.
    return self.lu(fn_luid, ignorekeys=["subCorpus", "exemplars"])
def lu(self, fn_luid, ignorekeys=[], luName=None, frameID=None, frameName=None):
    """
    Access a lexical unit by its ID. luName, frameID, and frameName are used
    only in the event that the LU does not have a file in the database
    (which is the case for LUs with "Problem" status); in this case,
    a placeholder LU is created which just contains its name, ID, and frame.
    Usage examples:
    >>> from nltk.corpus import framenet as fn
    >>> fn.lu(256).name
    'foresee.v'
    >>> fn.lu(256).definition
    'COD: be aware of beforehand; predict.'
    >>> fn.lu(256).frame.name
    'Expectation'
    >>> pprint(list(map(PrettyDict, fn.lu(256).lexemes)))
    [{'POS': 'V', 'breakBefore': 'false', 'headword': 'false', 'name': 'foresee', 'order': 1}]
    >>> fn.lu(227).exemplars[23]
    exemplar sentence (352962):
    [sentNo] 0
    [aPos] 59699508
    <BLANKLINE>
    [LU] (227) guess.v in Coming_to_believe
    <BLANKLINE>
    [frame] (23) Coming_to_believe
    <BLANKLINE>
    [annotationSet] 2 annotation sets
    <BLANKLINE>
    [POS] 18 tags
    <BLANKLINE>
    [POS_tagset] BNC
    <BLANKLINE>
    [GF] 3 relations
    <BLANKLINE>
    [PT] 3 phrases
    <BLANKLINE>
    [Other] 1 entry
    <BLANKLINE>
    [text] + [Target] + [FE]
    <BLANKLINE>
    When he was inside the house , Culley noticed the characteristic
    ------------------
    Content
    <BLANKLINE>
    he would n't have guessed at .
    -- ******* --
    Co C1 [Evidence:INI]
    (Co=Cognizer, C1=Content)
    <BLANKLINE>
    <BLANKLINE>
    The dict that is returned from this function will contain most of the
    following information about the LU. Note that some LUs do not contain
    all of these pieces of information - particularly 'totalAnnotated' and
    'incorporatedFE' may be missing in some LUs:
    - 'name' : the name of the LU (e.g. 'merger.n')
    - 'definition' : textual definition of the LU
    - 'ID' : the internal ID number of the LU
    - '_type' : 'lu'
    - 'status' : e.g. 'Created'
    - 'frame' : Frame that this LU belongs to
    - 'POS' : the part of speech of this LU (e.g. 'N')
    - 'totalAnnotated' : total number of examples annotated with this LU
    - 'incorporatedFE' : FE that incorporates this LU (e.g. 'Ailment')
    - 'sentenceCount' : a dict with the following two keys:
    - 'annotated': number of sentences annotated with this LU
    - 'total' : total number of sentences with this LU
    - 'lexemes' : a list of dicts describing the lemma of this LU.
    Each dict in the list contains these keys:
    - 'POS' : part of speech e.g. 'N'
    - 'name' : either single-lexeme e.g. 'merger' or
    multi-lexeme e.g. 'a little'
    - 'order': the order of the lexeme in the lemma (starting from 1)
    - 'headword': a boolean ('true' or 'false')
    - 'breakBefore': Can this lexeme be separated from the previous lexeme?
    Consider: "take over.v" as in:
    Germany took over the Netherlands in 2 days.
    Germany took the Netherlands over in 2 days.
    In this case, 'breakBefore' would be "true" for the lexeme
    "over". Contrast this with "take after.v" as in:
    Mary takes after her grandmother.
    *Mary takes her grandmother after.
    In this case, 'breakBefore' would be "false" for the lexeme "after"
    - 'lemmaID' : Can be used to connect lemmas in different LUs
    - 'semTypes' : a list of semantic type objects for this LU
    - 'subCorpus' : a list of subcorpora
    - Each item in the list is a dict containing the following keys:
    - 'name' :
    - 'sentence' : a list of sentences in the subcorpus
    - each item in the list is a dict with the following keys:
    - 'ID':
    - 'sentNo':
    - 'text': the text of the sentence
    - 'aPos':
    - 'annotationSet': a list of annotation sets
    - each item in the list is a dict with the following keys:
    - 'ID':
    - 'status':
    - 'layer': a list of layers
    - each layer is a dict containing the following keys:
    - 'name': layer name (e.g. 'BNC')
    - 'rank':
    - 'label': a list of labels for the layer
    - each label is a dict containing the following keys:
    - 'start': start pos of label in sentence 'text' (0-based)
    - 'end': end pos of label in sentence 'text' (0-based)
    - 'name': name of label (e.g. 'NN1')
    Under the hood, this implementation looks up the lexical unit information
    in the *frame* definition file. That file does not contain
    corpus annotations, so the LU files will be accessed on demand if those are
    needed. In principle, valence patterns could be loaded here too,
    though these are not currently supported.
    :param fn_luid: The id number of the lexical unit
    :type fn_luid: int
    :param ignorekeys: The keys to ignore. These keys will not be
    included in the output. (optional)
    :type ignorekeys: list(str)
    :return: All information about the lexical unit
    :rtype: dict
    """
    # look for this LU in cache
    if not self._lu_idx:
        self._buildluindex()
    # Unique sentinel: distinguishes "absent from index" from any real value.
    OOV = object()
    luinfo = self._lu_idx.get(fn_luid, OOV)
    if luinfo is OOV:
        # LU not in the index. We create a placeholder by falling back to
        # luName, frameID, and frameName. However, this will not be listed
        # among the LUs for its frame.
        self._warn(
            "LU ID not found: {0} ({1}) in {2} ({3})".format(
                luName, fn_luid, frameName, frameID
            )
        )
        luinfo = AttrDict(
            {
                "_type": "lu",
                "ID": fn_luid,
                "name": luName,
                "frameID": frameID,
                "status": "Problem",
            }
        )
        f = self.frame_by_id(luinfo.frameID)
        assert f.name == frameName, (f.name, frameName)
        luinfo["frame"] = f
        # Cache the placeholder so repeated lookups don't re-warn.
        self._lu_idx[fn_luid] = luinfo
    elif "_type" not in luinfo:
        # we only have an index entry for the LU. loading the frame will replace this.
        f = self.frame_by_id(luinfo.frameID)
        luinfo = self._lu_idx[fn_luid]
    if ignorekeys:
        # Return a filtered copy rather than mutating the cached entry.
        return AttrDict(
            dict((k, v) for k, v in luinfo.items() if k not in ignorekeys)
        )
    return luinfo
def _lu_file(self, lu, ignorekeys=[]):
    """
    Augment the LU information that was loaded from the frame file
    with additional information from the LU file.
    """
    luid = lu.ID
    xmlfname = "lu{0}.xml".format(luid)
    locpath = os.path.join("{0}".format(self._root), self._lu_dir, xmlfname)
    # print(locpath, file=sys.stderr)
    if not self._lu_idx:
        self._buildluindex()
    try:
        elt = XMLCorpusView(locpath, "lexUnit")[0]
    except IOError:
        raise FramenetError("Unknown LU id: {0}".format(luid))
    fullinfo = self._handle_lexunit_elt(elt, ignorekeys)
    lu.URL = self._fnweb_url + "/" + self._lu_dir + "/" + xmlfname
    lu.subCorpus = fullinfo.subCorpus
    # Flatten all subcorpus sentences into a single exemplar list.
    sentences = [sent for subc in lu.subCorpus for sent in subc.sentence]
    lu.exemplars = SpecialList("luexemplars", sentences)
    # Back-link each exemplar sentence and its annotation sets to the
    # LU and its frame.
    for sent in lu.exemplars:
        sent["LU"] = lu
        sent["frame"] = lu.frame
        for aset in sent.annotationSet:
            aset["LU"] = lu
            aset["frame"] = lu.frame
    return lu
def _loadsemtypes(self):
    """Create the semantic types index."""
    self._semtypes = AttrDict()
    semtypeXML = [
        x
        for x in XMLCorpusView(
            self.abspath("semTypes.xml"),
            "semTypes/semType",
            self._handle_semtype_elt,
        )
    ]
    for st in semtypeXML:
        n = st["name"]
        a = st["abbrev"]
        i = st["ID"]
        # Both name and abbrev should be able to retrieve the
        # ID. The ID will retrieve the semantic type dict itself.
        self._semtypes[n] = i
        self._semtypes[a] = i
        self._semtypes[i] = st
    # now that all individual semtype XML is loaded, we can link them together
    roots = []
    for st in self.semtypes():
        if st.superType:
            # Replace the bare supID reference with the actual parent
            # record, and register this semtype as one of its children.
            st.superType = self.semtype(st.superType.supID)
            st.superType.subTypes.append(st)
        else:
            if st not in roots:
                roots.append(st)
            st.rootType = st
    # Breadth-first walk down from the roots so every semtype ends up
    # knowing which root of the hierarchy it belongs to.
    queue = list(roots)
    assert queue
    while queue:
        st = queue.pop(0)
        for child in st.subTypes:
            child.rootType = st.rootType
            queue.append(child)
    # self.propagate_semtypes() # apply inferencing over FE relations
def propagate_semtypes(self):
    """
    Apply inference rules to distribute semtypes over relations between FEs.
    For FrameNet 1.5, this results in 1011 semtypes being propagated.
    (Not done by default because it requires loading all frame files,
    which takes several seconds. If this needed to be fast, it could be rewritten
    to traverse the neighboring relations on demand for each FE semtype.)
    >>> from nltk.corpus import framenet as fn
    >>> x = sum(1 for f in fn.frames() for fe in f.FE.values() if fe.semType)
    >>> fn.propagate_semtypes()
    >>> y = sum(1 for f in fn.frames() for fe in f.FE.values() if fe.semType)
    >>> y-x > 1000
    True
    """
    if not self._semtypes:
        self._loadsemtypes()
    if not self._ferel_idx:
        self._buildrelationindex()
    # Fixed-point iteration: keep making passes over all FE relations
    # until a full pass produces no new propagations.
    changed = True
    i = 0
    nPropagations = 0
    while changed:
        # make a pass and see if anything needs to be propagated
        i += 1
        changed = False
        for ferel in self.fe_relations():
            superST = ferel.superFE.semType
            subST = ferel.subFE.semType
            try:
                if superST and superST is not subST:
                    # propagate downward
                    # The assert encodes a data-consistency expectation;
                    # violations (bugs in the data) are caught and
                    # skipped in the except clause below.
                    assert subST is None or self.semtype_inherits(subST, superST), (
                        superST.name,
                        ferel,
                        subST.name,
                    )
                    if subST is None:
                        ferel.subFE.semType = subST = superST
                        changed = True
                        nPropagations += 1
                if (
                    ferel.type.name in ["Perspective_on", "Subframe", "Precedes"]
                    and subST
                    and subST is not superST
                ):
                    # propagate upward
                    assert superST is None, (superST.name, ferel, subST.name)
                    ferel.superFE.semType = superST = subST
                    changed = True
                    nPropagations += 1
            except AssertionError as ex:
                # bug in the data! ignore
                # print(ex, file=sys.stderr)
                continue
    # print(i, nPropagations, file=sys.stderr)
def semtype(self, key): | |
""" | |
>>> from nltk.corpus import framenet as fn | |
>>> fn.semtype(233).name | |
'Temperature' | |
>>> fn.semtype(233).abbrev | |
'Temp' | |
>>> fn.semtype('Temperature').ID | |
233 | |
:param key: The name, abbreviation, or id number of the semantic type | |
:type key: string or int | |
:return: Information about a semantic type | |
:rtype: dict | |
""" | |
if isinstance(key, int): | |
stid = key | |
else: | |
try: | |
stid = self._semtypes[key] | |
except TypeError: | |
self._loadsemtypes() | |
stid = self._semtypes[key] | |
try: | |
st = self._semtypes[stid] | |
except TypeError: | |
self._loadsemtypes() | |
st = self._semtypes[stid] | |
return st | |
def semtype_inherits(self, st, superST): | |
if not isinstance(st, dict): | |
st = self.semtype(st) | |
if not isinstance(superST, dict): | |
superST = self.semtype(superST) | |
par = st.superType | |
while par: | |
if par is superST: | |
return True | |
par = par.superType | |
return False | |
def frames(self, name=None): | |
""" | |
Obtain details for a specific frame. | |
>>> from nltk.corpus import framenet as fn | |
>>> len(fn.frames()) in (1019, 1221) # FN 1.5 and 1.7, resp. | |
True | |
>>> x = PrettyList(fn.frames(r'(?i)crim'), maxReprSize=0, breakLines=True) | |
>>> x.sort(key=itemgetter('ID')) | |
>>> x | |
[<frame ID=200 name=Criminal_process>, | |
<frame ID=500 name=Criminal_investigation>, | |
<frame ID=692 name=Crime_scenario>, | |
<frame ID=700 name=Committing_crime>] | |
A brief intro to Frames (excerpted from "FrameNet II: Extended | |
Theory and Practice" by Ruppenhofer et. al., 2010): | |
A Frame is a script-like conceptual structure that describes a | |
particular type of situation, object, or event along with the | |
participants and props that are needed for that Frame. For | |
example, the "Apply_heat" frame describes a common situation | |
involving a Cook, some Food, and a Heating_Instrument, and is | |
evoked by words such as bake, blanch, boil, broil, brown, | |
simmer, steam, etc. | |
We call the roles of a Frame "frame elements" (FEs) and the | |
frame-evoking words are called "lexical units" (LUs). | |
FrameNet includes relations between Frames. Several types of | |
relations are defined, of which the most important are: | |
- Inheritance: An IS-A relation. The child frame is a subtype | |
of the parent frame, and each FE in the parent is bound to | |
a corresponding FE in the child. An example is the | |
"Revenge" frame which inherits from the | |
"Rewards_and_punishments" frame. | |
- Using: The child frame presupposes the parent frame as | |
background, e.g the "Speed" frame "uses" (or presupposes) | |
the "Motion" frame; however, not all parent FEs need to be | |
bound to child FEs. | |
- Subframe: The child frame is a subevent of a complex event | |
represented by the parent, e.g. the "Criminal_process" frame | |
has subframes of "Arrest", "Arraignment", "Trial", and | |
"Sentencing". | |
- Perspective_on: The child frame provides a particular | |
perspective on an un-perspectivized parent frame. A pair of | |
examples consists of the "Hiring" and "Get_a_job" frames, | |
which perspectivize the "Employment_start" frame from the | |
Employer's and the Employee's point of view, respectively. | |
:param name: A regular expression pattern used to match against | |
Frame names. If 'name' is None, then a list of all | |
Framenet Frames will be returned. | |
:type name: str | |
:return: A list of matching Frames (or all Frames). | |
:rtype: list(AttrDict) | |
""" | |
try: | |
fIDs = list(self._frame_idx.keys()) | |
except AttributeError: | |
self._buildframeindex() | |
fIDs = list(self._frame_idx.keys()) | |
if name is not None: | |
return PrettyList( | |
self.frame(fID) for fID, finfo in self.frame_ids_and_names(name).items() | |
) | |
else: | |
return PrettyLazyMap(self.frame, fIDs) | |
def frame_ids_and_names(self, name=None): | |
""" | |
Uses the frame index, which is much faster than looking up each frame definition | |
if only the names and IDs are needed. | |
""" | |
if not self._frame_idx: | |
self._buildframeindex() | |
return dict( | |
(fID, finfo.name) | |
for fID, finfo in self._frame_idx.items() | |
if name is None or re.search(name, finfo.name) is not None | |
) | |
def fes(self, name=None, frame=None): | |
""" | |
Lists frame element objects. If 'name' is provided, this is treated as | |
a case-insensitive regular expression to filter by frame name. | |
(Case-insensitivity is because casing of frame element names is not always | |
consistent across frames.) Specify 'frame' to filter by a frame name pattern, | |
ID, or object. | |
>>> from nltk.corpus import framenet as fn | |
>>> fn.fes('Noise_maker') | |
[<fe ID=6043 name=Noise_maker>] | |
>>> sorted([(fe.frame.name,fe.name) for fe in fn.fes('sound')]) | |
[('Cause_to_make_noise', 'Sound_maker'), ('Make_noise', 'Sound'), | |
('Make_noise', 'Sound_source'), ('Sound_movement', 'Location_of_sound_source'), | |
('Sound_movement', 'Sound'), ('Sound_movement', 'Sound_source'), | |
('Sounds', 'Component_sound'), ('Sounds', 'Location_of_sound_source'), | |
('Sounds', 'Sound_source'), ('Vocalizations', 'Location_of_sound_source'), | |
('Vocalizations', 'Sound_source')] | |
>>> sorted([(fe.frame.name,fe.name) for fe in fn.fes('sound',r'(?i)make_noise')]) | |
[('Cause_to_make_noise', 'Sound_maker'), | |
('Make_noise', 'Sound'), | |
('Make_noise', 'Sound_source')] | |
>>> sorted(set(fe.name for fe in fn.fes('^sound'))) | |
['Sound', 'Sound_maker', 'Sound_source'] | |
>>> len(fn.fes('^sound$')) | |
2 | |
:param name: A regular expression pattern used to match against | |
frame element names. If 'name' is None, then a list of all | |
frame elements will be returned. | |
:type name: str | |
:return: A list of matching frame elements | |
:rtype: list(AttrDict) | |
""" | |
# what frames are we searching in? | |
if frame is not None: | |
if isinstance(frame, int): | |
frames = [self.frame(frame)] | |
elif isinstance(frame, str): | |
frames = self.frames(frame) | |
else: | |
frames = [frame] | |
else: | |
frames = self.frames() | |
return PrettyList( | |
fe | |
for f in frames | |
for fename, fe in f.FE.items() | |
if name is None or re.search(name, fename, re.I) | |
) | |
    def lus(self, name=None, frame=None):
        """
        Obtain details for lexical units.
        Optionally restrict by lexical unit name pattern, and/or to a certain frame
        or frames whose name matches a pattern.
        >>> from nltk.corpus import framenet as fn
        >>> len(fn.lus()) in (11829, 13572) # FN 1.5 and 1.7, resp.
        True
        >>> PrettyList(sorted(fn.lus(r'(?i)a little'), key=itemgetter('ID')), maxReprSize=0, breakLines=True)
        [<lu ID=14733 name=a little.n>,
         <lu ID=14743 name=a little.adv>,
         <lu ID=14744 name=a little bit.adv>]
        >>> PrettyList(sorted(fn.lus(r'interest', r'(?i)stimulus'), key=itemgetter('ID')))
        [<lu ID=14894 name=interested.a>, <lu ID=14920 name=interesting.a>]
        A brief intro to Lexical Units (excerpted from "FrameNet II:
        Extended Theory and Practice" by Ruppenhofer et. al., 2010):
        A lexical unit (LU) is a pairing of a word with a meaning. For
        example, the "Apply_heat" Frame describes a common situation
        involving a Cook, some Food, and a Heating Instrument, and is
        _evoked_ by words such as bake, blanch, boil, broil, brown,
        simmer, steam, etc. These frame-evoking words are the LUs in the
        Apply_heat frame. Each sense of a polysemous word is a different
        LU.
        We have used the word "word" in talking about LUs. The reality
        is actually rather complex. When we say that the word "bake" is
        polysemous, we mean that the lemma "bake.v" (which has the
        word-forms "bake", "bakes", "baked", and "baking") is linked to
        three different frames:
        - Apply_heat: "Michelle baked the potatoes for 45 minutes."
        - Cooking_creation: "Michelle baked her mother a cake for her birthday."
        - Absorb_heat: "The potatoes have to bake for more than 30 minutes."
        These constitute three different LUs, with different
        definitions.
        Multiword expressions such as "given name" and hyphenated words
        like "shut-eye" can also be LUs. Idiomatic phrases such as
        "middle of nowhere" and "give the slip (to)" are also defined as
        LUs in the appropriate frames ("Isolated_places" and "Evading",
        respectively), and their internal structure is not analyzed.
        Framenet provides multiple annotated examples of each sense of a
        word (i.e. each LU). Moreover, the set of examples
        (approximately 20 per LU) illustrates all of the combinatorial
        possibilities of the lexical unit.
        Each LU is linked to a Frame, and hence to the other words which
        evoke that Frame. This makes the FrameNet database similar to a
        thesaurus, grouping together semantically similar words.
        In the simplest case, frame-evoking words are verbs such as
        "fried" in:
        "Matilde fried the catfish in a heavy iron skillet."
        Sometimes event nouns may evoke a Frame. For example,
        "reduction" evokes "Cause_change_of_scalar_position" in:
        "...the reduction of debt levels to $665 million from $2.6 billion."
        Adjectives may also evoke a Frame. For example, "asleep" may
        evoke the "Sleep" frame as in:
        "They were asleep for hours."
        Many common nouns, such as artifacts like "hat" or "tower",
        typically serve as dependents rather than clearly evoking their
        own frames.
        :param name: A regular expression pattern used to search the LU
            names. Note that LU names take the form of a dotted
            string (e.g. "run.v" or "a little.adv") in which a
            lemma preceeds the "." and a POS follows the
            dot. The lemma may be composed of a single lexeme
            (e.g. "run") or of multiple lexemes (e.g. "a
            little"). If 'name' is not given, then all LUs will
            be returned.
            The valid POSes are:
            v    - verb
            n    - noun
            a    - adjective
            adv  - adverb
            prep - preposition
            num  - numbers
            intj - interjection
            art  - article
            c    - conjunction
            scon - subordinating conjunction
        :type name: str
        :type frame: str or int or frame
        :return: A list of selected (or all) lexical units
        :rtype: list of LU objects (dicts). See the lu() function for info
          about the specifics of LU objects.
        """
        # The LU index is built lazily on first use.
        if not self._lu_idx:
            self._buildluindex()
        if name is not None:  # match LUs, then restrict by frame
            result = PrettyList(
                self.lu(luID) for luID, luName in self.lu_ids_and_names(name).items()
            )
            if frame is not None:
                # Normalize 'frame' (ID, name pattern, or frame object) to a
                # set of frame IDs, then keep only LUs from those frames.
                if isinstance(frame, int):
                    frameIDs = {frame}
                elif isinstance(frame, str):
                    frameIDs = {f.ID for f in self.frames(frame)}
                else:
                    frameIDs = {frame.ID}
                result = PrettyList(lu for lu in result if lu.frame.ID in frameIDs)
        elif frame is not None:  # all LUs in matching frames
            # Normalize 'frame' to a list of frame objects.
            if isinstance(frame, int):
                frames = [self.frame(frame)]
            elif isinstance(frame, str):
                frames = self.frames(frame)
            else:
                frames = [frame]
            # Concatenate each frame's LUs lazily.
            result = PrettyLazyIteratorList(
                iter(LazyConcatenation(list(f.lexUnit.values()) for f in frames))
            )
        else:  # all LUs
            # Skip LU entries whose status marks them as problematic.
            luIDs = [
                luID
                for luID, lu in self._lu_idx.items()
                if lu.status not in self._bad_statuses
            ]
            result = PrettyLazyMap(self.lu, luIDs)
        return result
def lu_ids_and_names(self, name=None): | |
""" | |
Uses the LU index, which is much faster than looking up each LU definition | |
if only the names and IDs are needed. | |
""" | |
if not self._lu_idx: | |
self._buildluindex() | |
return { | |
luID: luinfo.name | |
for luID, luinfo in self._lu_idx.items() | |
if luinfo.status not in self._bad_statuses | |
and (name is None or re.search(name, luinfo.name) is not None) | |
} | |
def docs_metadata(self, name=None): | |
""" | |
Return an index of the annotated documents in Framenet. | |
Details for a specific annotated document can be obtained using this | |
class's doc() function and pass it the value of the 'ID' field. | |
>>> from nltk.corpus import framenet as fn | |
>>> len(fn.docs()) in (78, 107) # FN 1.5 and 1.7, resp. | |
True | |
>>> set([x.corpname for x in fn.docs_metadata()])>=set(['ANC', 'KBEval', \ | |
'LUCorpus-v0.3', 'Miscellaneous', 'NTI', 'PropBank']) | |
True | |
:param name: A regular expression pattern used to search the | |
file name of each annotated document. The document's | |
file name contains the name of the corpus that the | |
document is from, followed by two underscores "__" | |
followed by the document name. So, for example, the | |
file name "LUCorpus-v0.3__20000410_nyt-NEW.xml" is | |
from the corpus named "LUCorpus-v0.3" and the | |
document name is "20000410_nyt-NEW.xml". | |
:type name: str | |
:return: A list of selected (or all) annotated documents | |
:rtype: list of dicts, where each dict object contains the following | |
keys: | |
- 'name' | |
- 'ID' | |
- 'corpid' | |
- 'corpname' | |
- 'description' | |
- 'filename' | |
""" | |
try: | |
ftlist = PrettyList(self._fulltext_idx.values()) | |
except AttributeError: | |
self._buildcorpusindex() | |
ftlist = PrettyList(self._fulltext_idx.values()) | |
if name is None: | |
return ftlist | |
else: | |
return PrettyList( | |
x for x in ftlist if re.search(name, x["filename"]) is not None | |
) | |
def docs(self, name=None): | |
""" | |
Return a list of the annotated full-text documents in FrameNet, | |
optionally filtered by a regex to be matched against the document name. | |
""" | |
return PrettyLazyMap((lambda x: self.doc(x.ID)), self.docs_metadata(name)) | |
def sents(self, exemplars=True, full_text=True): | |
""" | |
Annotated sentences matching the specified criteria. | |
""" | |
if exemplars: | |
if full_text: | |
return self.exemplars() + self.ft_sents() | |
else: | |
return self.exemplars() | |
elif full_text: | |
return self.ft_sents() | |
def annotations(self, luNamePattern=None, exemplars=True, full_text=True): | |
""" | |
Frame annotation sets matching the specified criteria. | |
""" | |
if exemplars: | |
epart = PrettyLazyIteratorList( | |
sent.frameAnnotation for sent in self.exemplars(luNamePattern) | |
) | |
else: | |
epart = [] | |
if full_text: | |
if luNamePattern is not None: | |
matchedLUIDs = set(self.lu_ids_and_names(luNamePattern).keys()) | |
ftpart = PrettyLazyIteratorList( | |
aset | |
for sent in self.ft_sents() | |
for aset in sent.annotationSet[1:] | |
if luNamePattern is None or aset.get("luID", "CXN_ASET") in matchedLUIDs | |
) | |
else: | |
ftpart = [] | |
if exemplars: | |
if full_text: | |
return epart + ftpart | |
else: | |
return epart | |
elif full_text: | |
return ftpart | |
    def exemplars(self, luNamePattern=None, frame=None, fe=None, fe2=None):
        """
        Lexicographic exemplar sentences, optionally filtered by LU name and/or 1-2 FEs that
        are realized overtly. 'frame' may be a name pattern, frame ID, or frame instance.
        'fe' may be a name pattern or FE instance; if specified, 'fe2' may also
        be specified to retrieve sentences with both overt FEs (in either order).

        :raises FramenetError: if 'fe2' is given without 'fe', or if 'fe' and
            'fe2' are FE objects belonging to different frames.
        """
        # Validate the fe/fe2 combination before doing any narrowing work.
        if fe is None and fe2 is not None:
            raise FramenetError("exemplars(..., fe=None, fe2=<value>) is not allowed")
        elif fe is not None and fe2 is not None:
            if not isinstance(fe2, str):
                if isinstance(fe, str):
                    # fe2 is specific to a particular frame. swap fe and fe2 so fe is always used to determine the frame.
                    fe, fe2 = fe2, fe
                elif fe.frame is not fe2.frame:  # ensure frames match
                    raise FramenetError(
                        "exemplars() call with inconsistent `fe` and `fe2` specification (frames must match)"
                    )
        # An FE object implies its frame if none was given explicitly.
        if frame is None and fe is not None and not isinstance(fe, str):
            frame = fe.frame
        # narrow down to frames matching criteria
        lusByFrame = defaultdict(
            list
        )  # frame name -> matching LUs, if luNamePattern is specified
        if frame is not None or luNamePattern is not None:
            if frame is None or isinstance(frame, str):
                if luNamePattern is not None:
                    # Let the LU search determine the candidate frames.
                    frames = set()
                    for lu in self.lus(luNamePattern, frame=frame):
                        frames.add(lu.frame.ID)
                        lusByFrame[lu.frame.name].append(lu)
                    frames = LazyMap(self.frame, list(frames))
                else:
                    frames = self.frames(frame)
            else:
                # 'frame' is an ID or a frame object.
                if isinstance(frame, int):
                    frames = [self.frame(frame)]
                else:  # frame object
                    frames = [frame]
                if luNamePattern is not None:
                    lusByFrame = {frame.name: self.lus(luNamePattern, frame=frame)}
            if fe is not None:  # narrow to frames that define this FE
                if isinstance(fe, str):
                    # Match the FE name exactly or as a case-insensitive regex.
                    frames = PrettyLazyIteratorList(
                        f
                        for f in frames
                        if fe in f.FE
                        or any(re.search(fe, ffe, re.I) for ffe in f.FE.keys())
                    )
                else:
                    # An FE object pins down exactly one frame; it must be
                    # consistent with any 'frame' already chosen.
                    if fe.frame not in frames:
                        raise FramenetError(
                            "exemplars() call with inconsistent `frame` and `fe` specification"
                        )
                    frames = [fe.frame]
                if fe2 is not None:  # narrow to frames that ALSO define this FE
                    if isinstance(fe2, str):
                        frames = PrettyLazyIteratorList(
                            f
                            for f in frames
                            if fe2 in f.FE
                            or any(re.search(fe2, ffe, re.I) for ffe in f.FE.keys())
                        )
                    # else we already narrowed it to a single frame
        else:  # frame, luNamePattern are None. fe, fe2 are None or strings
            if fe is not None:
                # Find frames via the FE name search, intersecting with fe2's
                # frames when both are given.
                frames = {ffe.frame.ID for ffe in self.fes(fe)}
                if fe2 is not None:
                    frames2 = {ffe.frame.ID for ffe in self.fes(fe2)}
                    frames = frames & frames2
                frames = LazyMap(self.frame, list(frames))
            else:
                frames = self.frames()
        # we've narrowed down 'frames'
        # now get exemplars for relevant LUs in those frames
        def _matching_exs():
            for f in frames:
                fes = fes2 = None  # FEs of interest
                # Resolve the fe/fe2 specifications to concrete FE-name sets
                # within this particular frame.
                if fe is not None:
                    fes = (
                        {ffe for ffe in f.FE.keys() if re.search(fe, ffe, re.I)}
                        if isinstance(fe, str)
                        else {fe.name}
                    )
                    if fe2 is not None:
                        fes2 = (
                            {ffe for ffe in f.FE.keys() if re.search(fe2, ffe, re.I)}
                            if isinstance(fe2, str)
                            else {fe2.name}
                        )
                for lu in (
                    lusByFrame[f.name]
                    if luNamePattern is not None
                    else f.lexUnit.values()
                ):
                    # Keep exemplars in which all requested FEs are overt.
                    for ex in lu.exemplars:
                        if (fes is None or self._exemplar_of_fes(ex, fes)) and (
                            fes2 is None or self._exemplar_of_fes(ex, fes2)
                        ):
                            yield ex

        return PrettyLazyIteratorList(_matching_exs())
def _exemplar_of_fes(self, ex, fes=None): | |
""" | |
Given an exemplar sentence and a set of FE names, return the subset of FE names | |
that are realized overtly in the sentence on the FE, FE2, or FE3 layer. | |
If 'fes' is None, returns all overt FE names. | |
""" | |
overtNames = set(list(zip(*ex.FE[0]))[2]) if ex.FE[0] else set() | |
if "FE2" in ex: | |
overtNames |= set(list(zip(*ex.FE2[0]))[2]) if ex.FE2[0] else set() | |
if "FE3" in ex: | |
overtNames |= set(list(zip(*ex.FE3[0]))[2]) if ex.FE3[0] else set() | |
return overtNames & fes if fes is not None else overtNames | |
def ft_sents(self, docNamePattern=None): | |
""" | |
Full-text annotation sentences, optionally filtered by document name. | |
""" | |
return PrettyLazyIteratorList( | |
sent for d in self.docs(docNamePattern) for sent in d.sentence | |
) | |
    def frame_relation_types(self):
        """
        Obtain a list of frame relation types.
        >>> from nltk.corpus import framenet as fn
        >>> frts = sorted(fn.frame_relation_types(), key=itemgetter('ID'))
        >>> isinstance(frts, list)
        True
        >>> len(frts) in (9, 10) # FN 1.5 and 1.7, resp.
        True
        >>> PrettyDict(frts[0], breakLines=True)
        {'ID': 1,
         '_type': 'framerelationtype',
         'frameRelations': [<Parent=Event -- Inheritance -> Child=Change_of_consistency>, <Parent=Event -- Inheritance -> Child=Rotting>, ...],
         'name': 'Inheritance',
         'subFrameName': 'Child',
         'superFrameName': 'Parent'}
        :return: A list of all of the frame relation types in framenet
        :rtype: list(dict)
        """
        # The relation indexes are built lazily on first use.
        if not self._freltyp_idx:
            self._buildrelationindex()
        # NOTE(review): this returns a dict view rather than the list the
        # docstring promises; iteration works, but indexing does not.
        return self._freltyp_idx.values()
    def frame_relations(self, frame=None, frame2=None, type=None):
        """
        :param frame: (optional) frame object, name, or ID; only relations involving
            this frame will be returned
        :param frame2: (optional; 'frame' must be a different frame) only show relations
            between the two specified frames, in either direction
        :param type: (optional) frame relation type (name or object); show only relations
            of this type
        :type frame: int or str or AttrDict
        :return: A list of all of the frame relations in framenet
        :rtype: list(dict)
        >>> from nltk.corpus import framenet as fn
        >>> frels = fn.frame_relations()
        >>> isinstance(frels, list)
        True
        >>> len(frels) in (1676, 2070)  # FN 1.5 and 1.7, resp.
        True
        >>> PrettyList(fn.frame_relations('Cooking_creation'), maxReprSize=0, breakLines=True)
        [<Parent=Intentionally_create -- Inheritance -> Child=Cooking_creation>,
         <Parent=Apply_heat -- Using -> Child=Cooking_creation>,
         <MainEntry=Apply_heat -- See_also -> ReferringEntry=Cooking_creation>]
        >>> PrettyList(fn.frame_relations(274), breakLines=True)
        [<Parent=Avoiding -- Inheritance -> Child=Dodging>,
         <Parent=Avoiding -- Inheritance -> Child=Evading>, ...]
        >>> PrettyList(fn.frame_relations(fn.frame('Cooking_creation')), breakLines=True)
        [<Parent=Intentionally_create -- Inheritance -> Child=Cooking_creation>,
         <Parent=Apply_heat -- Using -> Child=Cooking_creation>, ...]
        >>> PrettyList(fn.frame_relations('Cooking_creation', type='Inheritance'))
        [<Parent=Intentionally_create -- Inheritance -> Child=Cooking_creation>]
        >>> PrettyList(fn.frame_relations('Cooking_creation', 'Apply_heat'), breakLines=True)
        [<Parent=Apply_heat -- Using -> Child=Cooking_creation>,
         <MainEntry=Apply_heat -- See_also -> ReferringEntry=Cooking_creation>]
        """
        relation_type = type
        # The relation indexes are built lazily on first use.
        if not self._frel_idx:
            self._buildrelationindex()
        rels = None
        # Resolve a relation-type name to its record object.
        if relation_type is not None:
            if not isinstance(relation_type, dict):
                type = [rt for rt in self.frame_relation_types() if rt.name == type][0]
                assert isinstance(type, dict)
        # lookup by 'frame'
        if frame is not None:
            if isinstance(frame, dict) and "frameRelations" in frame:
                # A loaded frame object already carries its relations.
                rels = PrettyList(frame.frameRelations)
            else:
                # Normalize 'frame' (name or object) to a frame ID, then look
                # up its relations in the per-frame index.
                if not isinstance(frame, int):
                    if isinstance(frame, dict):
                        frame = frame.ID
                    else:
                        frame = self.frame_by_name(frame).ID
                rels = [self._frel_idx[frelID] for frelID in self._frel_f_idx[frame]]
            # filter by 'type'
            if type is not None:
                rels = [rel for rel in rels if rel.type is type]
        elif type is not None:
            # lookup by 'type'
            rels = type.frameRelations
        else:
            rels = self._frel_idx.values()
        # filter by 'frame2'
        if frame2 is not None:
            if frame is None:
                raise FramenetError(
                    "frame_relations(frame=None, frame2=<value>) is not allowed"
                )
            # Normalize 'frame2' to a frame ID as well.
            if not isinstance(frame2, int):
                if isinstance(frame2, dict):
                    frame2 = frame2.ID
                else:
                    frame2 = self.frame_by_name(frame2).ID
            if frame == frame2:
                raise FramenetError(
                    "The two frame arguments to frame_relations() must be different frames"
                )
            # Keep relations in which frame2 is on either side.
            rels = [
                rel
                for rel in rels
                if rel.superFrame.ID == frame2 or rel.subFrame.ID == frame2
            ]
        # Present the results in a stable, readable order.
        return PrettyList(
            sorted(
                rels,
                key=lambda frel: (frel.type.ID, frel.superFrameName, frel.subFrameName),
            )
        )
def fe_relations(self): | |
""" | |
Obtain a list of frame element relations. | |
>>> from nltk.corpus import framenet as fn | |
>>> ferels = fn.fe_relations() | |
>>> isinstance(ferels, list) | |
True | |
>>> len(ferels) in (10020, 12393) # FN 1.5 and 1.7, resp. | |
True | |
>>> PrettyDict(ferels[0], breakLines=True) | |
{'ID': 14642, | |
'_type': 'ferelation', | |
'frameRelation': <Parent=Abounding_with -- Inheritance -> Child=Lively_place>, | |
'subFE': <fe ID=11370 name=Degree>, | |
'subFEName': 'Degree', | |
'subFrame': <frame ID=1904 name=Lively_place>, | |
'subID': 11370, | |
'supID': 2271, | |
'superFE': <fe ID=2271 name=Degree>, | |
'superFEName': 'Degree', | |
'superFrame': <frame ID=262 name=Abounding_with>, | |
'type': <framerelationtype ID=1 name=Inheritance>} | |
:return: A list of all of the frame element relations in framenet | |
:rtype: list(dict) | |
""" | |
if not self._ferel_idx: | |
self._buildrelationindex() | |
return PrettyList( | |
sorted( | |
self._ferel_idx.values(), | |
key=lambda ferel: ( | |
ferel.type.ID, | |
ferel.frameRelation.superFrameName, | |
ferel.superFEName, | |
ferel.frameRelation.subFrameName, | |
ferel.subFEName, | |
), | |
) | |
) | |
def semtypes(self): | |
""" | |
Obtain a list of semantic types. | |
>>> from nltk.corpus import framenet as fn | |
>>> stypes = fn.semtypes() | |
>>> len(stypes) in (73, 109) # FN 1.5 and 1.7, resp. | |
True | |
>>> sorted(stypes[0].keys()) | |
['ID', '_type', 'abbrev', 'definition', 'definitionMarkup', 'name', 'rootType', 'subTypes', 'superType'] | |
:return: A list of all of the semantic types in framenet | |
:rtype: list(dict) | |
""" | |
if not self._semtypes: | |
self._loadsemtypes() | |
return PrettyList( | |
self._semtypes[i] for i in self._semtypes if isinstance(i, int) | |
) | |
def _load_xml_attributes(self, d, elt): | |
""" | |
Extracts a subset of the attributes from the given element and | |
returns them in a dictionary. | |
:param d: A dictionary in which to store the attributes. | |
:type d: dict | |
:param elt: An ElementTree Element | |
:type elt: Element | |
:return: Returns the input dict ``d`` possibly including attributes from ``elt`` | |
:rtype: dict | |
""" | |
d = type(d)(d) | |
try: | |
attr_dict = elt.attrib | |
except AttributeError: | |
return d | |
if attr_dict is None: | |
return d | |
# Ignore these attributes when loading attributes from an xml node | |
ignore_attrs = [ #'cBy', 'cDate', 'mDate', # <-- annotation metadata that could be of interest | |
"xsi", | |
"schemaLocation", | |
"xmlns", | |
"bgColor", | |
"fgColor", | |
] | |
for attr in attr_dict: | |
if any(attr.endswith(x) for x in ignore_attrs): | |
continue | |
val = attr_dict[attr] | |
if val.isdigit(): | |
d[attr] = int(val) | |
else: | |
d[attr] = val | |
return d | |
def _strip_tags(self, data): | |
""" | |
Gets rid of all tags and newline characters from the given input | |
:return: A cleaned-up version of the input string | |
:rtype: str | |
""" | |
try: | |
""" | |
# Look for boundary issues in markup. (Sometimes FEs are pluralized in definitions.) | |
m = re.search(r'\w[<][^/]|[<][/][^>]+[>](s\w|[a-rt-z0-9])', data) | |
if m: | |
print('Markup boundary:', data[max(0,m.start(0)-10):m.end(0)+10].replace('\n',' '), file=sys.stderr) | |
""" | |
data = data.replace("<t>", "") | |
data = data.replace("</t>", "") | |
data = re.sub('<fex name="[^"]+">', "", data) | |
data = data.replace("</fex>", "") | |
data = data.replace("<fen>", "") | |
data = data.replace("</fen>", "") | |
data = data.replace("<m>", "") | |
data = data.replace("</m>", "") | |
data = data.replace("<ment>", "") | |
data = data.replace("</ment>", "") | |
data = data.replace("<ex>", "'") | |
data = data.replace("</ex>", "'") | |
data = data.replace("<gov>", "") | |
data = data.replace("</gov>", "") | |
data = data.replace("<x>", "") | |
data = data.replace("</x>", "") | |
# Get rid of <def-root> and </def-root> tags | |
data = data.replace("<def-root>", "") | |
data = data.replace("</def-root>", "") | |
data = data.replace("\n", " ") | |
except AttributeError: | |
pass | |
return data | |
    def _handle_elt(self, elt, tagspec=None):
        """Extracts and returns the attributes of the given element
        as a fresh AttrDict; child elements are ignored.

        :param elt: An ElementTree Element
        :param tagspec: unused; kept for signature uniformity with the
            other XML handlers
        """
        return self._load_xml_attributes(AttrDict(), elt)
def _handle_fulltextindex_elt(self, elt, tagspec=None): | |
""" | |
Extracts corpus/document info from the fulltextIndex.xml file. | |
Note that this function "flattens" the information contained | |
in each of the "corpus" elements, so that each "document" | |
element will contain attributes for the corpus and | |
corpusid. Also, each of the "document" items will contain a | |
new attribute called "filename" that is the base file name of | |
the xml file for the document in the "fulltext" subdir of the | |
Framenet corpus. | |
""" | |
ftinfo = self._load_xml_attributes(AttrDict(), elt) | |
corpname = ftinfo.name | |
corpid = ftinfo.ID | |
retlist = [] | |
for sub in elt: | |
if sub.tag.endswith("document"): | |
doc = self._load_xml_attributes(AttrDict(), sub) | |
if "name" in doc: | |
docname = doc.name | |
else: | |
docname = doc.description | |
doc.filename = "{0}__{1}.xml".format(corpname, docname) | |
doc.URL = ( | |
self._fnweb_url + "/" + self._fulltext_dir + "/" + doc.filename | |
) | |
doc.corpname = corpname | |
doc.corpid = corpid | |
retlist.append(doc) | |
return retlist | |
    def _handle_frame_elt(self, elt, ignorekeys=[]):
        """Load the info for a Frame from a frame xml file.

        :param elt: root element of the frame XML file
        :param ignorekeys: names of attributes/subelements to omit from the
            result (the list is only read, so the mutable default is safe)
        :return: an AttrDict describing the frame, with FEs, FE core sets,
            lexical units, semtypes, and frame relations attached
        """
        frinfo = self._load_xml_attributes(AttrDict(), elt)
        frinfo["_type"] = "frame"
        frinfo["definition"] = ""
        frinfo["definitionMarkup"] = ""
        frinfo["FE"] = PrettyDict()
        frinfo["FEcoreSets"] = []
        frinfo["lexUnit"] = PrettyDict()
        frinfo["semTypes"] = []
        for k in ignorekeys:
            if k in frinfo:
                del frinfo[k]
        for sub in elt:
            if sub.tag.endswith("definition") and "definition" not in ignorekeys:
                # Keep both the raw markup and a plain-text version.
                frinfo["definitionMarkup"] = sub.text
                frinfo["definition"] = self._strip_tags(sub.text)
            elif sub.tag.endswith("FE") and "FE" not in ignorekeys:
                feinfo = self._handle_fe_elt(sub)
                frinfo["FE"][feinfo.name] = feinfo
                feinfo["frame"] = frinfo  # backpointer
            elif sub.tag.endswith("FEcoreSet") and "FEcoreSet" not in ignorekeys:
                coreset = self._handle_fecoreset_elt(sub)
                # assumes all FEs have been loaded before coresets
                frinfo["FEcoreSets"].append(
                    PrettyList(frinfo["FE"][fe.name] for fe in coreset)
                )
            elif sub.tag.endswith("lexUnit") and "lexUnit" not in ignorekeys:
                luentry = self._handle_framelexunit_elt(sub)
                if luentry["status"] in self._bad_statuses:
                    # problematic LU entry; ignore it
                    continue
                luentry["frame"] = frinfo
                luentry["URL"] = (
                    self._fnweb_url
                    + "/"
                    + self._lu_dir
                    + "/"
                    + "lu{0}.xml".format(luentry["ID"])
                )
                # Defer loading the LU's own XML file until these fields are
                # actually accessed. The (lambda lu: lambda: ...) pattern
                # binds the current luentry, avoiding the late-binding
                # closure pitfall.
                luentry["subCorpus"] = Future(
                    (lambda lu: lambda: self._lu_file(lu).subCorpus)(luentry)
                )
                luentry["exemplars"] = Future(
                    (lambda lu: lambda: self._lu_file(lu).exemplars)(luentry)
                )
                frinfo["lexUnit"][luentry.name] = luentry
                # Register the LU in the global index as well.
                if not self._lu_idx:
                    self._buildluindex()
                self._lu_idx[luentry.ID] = luentry
            elif sub.tag.endswith("semType") and "semTypes" not in ignorekeys:
                semtypeinfo = self._load_xml_attributes(AttrDict(), sub)
                frinfo["semTypes"].append(self.semtype(semtypeinfo.ID))
        frinfo["frameRelations"] = self.frame_relations(frame=frinfo)
        # resolve 'requires' and 'excludes' links between FEs of this frame
        for fe in frinfo.FE.values():
            if fe.requiresFE:
                name, ID = fe.requiresFE.name, fe.requiresFE.ID
                fe.requiresFE = frinfo.FE[name]
                assert fe.requiresFE.ID == ID
            if fe.excludesFE:
                name, ID = fe.excludesFE.name, fe.excludesFE.ID
                fe.excludesFE = frinfo.FE[name]
                assert fe.excludesFE.ID == ID
        return frinfo
def _handle_fecoreset_elt(self, elt): | |
"""Load fe coreset info from xml.""" | |
info = self._load_xml_attributes(AttrDict(), elt) | |
tmp = [] | |
for sub in elt: | |
tmp.append(self._load_xml_attributes(AttrDict(), sub)) | |
return tmp | |
def _handle_framerelationtype_elt(self, elt, *args): | |
"""Load frame-relation element and its child fe-relation elements from frRelation.xml.""" | |
info = self._load_xml_attributes(AttrDict(), elt) | |
info["_type"] = "framerelationtype" | |
info["frameRelations"] = PrettyList() | |
for sub in elt: | |
if sub.tag.endswith("frameRelation"): | |
frel = self._handle_framerelation_elt(sub) | |
frel["type"] = info # backpointer | |
for ferel in frel.feRelations: | |
ferel["type"] = info | |
info["frameRelations"].append(frel) | |
return info | |
def _handle_framerelation_elt(self, elt): | |
"""Load frame-relation element and its child fe-relation elements from frRelation.xml.""" | |
info = self._load_xml_attributes(AttrDict(), elt) | |
assert info["superFrameName"] != info["subFrameName"], (elt, info) | |
info["_type"] = "framerelation" | |
info["feRelations"] = PrettyList() | |
for sub in elt: | |
if sub.tag.endswith("FERelation"): | |
ferel = self._handle_elt(sub) | |
ferel["_type"] = "ferelation" | |
ferel["frameRelation"] = info # backpointer | |
info["feRelations"].append(ferel) | |
return info | |
def _handle_fulltextannotation_elt(self, elt): | |
"""Load full annotation info for a document from its xml | |
file. The main element (fullTextAnnotation) contains a 'header' | |
element (which we ignore here) and a bunch of 'sentence' | |
elements.""" | |
info = AttrDict() | |
info["_type"] = "fulltext_annotation" | |
info["sentence"] = [] | |
for sub in elt: | |
if sub.tag.endswith("header"): | |
continue # not used | |
elif sub.tag.endswith("sentence"): | |
try: | |
s = self._handle_fulltext_sentence_elt(sub) | |
s.doc = info | |
info["sentence"].append(s) | |
except FramenetError as e: | |
print(e) | |
return info | |
    def _handle_fulltext_sentence_elt(self, elt):
        """Load information from the given 'sentence' element. Each
        'sentence' element contains a "text" and "annotationSet" sub
        elements.

        Returns an AttrDict with the sentence's 'text', its list of
        'annotationSet's, its '(Target, luName, frameName)' triples in
        'targets', and the POS layer copied up from the first (UNANN)
        annotation set.
        """
        info = self._load_xml_attributes(AttrDict(), elt)
        info["_type"] = "fulltext_sentence"
        info["annotationSet"] = []
        info["targets"] = []
        # spans already claimed by some Target; used to warn on duplicates
        target_spans = set()
        info["_ascii"] = types.MethodType(
            _annotation_ascii, info
        )  # attach a method for this instance
        info["text"] = ""
        for sub in elt:
            if sub.tag.endswith("text"):
                info["text"] = self._strip_tags(sub.text)
            elif sub.tag.endswith("annotationSet"):
                # the first annotationSet in the sentence is the POS layer
                a = self._handle_fulltextannotationset_elt(
                    sub, is_pos=(len(info["annotationSet"]) == 0)
                )
                if "cxnID" in a:  # ignoring construction annotations for now
                    continue
                # backpointers from the annotation set to its sentence
                a.sent = info
                a.text = info.text
                info["annotationSet"].append(a)
                if "Target" in a:
                    for tspan in a.Target:
                        if tspan in target_spans:
                            self._warn(
                                'Duplicate target span "{0}"'.format(
                                    info.text[slice(*tspan)]
                                ),
                                tspan,
                                "in sentence",
                                info["ID"],
                                info.text,
                            )
                            # this can happen in cases like "chemical and biological weapons"
                            # being annotated as "chemical weapons" and "biological weapons"
                        else:
                            target_spans.add(tspan)
                    info["targets"].append((a.Target, a.luName, a.frameName))
        # the unannotated (POS-only) set must come first; copy its POS info up
        assert info["annotationSet"][0].status == "UNANN"
        info["POS"] = info["annotationSet"][0].POS
        info["POS_tagset"] = info["annotationSet"][0].POS_tagset
        return info
def _handle_fulltextannotationset_elt(self, elt, is_pos=False): | |
"""Load information from the given 'annotationSet' element. Each | |
'annotationSet' contains several "layer" elements.""" | |
info = self._handle_luannotationset_elt(elt, is_pos=is_pos) | |
if not is_pos: | |
info["_type"] = "fulltext_annotationset" | |
if "cxnID" not in info: # ignoring construction annotations for now | |
info["LU"] = self.lu( | |
info.luID, | |
luName=info.luName, | |
frameID=info.frameID, | |
frameName=info.frameName, | |
) | |
info["frame"] = info.LU.frame | |
return info | |
def _handle_fulltextlayer_elt(self, elt): | |
"""Load information from the given 'layer' element. Each | |
'layer' contains several "label" elements.""" | |
info = self._load_xml_attributes(AttrDict(), elt) | |
info["_type"] = "layer" | |
info["label"] = [] | |
for sub in elt: | |
if sub.tag.endswith("label"): | |
l = self._load_xml_attributes(AttrDict(), sub) | |
info["label"].append(l) | |
return info | |
def _handle_framelexunit_elt(self, elt): | |
"""Load the lexical unit info from an xml element in a frame's xml file.""" | |
luinfo = AttrDict() | |
luinfo["_type"] = "lu" | |
luinfo = self._load_xml_attributes(luinfo, elt) | |
luinfo["definition"] = "" | |
luinfo["definitionMarkup"] = "" | |
luinfo["sentenceCount"] = PrettyDict() | |
luinfo["lexemes"] = PrettyList() # multiword LUs have multiple lexemes | |
luinfo["semTypes"] = PrettyList() # an LU can have multiple semtypes | |
for sub in elt: | |
if sub.tag.endswith("definition"): | |
luinfo["definitionMarkup"] = sub.text | |
luinfo["definition"] = self._strip_tags(sub.text) | |
elif sub.tag.endswith("sentenceCount"): | |
luinfo["sentenceCount"] = self._load_xml_attributes(PrettyDict(), sub) | |
elif sub.tag.endswith("lexeme"): | |
lexemeinfo = self._load_xml_attributes(PrettyDict(), sub) | |
if not isinstance(lexemeinfo.name, str): | |
# some lexeme names are ints by default: e.g., | |
# thousand.num has lexeme with name="1000" | |
lexemeinfo.name = str(lexemeinfo.name) | |
luinfo["lexemes"].append(lexemeinfo) | |
elif sub.tag.endswith("semType"): | |
semtypeinfo = self._load_xml_attributes(PrettyDict(), sub) | |
luinfo["semTypes"].append(self.semtype(semtypeinfo.ID)) | |
# sort lexemes by 'order' attribute | |
# otherwise, e.g., 'write down.v' may have lexemes in wrong order | |
luinfo["lexemes"].sort(key=lambda x: x.order) | |
return luinfo | |
def _handle_lexunit_elt(self, elt, ignorekeys): | |
""" | |
Load full info for a lexical unit from its xml file. | |
This should only be called when accessing corpus annotations | |
(which are not included in frame files). | |
""" | |
luinfo = self._load_xml_attributes(AttrDict(), elt) | |
luinfo["_type"] = "lu" | |
luinfo["definition"] = "" | |
luinfo["definitionMarkup"] = "" | |
luinfo["subCorpus"] = PrettyList() | |
luinfo["lexemes"] = PrettyList() # multiword LUs have multiple lexemes | |
luinfo["semTypes"] = PrettyList() # an LU can have multiple semtypes | |
for k in ignorekeys: | |
if k in luinfo: | |
del luinfo[k] | |
for sub in elt: | |
if sub.tag.endswith("header"): | |
continue # not used | |
elif sub.tag.endswith("valences"): | |
continue # not used | |
elif sub.tag.endswith("definition") and "definition" not in ignorekeys: | |
luinfo["definitionMarkup"] = sub.text | |
luinfo["definition"] = self._strip_tags(sub.text) | |
elif sub.tag.endswith("subCorpus") and "subCorpus" not in ignorekeys: | |
sc = self._handle_lusubcorpus_elt(sub) | |
if sc is not None: | |
luinfo["subCorpus"].append(sc) | |
elif sub.tag.endswith("lexeme") and "lexeme" not in ignorekeys: | |
luinfo["lexemes"].append(self._load_xml_attributes(PrettyDict(), sub)) | |
elif sub.tag.endswith("semType") and "semType" not in ignorekeys: | |
semtypeinfo = self._load_xml_attributes(AttrDict(), sub) | |
luinfo["semTypes"].append(self.semtype(semtypeinfo.ID)) | |
return luinfo | |
def _handle_lusubcorpus_elt(self, elt): | |
"""Load a subcorpus of a lexical unit from the given xml.""" | |
sc = AttrDict() | |
try: | |
sc["name"] = elt.get("name") | |
except AttributeError: | |
return None | |
sc["_type"] = "lusubcorpus" | |
sc["sentence"] = [] | |
for sub in elt: | |
if sub.tag.endswith("sentence"): | |
s = self._handle_lusentence_elt(sub) | |
if s is not None: | |
sc["sentence"].append(s) | |
return sc | |
    def _handle_lusentence_elt(self, elt):
        """Load a sentence from a subcorpus of an LU from xml.

        Returns an AttrDict with the sentence's attributes, its 'text',
        and its 'annotationSet' list; layer info from each frame
        annotation set is also copied up to the sentence level.
        """
        info = self._load_xml_attributes(AttrDict(), elt)
        info["_type"] = "lusentence"
        info["annotationSet"] = []
        info["_ascii"] = types.MethodType(
            _annotation_ascii, info
        )  # attach a method for this instance
        for sub in elt:
            if sub.tag.endswith("text"):
                info["text"] = self._strip_tags(sub.text)
            elif sub.tag.endswith("annotationSet"):
                # the first annotationSet of the sentence is the POS layer
                annset = self._handle_luannotationset_elt(
                    sub, is_pos=(len(info["annotationSet"]) == 0)
                )
                if annset is not None:
                    # a non-POS annotation set must either be unannotated
                    # or carry frame-element labels
                    assert annset.status == "UNANN" or "FE" in annset, annset
                    if annset.status != "UNANN":
                        info["frameAnnotation"] = annset
                    # copy layer info up to current level
                    for k in (
                        "Target",
                        "FE",
                        "FE2",
                        "FE3",
                        "GF",
                        "PT",
                        "POS",
                        "POS_tagset",
                        "Other",
                        "Sent",
                        "Verb",
                        "Noun",
                        "Adj",
                        "Adv",
                        "Prep",
                        "Scon",
                        "Art",
                    ):
                        if k in annset:
                            info[k] = annset[k]
                    info["annotationSet"].append(annset)
                    # backpointers from the annotation set to its sentence
                    annset["sent"] = info
                    annset["text"] = info.text
        return info
    def _handle_luannotationset_elt(self, elt, is_pos=False):
        """Load an annotation set from a sentence in an subcorpus of an LU.

        :param is_pos: True for a sentence's first annotation set, which
            carries POS-style layers (BNC/PENN/NER/WSL) rather than frame
            annotation layers.

        Each layer's labels are split into overt spans (half-open
        ``(start, end+1, name)`` tuples) and null instantiations, and the
        results are stored under per-layer keys ('Target', 'FE', 'GF',
        'PT', 'POS', ...) on the returned AttrDict.
        """
        info = self._load_xml_attributes(AttrDict(), elt)
        info["_type"] = "posannotationset" if is_pos else "luannotationset"
        info["layer"] = []
        info["_ascii"] = types.MethodType(
            _annotation_ascii, info
        )  # attach a method for this instance
        if "cxnID" in info:  # ignoring construction annotations for now.
            return info
        for sub in elt:
            if sub.tag.endswith("layer"):
                l = self._handle_lulayer_elt(sub)
                if l is not None:
                    overt = []  # labels with explicit character spans
                    ni = {}  # null instantiations
                    info["layer"].append(l)
                    for lbl in l.label:
                        if "start" in lbl:
                            # XML 'end' is inclusive; store half-open span
                            thespan = (lbl.start, lbl.end + 1, lbl.name)
                            if l.name not in (
                                "Sent",
                                "Other",
                            ):  # 'Sent' and 'Other' layers sometimes contain accidental duplicate spans
                                assert thespan not in overt, (info.ID, l.name, thespan)
                            overt.append(thespan)
                        else:  # null instantiation
                            if lbl.name in ni:
                                self._warn(
                                    "FE with multiple NI entries:",
                                    lbl.name,
                                    ni[lbl.name],
                                    lbl.itype,
                                )
                            else:
                                ni[lbl.name] = lbl.itype
                    overt = sorted(overt)
                    if l.name == "Target":
                        if not overt:
                            self._warn(
                                "Skipping empty Target layer in annotation set ID={0}".format(
                                    info.ID
                                )
                            )
                            continue
                        assert all(lblname == "Target" for i, j, lblname in overt)
                        if "Target" in info:
                            self._warn(
                                "Annotation set {0} has multiple Target layers".format(
                                    info.ID
                                )
                            )
                        else:
                            # keep only the (start, end) pairs for targets
                            info["Target"] = [(i, j) for (i, j, _) in overt]
                    elif l.name == "FE":
                        if l.rank == 1:
                            assert "FE" not in info
                            info["FE"] = (overt, ni)
                            # assert False,info
                        else:
                            # sometimes there are 3 FE layers! e.g. Change_position_on_a_scale.fall.v
                            assert 2 <= l.rank <= 3, l.rank
                            k = "FE" + str(l.rank)
                            assert k not in info
                            info[k] = (overt, ni)
                    elif l.name in ("GF", "PT"):
                        assert l.rank == 1
                        info[l.name] = overt
                    elif l.name in ("BNC", "PENN"):
                        # POS layer; remember which tagset it uses
                        assert l.rank == 1
                        info["POS"] = overt
                        info["POS_tagset"] = l.name
                    else:
                        if is_pos:
                            if l.name not in ("NER", "WSL"):
                                self._warn(
                                    "Unexpected layer in sentence annotationset:",
                                    l.name,
                                )
                        else:
                            if l.name not in (
                                "Sent",
                                "Verb",
                                "Noun",
                                "Adj",
                                "Adv",
                                "Prep",
                                "Scon",
                                "Art",
                            ):
                                self._warn(
                                    "Unexpected layer in frame annotationset:", l.name
                                )
                        info[l.name] = overt
        # sanity checks for frame (non-POS, non-construction) annotation sets
        if not is_pos and "cxnID" not in info:
            if "Target" not in info:
                self._warn("Missing target in annotation set ID={0}".format(info.ID))
            assert "FE" in info
            if "FE3" in info:
                assert "FE2" in info
        return info
def _handle_lulayer_elt(self, elt): | |
"""Load a layer from an annotation set""" | |
layer = self._load_xml_attributes(AttrDict(), elt) | |
layer["_type"] = "lulayer" | |
layer["label"] = [] | |
for sub in elt: | |
if sub.tag.endswith("label"): | |
l = self._load_xml_attributes(AttrDict(), sub) | |
if l is not None: | |
layer["label"].append(l) | |
return layer | |
def _handle_fe_elt(self, elt): | |
feinfo = self._load_xml_attributes(AttrDict(), elt) | |
feinfo["_type"] = "fe" | |
feinfo["definition"] = "" | |
feinfo["definitionMarkup"] = "" | |
feinfo["semType"] = None | |
feinfo["requiresFE"] = None | |
feinfo["excludesFE"] = None | |
for sub in elt: | |
if sub.tag.endswith("definition"): | |
feinfo["definitionMarkup"] = sub.text | |
feinfo["definition"] = self._strip_tags(sub.text) | |
elif sub.tag.endswith("semType"): | |
stinfo = self._load_xml_attributes(AttrDict(), sub) | |
feinfo["semType"] = self.semtype(stinfo.ID) | |
elif sub.tag.endswith("requiresFE"): | |
feinfo["requiresFE"] = self._load_xml_attributes(AttrDict(), sub) | |
elif sub.tag.endswith("excludesFE"): | |
feinfo["excludesFE"] = self._load_xml_attributes(AttrDict(), sub) | |
return feinfo | |
def _handle_semtype_elt(self, elt, tagspec=None): | |
semt = self._load_xml_attributes(AttrDict(), elt) | |
semt["_type"] = "semtype" | |
semt["superType"] = None | |
semt["subTypes"] = PrettyList() | |
for sub in elt: | |
if sub.text is not None: | |
semt["definitionMarkup"] = sub.text | |
semt["definition"] = self._strip_tags(sub.text) | |
else: | |
supertypeinfo = self._load_xml_attributes(AttrDict(), sub) | |
semt["superType"] = supertypeinfo | |
# the supertype may not have been loaded yet | |
return semt | |
# | |
# Demo | |
# | |
def demo():
    """Interactive tour of the FrameNet corpus API.

    Builds the indexes, prints corpus statistics, and walks through
    frame, frame-relation, FE, LU, and full-text-document lookups.
    """
    from nltk.corpus import framenet as fn
    #
    # It is not necessary to explicitly build the indexes by calling
    # buildindexes(). We do this here just for demo purposes. If the
    # indexes are not built explicitly, they will be built as needed.
    #
    print("Building the indexes...")
    fn.buildindexes()
    #
    # Get some statistics about the corpus
    #
    print("Number of Frames:", len(fn.frames()))
    print("Number of Lexical Units:", len(fn.lus()))
    print("Number of annotated documents:", len(fn.docs()))
    print()
    #
    # Frames
    #
    print(
        'getting frames whose name matches the (case insensitive) regex: "(?i)medical"'
    )
    medframes = fn.frames(r"(?i)medical")
    print('Found {0} Frames whose name matches "(?i)medical":'.format(len(medframes)))
    print([(f.name, f.ID) for f in medframes])
    #
    # store the first frame in the list of frames
    #
    tmp_id = medframes[0].ID
    m_frame = fn.frame(tmp_id)  # reads all info for the frame
    #
    # get the frame relations
    #
    print(
        '\nNumber of frame relations for the "{0}" ({1}) frame:'.format(
            m_frame.name, m_frame.ID
        ),
        len(m_frame.frameRelations),
    )
    for fr in m_frame.frameRelations:
        print("  ", fr)
    #
    # get the names of the Frame Elements
    #
    print(
        '\nNumber of Frame Elements in the "{0}" frame:'.format(m_frame.name),
        len(m_frame.FE),
    )
    print("  ", [x for x in m_frame.FE])
    #
    # get the names of the "Core" Frame Elements
    #
    print('\nThe "core" Frame Elements in the "{0}" frame:'.format(m_frame.name))
    print("  ", [x.name for x in m_frame.FE.values() if x.coreType == "Core"])
    #
    # get all of the Lexical Units that are incorporated in the
    # 'Ailment' FE of the 'Medical_conditions' frame (id=239)
    #
    print('\nAll Lexical Units that are incorporated in the "Ailment" FE:')
    m_frame = fn.frame(239)
    ailment_lus = [
        x
        for x in m_frame.lexUnit.values()
        if "incorporatedFE" in x and x.incorporatedFE == "Ailment"
    ]
    print("  ", [x.name for x in ailment_lus])
    #
    # get all of the Lexical Units for the frame
    #
    print(
        '\nNumber of Lexical Units in the "{0}" frame:'.format(m_frame.name),
        len(m_frame.lexUnit),
    )
    print("  ", [x.name for x in m_frame.lexUnit.values()][:5], "...")
    #
    # get basic info on the second LU in the frame
    #
    tmp_id = m_frame.lexUnit["ailment.n"].ID  # grab the id of the specified LU
    luinfo = fn.lu_basic(tmp_id)  # get basic info on the LU
    print("\nInformation on the LU: {0}".format(luinfo.name))
    pprint(luinfo)
    #
    # Get a list of all of the corpora used for fulltext annotation
    #
    print("\nNames of all of the corpora used for fulltext annotation:")
    allcorpora = set(x.corpname for x in fn.docs_metadata())
    pprint(list(allcorpora))
    #
    # Get the names of the annotated documents in the first corpus
    #
    firstcorp = list(allcorpora)[0]
    firstcorp_docs = fn.docs(firstcorp)
    print('\nNames of the annotated documents in the "{0}" corpus:'.format(firstcorp))
    pprint([x.filename for x in firstcorp_docs])
    #
    # Search for frames containing LUs whose name attribute matches a
    # regexp pattern.
    #
    # Note: if you were going to be doing a lot of this type of
    #       searching, you'd want to build an index that maps from
    #       lemmas to frames because each time frames_by_lemma() is
    #       called, it has to search through ALL of the frame XML files
    #       in the db.
    print(
        '\nSearching for all Frames that have a lemma that matches the regexp: "^run.v$":'
    )
    pprint(fn.frames_by_lemma(r"^run.v$"))
if __name__ == "__main__":
    demo()
# Preconfigured corpus loaders. LazyCorpusLoader defers construction of the
# FramenetCorpusReader -- and hence parsing of the index files listed below --
# until the corpus is first accessed.
# FrameNet 1.5
framenet15 = LazyCorpusLoader(
    "framenet_v15",
    FramenetCorpusReader,
    [
        "frRelation.xml",
        "frameIndex.xml",
        "fulltextIndex.xml",
        "luIndex.xml",
        "semTypes.xml",
    ],
)
# FrameNet 1.7 (the default 'framenet' corpus)
framenet = LazyCorpusLoader(
    "framenet_v17",
    FramenetCorpusReader,
    [
        "frRelation.xml",
        "frameIndex.xml",
        "fulltextIndex.xml",
        "luIndex.xml",
        "semTypes.xml",
    ],
)