# Web-viewer residue, not part of the module: uploaded by larkkin, commit c45d283 ("Add code and readme"), raw size 10.3 kB, no virus found.
from operator import itemgetter;
import os.path;
import re;
import sys;
from graph import Graph;
#
# edge (or, in reification mode, node) labels for the PMB comparison
# operators.  the symbols are spelled as escapes, so that they survive a
# re-encoding of this file; the trailing comments name each character.
#
# NOTE(review): the glyph shared by STI and STO was reconstructed from a
# damaged copy in which its final byte was lost; U+228D is the most likely
# reading, but please confirm against the upstream source.
#
conditions = {
    "APX": "\u2248",  # ALMOST EQUAL TO
    "EQU": "=",
    "LEQ": "\u2264",  # LESS-THAN OR EQUAL TO
    "LES": "<",
    "NEQ": "\u2260",  # NOT EQUAL TO
    "SXN": "\u00ab",  # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    "SXP": "\u00bb",  # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    "SXY": "\u2256",  # RING IN EQUAL TO
    "SZN": "\\",
    "SZP": "/",
    "STI": "\u228d",  # MULTISET MULTIPLICATION (see note above)
    "STO": "\u228d",  # MULTISET MULTIPLICATION (see note above)
    "SY1": "\u2225",  # PARALLEL TO
    "SY2": "\u26ae",  # DIVORCE SYMBOL
    "TAB": "\u22c8",  # BOWTIE
    "TPR": "\u227a",  # PRECEDES
}
#
# clause patterns, assembled from shared fragments.  read() tries them from
# most to least specific: referent, condition, role, concept, discourse, and
# finally the 'empty' clause that carries nothing but a trailing comment.
#
_BOX = r'(b[0-9]+)'
_REFERENT = r'([enpstx][0-9]+)'
_ARGUMENT = r'([enpstx][0-9]+|"[^"]+")'
_SYMBOL = r'([^ ]+)'
_OPERATOR = r'(EQU|NEQ|APX|LE[SQ]|TPR|TAB|S[ZX][PN]|ST[IO]|SY[12]|SXY)'
# optional trailing comment ending in a [from...to] range, with the two
# offsets captured ...
_SPAN = r'(?: .* \[([0-9]+)\.\.\.([0-9]+)\])?$'
# ... and the same comment without capturing the offsets.
_NO_SPAN = r'(?: .* \[[0-9]+\.\.\.[0-9]+\])?$'

id_matcher = re.compile(
    r'^%%% bin/boxer --input (?:[^/]+/)?p([0-9]+)/d([0-9]+)/')
referent_matcher = re.compile(
    '^' + _BOX + ' REF ' + _REFERENT + ' +%' + _SPAN)
condition_matcher = re.compile(
    '^' + _BOX + ' ' + _OPERATOR + ' ' + _ARGUMENT + ' ' + _ARGUMENT
    + ' +%' + _SPAN)
role_matcher = re.compile(
    '^' + _BOX + ' ' + _SYMBOL + ' ' + _REFERENT + ' ' + _ARGUMENT
    + ' +%' + _SPAN)
concept_matcher = re.compile(
    '^' + _BOX + ' ' + _SYMBOL + ' ("[^ ]+") ' + _REFERENT + ' +%' + _SPAN)
discourse_matcher = re.compile(
    '^' + _BOX + ' ' + _SYMBOL + ' ' + _BOX + '(?: ' + _BOX + ')? +%'
    + _NO_SPAN)
empty_matcher = re.compile('^ *%' + _NO_SPAN)
def read(fp, text=None, full=False, reify=False, trace=0, strict=0):
    """Parse PMB clause files from `fp`, yielding one graph per block.

    Blocks of clauses are separated by empty lines.  Each block opens with
    three header lines: the first is skipped, the second must match
    `id_matcher` and supplies the graph identifier, and the third supplies
    the input string.  Every following line is matched against the clause
    patterns, from most to least specific.

    Parameters:
      fp      iterable of text lines (e.g. an open file)
      text    optional mapping from graph identifier to input string; when
              the identifier of a block is found there, that string is used
              instead of the one from the header
      full    in reification mode, link every reified node to its box
      reify   represent roles and conditions as nodes of their own (types 2
              and 3, respectively) instead of as labelled edges
      trace   when non-zero, echo each input line; when above 1, also
              report ternary discourse relations
      strict  when non-zero, raise on a referent that re-appears in another
              box (reification mode only) and on a second concept clause
              for the same referent

    Yields:
      ``(graph, None)`` tuples, one per block of clauses.

    Raises:
      Exception: on a header without identifier, on a line matching none of
      the clause patterns, and on the `strict` violations described above.
    """
    member = "\u2208"  # ELEMENT OF: label on box membership edges

    def span(start, end):
        """Return a fresh anchor for a matched character range, or None."""
        if start is None or end is None:
            return None
        return {"from": int(start), "to": int(end)}

    def box_node(graph, mapping, box):
        """Return the node for `box`, creating it (type 0) on first use."""
        if box not in mapping:
            mapping[box] = graph.add_node(type=0)
        return mapping[box]

    def argument_node(graph, mapping, name, start, end):
        """Return the node for a clause argument, creating it if need be.

        Quoted constants become labelled, anchored nodes; referents that
        have not been introduced yet become plain placeholder nodes.
        """
        if name not in mapping:
            if name[0] == "\"" and name[-1] == "\"":
                anchor = span(start, end)
                mapping[name] = graph.add_node(
                    label=name, anchors=[anchor] if anchor else None)
            else:
                mapping[name] = graph.add_node()
        return mapping[name]

    def referent_node(graph, mapping, referent, start, end):
        """Return the node for `referent`, recording the current anchor."""
        anchor = span(start, end)
        if referent in mapping:
            node = mapping[referent]
            node.add_anchor(anchor)
        else:
            node = mapping[referent] = graph.add_node(
                anchors=[anchor] if anchor else None)
        return node

    def claim(scopes, referent, box, i):
        """Record `box` as the home of `referent` on its first appearance."""
        if referent not in scopes:
            scopes[referent] = {box}
        elif strict and reify and box not in scopes[referent]:
            raise Exception("pmb.read(): "
                            "[line {}] stray referent ‘{}’ in box ‘{}’ "
                            "(instead of ‘{}’); exit."
                            "".format(i, referent, box, scopes[referent]))

    def relate(graph, mapping, finis, scopes,
               box, label, kind, source, target, start, end):
        """Connect `source` and `target`, by an edge or a reified node."""
        tail = argument_node(graph, mapping, source, start, end)
        head = argument_node(graph, mapping, target, start, end)
        if reify:
            box_node(graph, mapping, box)
            node = graph.add_node(label=label, type=kind)
            # box membership of the new node is decided in finish(), once
            # the scopes of all referents are known.
            finis.append((box, source, node))
            graph.add_edge(tail.id, node.id, None)
            graph.add_edge(node.id, head.id, None)
        else:
            scopes.setdefault(source, set()).add(box)
            graph.add_edge(tail.id, head.id, label)

    def finish(graph, mapping, finis, scopes):
        """Add pending membership edges and mark root boxes as top nodes."""
        if reify:
            for box, referent, node in finis:
                #
                # in full reification mode, or when the corresponding box
                # cannot be easily inferred for a reified role (including
                # when the source node is a constant, as e.g. in a 'future'
                # temporal discourse condition, or a referent that was never
                # introduced), add an explicit box membership edge.
                #
                if (full
                        or referent[0] == referent[-1] == "\""
                        or box not in scopes.get(referent, ())):
                    graph.add_edge(mapping[box].id, node.id, member)
        else:
            for referent, boxes in scopes.items():
                if len(boxes) > 1:
                    print("pmb.read(): [graph #{}] stray referent ‘{}’ "
                          "in boxes {}."
                          "".format(graph.id, referent, boxes),
                          file=sys.stderr)
        #
        # after the fact, mark all boxes that structurally are roots as top
        # nodes.
        #
        for node in graph.nodes:
            if node.type == 0 and node.is_root():
                node.is_top = True

    graph = identifier = sentence = None
    mapping, scopes, finis = {}, {}, []
    header = 3
    for i, line in enumerate(fp, start=1):
        line = line.rstrip()
        if trace:
            print("{}: {}".format(i, line))
        #
        # to support newline-separated concatenations of clause files (a
        # format not used in the native PMB 3.0 release), an empty line
        # closes the current block.  leading, repeated, or trailing empty
        # lines find no graph under construction and are skipped.
        #
        if not line:
            if graph is not None:
                finish(graph, mapping, finis, scopes)
                yield graph, None
            graph = identifier = None
            mapping, scopes, finis = {}, {}, []
            header = 3
            continue
        #
        # each block of clauses is preceded by three comment lines, which we
        # use to extract the sentence identifier and underlying string.
        #
        if header:
            if header == 2:
                match = id_matcher.match(line)
                if match is None:
                    raise Exception("pmb.read(): "
                                    "[line {}] missing identifier in ‘{}’; "
                                    "exit.".format(i, line))
                part, document = match.groups()
                identifier = "{:02d}{:04d}".format(int(part), int(document))
            elif header == 1:
                if text is not None and identifier in text:
                    sentence = text[identifier]
                else:
                    # NOTE(review): drops the first five and the last
                    # character of the header line; presumably a fixed
                    # prefix and a closing delimiter -- confirm.
                    sentence = line[5:-1]
                graph = Graph(identifier, flavor=2, framework="drg")
                graph.add_input(sentence)
            header -= 1
            continue
        #
        # from here onwards, we are looking at genuine, contentful clauses.
        # from inspecting some of the files, it appears they are organized
        # according to surface (reading) order, and we cannot assume that
        # discourse referents are 'introduced' (in some box) prior to their
        # first occurrence in e.g. a role or concept clause.
        #
        match = referent_matcher.match(line)
        if match is not None:
            box, referent, start, end = match.groups()
            claim(scopes, referent, box, i)
            home = box_node(graph, mapping, box)
            node = referent_node(graph, mapping, referent, start, end)
            graph.add_edge(home.id, node.id, member)
            continue

        match = condition_matcher.match(line)
        if match is not None:
            box, condition, source, target, start, end = match.groups()
            relate(graph, mapping, finis, scopes,
                   box, conditions[condition], 3, source, target, start, end)
            continue

        match = role_matcher.match(line)
        if match is not None:
            box, role, source, target, start, end = match.groups()
            relate(graph, mapping, finis, scopes,
                   box, role, 2, source, target, start, end)
            continue

        match = concept_matcher.match(line)
        if match is not None:
            box, lemma, sense, referent, start, end = match.groups()
            claim(scopes, referent, box, i)
            node = referent_node(graph, mapping, referent, start, end)
            if strict and node.label is not None:
                raise Exception("pmb.read(): "
                                "[line {}] duplicate label ‘{}’ on referent "
                                "‘{}’ (instead of ‘{}’); exit."
                                "".format(i, lemma, referent, node.label))
            node.label = lemma
            if sense[0] == sense[-1] == "\"":
                sense = sense[1:-1]
            node.set_property("sense", sense)
            continue

        match = discourse_matcher.match(line)
        if match is not None:
            top, relation, one, two = match.groups()
            first = box_node(graph, mapping, one)
            if two is not None:
                # with three boxes, the relation holds between the second
                # and the third; the first is not connected here.
                if trace > 1:
                    print("ternary discourse relation")
                second = box_node(graph, mapping, two)
                graph.add_edge(first.id, second.id, relation)
            else:
                parent = box_node(graph, mapping, top)
                graph.add_edge(parent.id, first.id, relation)
            continue

        if empty_matcher.search(line) is None:
            raise Exception("pmb.read(): [line {}] invalid clause ‘{}’."
                            "".format(i, line))
    #
    # finally, as we reach an end of file (without an empty line terminating
    # the preceding block of clauses, as is the standard format in PMB),
    # finalize the graph and return it.
    #
    if graph is not None:
        finish(graph, mapping, finis, scopes)
        yield graph, None