File size: 10,193 Bytes

c45d283

import re;
import sys;

import codec.mrp;
from graph import Edge, Graph;
from smatch.amr import AMR;

#
# placeholder of the form ‘__<n>__’, as substituted by amr_lines() for
# stashed constants and resolved again in amr2graph().
#
STASH = re.compile(r'__[0-9]+__');
#
# node identifier of the form ‘x<n>’ with zero or more ‘_<m>’ suffixes, as
# consumed by find_anchors(): group 1 is <n>, group 2 the full suffix string.
# NOTE(review): ‘(:?’ is a capturing group with an optional leading colon;
# this looks like a typo for the non-capturing ‘(?:’ -- confirm before
# changing, as a matched colon would make find_anchors() fail in int().
#
INDEX = re.compile(r'x([0-9]+)((:?_[0-9]+)*)');

def amr_lines(fp, camr, alignment):
    """Read blank-line-separated AMR graphs from .fp., one at a time.

    Arguments:
      fp        -- iterable of text lines; lines starting with ‘#’ are
                   comments, of which ‘# ::id’ and ‘# ::snt’ are interpreted;
      camr      -- when true, strip parenthesized material attached to
                   ‘:role(...)’ tokens and replace ‘x<n>/<value>’ constants
                   by ‘__<k>__’ placeholders recorded in the stash;
      alignment -- stream handed to read_alignment(), or None.

    Yields one tuple (id, snt, graph, stash, mapping) per graph, where
    .graph. is the space-joined graph text, .stash. a list of
    (k, name, value) triples for the placeholders of this graph, and
    .mapping. the alignment record whose identifier equals .id. (or None).
    """
    id, snt, lines = None, None, [];
    stash = dict();

    def _stash_(match):
        #
        # replace one ‘x<n>/<value>’ constant by its ‘__<k>__’ placeholder.
        # NOTE(review): only the text between the first and the second ‘/’
        # is kept as the value; confirm values cannot contain a slash.
        #
        prefix, constant, suffix = match.groups();
        fields = constant.split("/");
        name, value = fields[0], fields[1];
        if name in stash:
            if stash[name][2] != value:
                raise Exception("amr_lines(): "
                                "ambiguously defined constant in graph #{}, "
                                "‘{}’: ‘{}’ vs. ‘{}’; exit."
                                "".format(id, name, stash[name][2], value));
        else:
            stash[name] = (len(stash), name, value);
        return "{}__{}__{}".format(prefix, stash[name][0], suffix);

    def _flush_():
        #
        # pair the buffered graph with the next alignment record; running out
        # of (or failing to read) alignments is reported but not fatal.
        #
        i = mapping = None;
        try:
            i, mapping = next(alignment);
        except Exception:
            print("amr_lines(): missing alignment for graph #{}."
                  "".format(id), file = sys.stderr);
        #
        # hand out a snapshot of the stash: the dictionary itself is cleared
        # as soon as the generator resumes, which would empty a live view
        # that the consumer may still be holding on to.
        #
        return id, snt, " ".join(lines), list(stash.values()), \
            mapping if mapping is not None and i == id else None;

    alignment = read_alignment(alignment);
    for line in fp:
        line = line.strip();
        if len(line) == 0:
            if len(lines) > 0:
                yield _flush_();
            #
            # NOTE(review): .snt. is not reset here, so a graph without its
            # own ‘# ::snt’ line inherits the previous sentence -- confirm.
            #
            id, lines = None, []; stash.clear();
        elif line.startswith("#"):
            if line.startswith("# ::id"):
                fields = line.split();
                # tolerate a bare ‘# ::id’ line without an identifier
                if len(fields) > 2: id = fields[2];
            elif line.startswith("# ::snt"):
                snt = line[8:].strip();
        else:
            if camr:
                line = re.sub(r'((?:^|[ \t]):[^( ]+)\([^ \t]*\)([ \t]|$)',
                              "\\1\\2", line, count = 0);
                line = re.sub(r'(^|[ \t])(x[0-9]+/[^ \t]+)([ \t]|$)',
                              _stash_, line, count = 0);
            lines.append(line);
    # the input need not end in a blank line: emit a trailing graph, if any
    if len(lines) > 0:
        yield _flush_();

def read_alignment(stream):
    """Read blank-line-separated alignment records from .stream.

    When .stream. is None, yield (None, None) indefinitely.  Otherwise,
    yield one pair (id, alignment) at every blank line and once more at the
    end of the stream: .id. is the third token of the latest ‘# ::id’
    comment (or None), and .alignment. maps each node name to a dictionary
    from paths (tuples of strings) to sets of token indices.

    Data lines have two tab-separated fields: a space-separated node name
    with optional further path elements, and an inclusive ‘start-end’ token
    range.  A leading colon on the first path element is dropped, and
    ‘:wiki’ entries are skipped.  Lines with any other number of fields
    are ignored.
    """
    if stream is None:
        while True: yield None, None;
    else:
        id = None;
        alignment = dict();
        for line in stream:
            line = line.strip();
            if len(line) == 0:
                yield id, alignment;
                #
                # start a fresh dictionary rather than clearing the one just
                # handed out, so that earlier records remain intact.
                #
                id, alignment = None, dict();
            elif line.startswith("#"):
                if line.startswith("# ::id"):
                    id = line.split()[2];
            else:
                fields = line.split("\t");
                if len(fields) != 2: continue;
                start, end = fields[1].split("-");
                span = set(range(int(start), int(end) + 1));
                fields = fields[0].split();
                if len(fields) > 1 and fields[1].startswith(":"):
                    fields[1] = fields[1][1:];
                    if fields[1] == "wiki": continue;
                bucket = alignment.setdefault(fields[0], dict());
                # repeated entries for the same node and path are merged
                bucket.setdefault(tuple(fields[1:]), set()).update(span);
        yield id, alignment;

def _normalize_label(label):
    """Return the normalized variant of an edge label, or None.

    ‘mod’ is normalized to ‘domain’.  A label ending in ‘-of-of’ loses its
    final ‘-of’; a label ending in ‘-of’ loses that suffix unless it is
    ‘consist-of’ or ‘subset-of’ or starts with ‘prep-’.
    """
    if label == "mod":
        return "domain";
    if label.endswith("-of-of") \
       or (label.endswith("-of")
           and label not in {"consist-of", "subset-of"}
           and not label.startswith("prep-")):
        return label[:-3];
    return None;

def amr2graph(id, amr, text, stash, camr = False,
              full = False, reify = False, quiet = False, alignment = None):
    """Convert a parsed AMR into a Graph, plus an optional anchoring overlay.

    Arguments:
      id        -- identifier passed to the Graph constructor(s);
      amr       -- parsed AMR; its .nodes, .node_values, .attributes, and
                   .relations sequences are read in parallel;
      text      -- input string for the graph, or a false value for none;
      stash     -- iterable of (index, name, value) triples, used to resolve
                   ‘__<index>__’ placeholders in attribute values;
      camr      -- when true, anchor each space-separated token of the input
                   and derive node anchors from node names (find_anchors());
      full      -- when true, also keep ‘wiki’ attributes;
      reify     -- when true, turn attributes into separate nodes and edges
                   rather than node properties;
      quiet     -- passed through to Graph.add_input();
      alignment -- optional mapping from node names to {path: token set}
                   dictionaries, from which the overlay graph is built.

    Returns the pair (graph, overlay); .overlay. is None unless .alignment.
    is given.  Raises an Exception when a token cannot be anchored.
    """
    graph = Graph(id, flavor = 2, framework = "amr");
    node2id = dict();
    anchoring = list();

    #
    # alternate spellings to try when a token is not found verbatim; kept
    # as an ordered tuple so that the outcome does not depend on hash order.
    #
    variants = (("‘", "`"), ("‘", "'"), ("’", "'"), ("`", "'"),
                ("“", "\""), ("”", "\""),
                ("–", "--"), ("–", "---"), ("—", "---"),
                ("…", "..."), ("…", ". . ."));

    cursor = 0;
    def _anchor_(form):
        #
        # locate .form. in the input at or after .cursor. and advance the
        # cursor past it; returns the {"from", "to"} character range.
        #
        nonlocal cursor;
        length = None;
        start = graph.input.find(form, cursor);
        if start >= cursor:
            cursor, length = start, len(form);
        else:
            best, width = len(graph.input), 0;
            for old, new in variants:
                candidate = form.replace(old, new);
                start = graph.input.find(candidate, cursor);
                if start < cursor: continue;
                # earliest match wins; among equals, prefer the longest
                if start < best or start == best and len(candidate) > width:
                    best, width = start, len(candidate);
            if best < len(graph.input): cursor, length = best, width;
        if length:
            match = {"from": cursor, "to": cursor + length};
            cursor += length;
            return match;
        raise Exception("failed to anchor |{}| in |{}|{}| ({})"
                        "".format(form, graph.input[:cursor],
                                  graph.input[cursor:], cursor));

    if text:
        graph.add_input(text, quiet = quiet);
        if camr:
            for token in graph.input.split(" "):
                anchoring.append(_anchor_(token));

    #
    # nodes are numbered consecutively; with .reify., attribute values take
    # up identifiers of their own, right after their parent node.
    #
    next_id = 0;
    for name, label, attributes \
        in zip(amr.nodes, amr.node_values, amr.attributes):
        this = next_id;
        node2id[name] = this;
        top = any(key == "TOP" for key, _ in attributes);
        anchors = find_anchors(name, anchoring) if camr else None;
        node = graph.add_node(this, label = label, top = top,
                              anchors = anchors);
        next_id += 1;
        for key, val in attributes:
            if STASH.match(val) is not None:
                # swap the ‘__<k>__’ placeholder for the stashed value
                index = int(val[2:-2]);
                val = next(value for k, _, value in stash if k == index);
            if key != "TOP" and (key not in {"wiki"} or full):
                if val.endswith("¦"):
                    val = val[:-1];
                if reify:
                    graph.add_node(next_id, label = val);
                    graph.add_edge(this, next_id, key);
                    next_id += 1;
                else:
                    #
                    # _fix_me_
                    # this assumes that properties are unique.  (1-apr-20; oe)
                    #
                    node.set_property(key.lower(), str(val).lower());

    for src, relations in zip(amr.nodes, amr.relations):
        for label, tgt in relations:
            graph.add_edge(node2id[src], node2id[tgt],
                           label, _normalize_label(label));

    overlay = None;
    if alignment is not None:
        overlay = Graph(id, flavor = -1, framework = "anchoring");
        #
        # first pass: the empty path anchors the node itself
        #
        for name, paths in alignment.items():
            for path, span in paths.items():
                if len(path) == 0:
                    anchors = [{"#": token} for token in span];
                    overlay.add_node(node2id[name], anchors = anchors);
        #
        # second pass: a one-element path anchors a property of the node or,
        # failing that, an outgoing edge with a matching label
        #
        for name, paths in alignment.items():
            source = node2id[name];
            for path, span in paths.items():
                if len(path) == 1:
                    key = path[0].lower();
                    target = overlay.find_node(source);
                    if target is None: target = overlay.add_node(source);
                    reference = graph.find_node(source);
                    anchors = [{"#": token} for token in span];
                    if reference.properties is not None \
                       and key in reference.properties:
                        target.set_anchoring(key, anchors);
                    else:
                        #
                        # supply a default: a bare next() would raise
                        # StopIteration, which turns into a RuntimeError
                        # when called from within a generator like read().
                        #
                        edge = next((edge for edge in graph.edges
                                     if edge.lab.lower() == key
                                     and edge.src == source), None);
                        if edge is None:
                            print("amr2graph(): "
                                  "no property or edge ‘{}’ on node #{} ({}); "
                                  "ignoring alignment."
                                  "".format(key, source, name),
                                  file = sys.stderr);
                            continue;
                        overlay.edges.add(Edge(edge.id, None, None, None,
                                               anchors = anchors));
                elif len(path) > 1:
                    print("amr2graph(): "
                          "ignoring alignment path {} on node #{} ({})"
                          "".format(path, source, name),
                          file = sys.stderr);

    return graph, overlay;

def find_anchors(index, anchors):
    """Collect the anchors referenced by the node identifier .index.

    Every INDEX match in .index. names a token by its one-based position in
    .anchors. (a list of {"from", "to"} dictionaries).  Without a suffix,
    the anchor of that token is used as is; with ‘_<m>’ suffixes, each <m>
    contributes a one-character range at one-based offset <m> from the start
    of the token.  References outside of .anchors. are skipped.

    Returns the list of anchors, or None when there are none.
    """
    result = list();
    for match in INDEX.finditer(index):
        position, suffix = int(match.group(1)) - 1, match.group(2);
        #
        # guard both ends: a position of -1 (from ‘x0’) would otherwise wrap
        # around and silently pick up the last token.
        #
        if not 0 <= position < len(anchors): continue;
        anchor = anchors[position];
        if suffix == "":
            result.append(anchor);
            continue;
        start = anchor["from"];
        for field in suffix[1:].split("_"):
            offset = int(field);
            result.append({"from": start + offset - 1, "to": start + offset});
    return result if len(result) > 0 else None;

def convert_amr_id(id):
    """Map a ‘wsj_<d>.<s>’ or ‘lpp_1943.<s>’ identifier to a digit string.

    For ‘wsj_<d>.<s>’ the result is ‘2’, <d> padded to four digits, and <s>
    padded to three; for ‘lpp_1943.<s>’ it is ‘1’, <s> padded to four digits,
    and a final ‘0’.  The pattern may occur anywhere in .id., and the WSJ
    form takes precedence.  Raises an Exception for any other identifier.
    """
    wsj = re.search(r'wsj_([0-9]+)\.([0-9]+)', id);
    if wsj is not None:
        document, sentence = (int(group) for group in wsj.groups());
        return "2%04d%03d" % (document, sentence);
    lpp = re.search(r'lpp_1943\.([0-9]+)', id);
    if lpp is None:
        raise Exception('Could not convert id: %s' % id);
    return "1%04d0" % (int(lpp.group(1)));

def read(fp, full = False, reify = False, camr = False,
         text = None, alignment = None,
         quiet = False, trace = 0):
    """Read AMR graphs from .fp. and yield (graph, overlay) pairs.

    Arguments:
      fp        -- iterable of text lines, consumed through amr_lines();
      full, reify, camr, quiet
                -- passed through to amr2graph() (.camr. also to
                   amr_lines());
      text      -- input string to use instead of the ‘# ::snt’ sentence
                   of each graph, when true;
      alignment -- alignment stream for amr_lines(), or None;
      trace     -- when true, echo each graph to standard error.

    Graph identifiers are converted with convert_amr_id() where possible
    and kept unchanged otherwise; graphs without an identifier are numbered
    consecutively from zero.  Raises an Exception when a graph fails to
    parse.
    """
    n = 0;
    for id, snt, amr_line, stash, mapping in amr_lines(fp, camr, alignment):
        if trace:
            print("{}: {}".format(id, amr_line), file = sys.stderr);
        amr = AMR.parse_AMR_line(amr_line);
        if not amr:
            raise Exception("failed to parse #{} ‘{}’; exit."
                            "".format(id, amr_line));
        if id is not None:
            try:
                id = convert_amr_id(id);
            except Exception:
                #
                # best effort: identifiers in an unrecognized format are
                # kept as they are; unlike a bare ‘except’, this does not
                # swallow KeyboardInterrupt or SystemExit.
                #
                pass;
        else:
            id = n;
            n += 1;
        graph, overlay = amr2graph(id, amr, text or snt, stash,
                                   camr, full, reify, quiet, mapping);
        yield graph, overlay;