#!/usr/bin/env python3 # -*- coding: utf-8; -*- import argparse; import json; import multiprocessing as mp; import re; import sys; import time; from pathlib import Path; from zipfile import ZipFile; import codec.amr; import codec.conllu; import codec.eds; import codec.mrp; import codec.norec; import codec.pmb; import codec.sdp; import codec.treex; import codec.ucca; import inspector; import score.edm; import score.mces; import score.sdp; import score.smatch; import score.ucca; import validate.core; from analyzer import analyze; __author__ = "oe" ENCODING = "utf-8"; NORMALIZATIONS = {"anchors", "case", "edges", "attributes"}; VALIDATIONS = {"input", "anchors", "edges", "amr", "eds", "sdp", "ucca"} def read_graphs(stream, format = None, full = False, normalize = False, reify = False, node_centric = False, frameworks = None, prefix = None, text = None, filter = None, trace = 0, strict = 0, quiet = False, robust = False, alignment = None, anchors = None, pretty = False, id = None, n = None, i = None): name = getattr(stream, "name", ""); if name.endswith(".zip"): with ZipFile(name) as zip: stream = None; for entry in zip.namelist(): if entry.endswith(".mrp"): if stream is not None: print("read_graphs(): multiple MRP entries in ‘{}’; exit." "".format(name), file = sys.stderr); sys.exit(1); stream = zip.open(entry); if stream is None: print("read_graphs(): missing MRP entry in ‘{}’; exit." "".format(name), file = sys.stderr); sys.exit(1); generator = None; if format in {"amr", "camr"}: generator \ = codec.amr.read(stream, full = full, reify = reify, text = text, camr = format == "camr", alignment = alignment, quiet = quiet, trace = trace); elif format in {"ccd", "dm", "pas", "psd"}: generator = codec.sdp.read(stream, framework = format, text = text); elif format == "eds": generator = codec.eds.read(stream, reify = reify, text = text); elif format == "mrp": generator = codec.mrp.read(stream, text = text, robust = robust); elif format == "norec": generator = codec.norec.read(stream, text = text, node_centric = node_centric); elif format == "pmb": generator = codec.pmb.read(stream, full = full, reify = reify, text = text, trace = trace, strict = strict); elif format == "treex": generator = codec.treex.read(stream) elif format == "ucca": generator = codec.ucca.read(stream, text = text, prefix = prefix); elif format == "conllu" or format == "ud": generator = codec.conllu.read(stream, framework = format, text = text, anchors = anchors, trace = trace); elif format == "eud": generator = codec.conllu.read(stream, framework = format, text = text, anchors = anchors, trace = trace, enhanced_graph = True); else: print("read_graphs(): invalid input codec {}; exit." "".format(format), file = sys.stderr); sys.exit(1); if generator is None: return None, None; # # (for now) break out of the generators, for downstream simplicity # graphs = []; overlays = []; j = 0; while n is None or n < 1 or j < n: try: graph, overlay = next(generator); if frameworks is not None and graph.framework not in frameworks: continue; if filter is not None and graph.id not in filter: continue; if id is not None: if graph.id == id: graphs.append(graph); overlays.append(overlay); elif i is not None and i >= 0: if j == i: graphs.append(graph); overlays.append(overlay); break; else: graphs.append(graph); overlays.append(overlay); j += 1; except StopIteration: break; except Exception as error: print(error, file = sys.stderr); pass; if pretty: for graph in graphs: graph.prettify(trace); if normalize: for graph in graphs: graph.normalize(normalize, trace); return graphs, overlays; def main(args=None): parser = argparse.ArgumentParser(description = "MRP Graph Toolkit"); parser.add_argument("--inspect", action = "store_true"); parser.add_argument("--analyze", action = "store_true"); parser.add_argument("--normalize", action = "append", default = []); parser.add_argument("--full", action = "store_true"); parser.add_argument("--reify", action = "store_true"); parser.add_argument("--node_centric", action = "store_true"); parser.add_argument("--unique", action = "store_true"); parser.add_argument("--ids", action = "store_true"); parser.add_argument("--strings", action = "store_true"); parser.add_argument("--framework", action = "append", default = []); parser.add_argument("--gold", type = argparse.FileType("r", encoding = ENCODING)); parser.add_argument("--alignment", type = argparse.FileType("r", encoding = ENCODING)); parser.add_argument("--overlay", type = argparse.FileType("w", encoding = ENCODING)); parser.add_argument("--format"); parser.add_argument("--score"); parser.add_argument("--validate", action = "append", default = []); parser.add_argument("--limit"); parser.add_argument("--read", required = True); parser.add_argument("--write"); parser.add_argument("--text"); parser.add_argument("--inverse", action = "store_true"); parser.add_argument("--anchors", type = argparse.FileType("r", encoding = ENCODING)); parser.add_argument("--prefix"); parser.add_argument("--source"); parser.add_argument("--targets"); parser.add_argument("--pretty", action = "store_true"); parser.add_argument("--inject"); parser.add_argument("--version", type = float, default = 1.1); parser.add_argument("--cores", type = int, default = 1); parser.add_argument("--i", type = int); parser.add_argument("--n", type = int); parser.add_argument("--id"); parser.add_argument("--filter"); parser.add_argument("--quiet", action = "store_true"); parser.add_argument("--robust", action = "store_true"); parser.add_argument("--trace", "-t", action = "count", default = 0); parser.add_argument("--strict", action = "count", default = 0); parser.add_argument("--errors", type = argparse.FileType("w", encoding = ENCODING)); parser.add_argument("input", nargs = "?", type = argparse.FileType("r", encoding = ENCODING), default = sys.stdin); parser.add_argument("output", nargs = "?", type = argparse.FileType("w", encoding = ENCODING), default = sys.stdout); if args is None: args = sys.argv arguments = parser.parse_args(args); text = None; if arguments.text is not None: path = Path(arguments.text); if path.is_file(): text = {}; with path.open() as stream: for line in stream: id, string = line.split("\t", maxsplit = 1); if string.endswith("\n"): string = string[:len(string) - 1]; if arguments.inverse: text[string] = id; else: text[id] = string; elif path.is_dir(): text = path; elif arguments.inverse: print("main.py(): option ‘--inverse’ requires ‘--text’; exit.", file = sys.stderr); sys.exit(1); if arguments.read not in {"mrp", "ccd", "dm", "pas", "psd", "treex", "eds", "ucca", "amr", "camr", "pmb", "conllu", "ud", "eud", "norec"}: print("main.py(): invalid input format: {}; exit." "".format(arguments.read), file = sys.stderr); sys.exit(1); filter = None; if arguments.filter is not None: try: path = Path(arguments.filter); filter = set(); with path.open() as stream: for line in stream: filter.add(line.split("\t", maxsplit = 1)[0]); except: print("main.py(): invalid ‘--filter’: {}; exit." "".format(arguments.write), file = sys.stderr); sys.exit(1); if filter is not None and len(filter) == 0: filter = None; if arguments.write is not None and \ arguments.write not in \ {"dot", "tikz", "displacy", "evaluation", "id", "json", "mrp", "source", "targets", "txt", "ucca", "norec"}: print("main.py(): invalid output format: {}; exit." "".format(arguments.write), file = sys.stderr); sys.exit(1); # # backwards compatibility: desirable until august 2019, say # if arguments.score == "mces": arguments.score = "mrp"; if arguments.score is not None and \ arguments.score not in {"mrp", "sdp", "edm", "ucca", "smatch"}: print("main.py(): invalid evaluation metric: {}; exit." "".format(arguments.score), file = sys.stderr); sys.exit(1); if arguments.format and \ arguments.format not in {"mrp", "ccd", "dm", "pas", "psd", "eds", "ucca", "amr", "camr", "pmb", "conllu", "ud", "eud"}: print("main.py(): invalid gold format: {}; exit." "".format(arguments.read), file = sys.stderr); sys.exit(1); if len(arguments.normalize) == 1 and arguments.normalize[0] == "all": normalize = NORMALIZATIONS; else: normalize = set(); for action in arguments.normalize: if action in NORMALIZATIONS: normalize.add(action); else: print("main.py(): invalid type of normalization: {}; exit." "".format(action), file = sys.stderr); sys.exit(1); if arguments.score is not None and len(normalize) == 0: normalize = NORMALIZATIONS; if arguments.targets == "gather" and not arguments.unique: print("main.py(): option ‘--targets gather’ requires ‘--unique’; exit.", file = sys.stderr); sys.exit(1); if arguments.alignment is not None and arguments.overlay is None: print("main.py(): option ‘--alignment’ requires ‘--overlay’; exit.", file = sys.stderr); sys.exit(1); if len(arguments.framework) == 0: arguments.framework = None; if arguments.cores == 0: arguments.cores = mp.cpu_count(); graphs, overlays \ = read_graphs(arguments.input, format = arguments.read, full = arguments.full, normalize = normalize, reify = arguments.reify, frameworks = arguments.framework, text = text, filter = filter, alignment = arguments.alignment, anchors = arguments.anchors, pretty = arguments.pretty, trace = arguments.trace, strict = arguments.strict, node_centric = arguments.node_centric, quiet = arguments.quiet, robust = arguments.robust, id = arguments.id, n = arguments.n, i = arguments.i); if graphs is None: print("main.py(): unable to read input graphs: {}; exit." "".format(arguments.input.name), file = sys.stderr); sys.exit(1); if arguments.unique: targets = dict(); if arguments.targets == "gather": for graph in graphs: if graph.id in targets: targets[graph.id].add(graph.framework); else: targets[graph.id] = {graph.framework}; arguments.targets = None; unique = list(); ids = set(); for graph in graphs: id = graph.id; if id in targets: graph.targets(list(targets[id])); if id not in ids: ids.add(id); unique.append(graph); graphs = unique; # # inject any additional information provided on the command line # if arguments.source: for graph in graphs: graph.source(arguments.source); if arguments.inject: for graph in graphs: graph.inject(arguments.inject); if arguments.validate == ["all"]: actions = VALIDATIONS; else: actions = set(); for action in arguments.validate: if action in VALIDATIONS: actions.add(action); else: print("main.py(): invalid type of validation: {}; exit." "".format(action), file = sys.stderr); sys.exit(1); if arguments.quiet: arguments.trace = 0; if actions: for graph in graphs: validate.core.test(graph, actions, stream = sys.stderr); if arguments.analyze: analyze(graphs); gold = None; if arguments.gold and arguments.score or arguments.inspect: if arguments.format is None: arguments.format = arguments.read; gold, _ = read_graphs(arguments.gold, format = arguments.format, full = arguments.full, normalize = normalize, reify = arguments.reify, node_centric = arguments.node_centric, frameworks = arguments.framework, text = text, filter = filter, trace = arguments.trace, quiet = arguments.quiet, robust = arguments.robust, id = arguments.id, n = arguments.n, i = arguments.i); if gold is None: print("main.py(): unable to read gold graphs: {}; exit." "".format(arguments.gold.name), file = sys.stderr); sys.exit(1); if arguments.inspect: result = inspector.summarize(graphs, gold); if arguments.write == "json" or True: json.dump(result, arguments.output, indent = None); print(file = arguments.output); sys.exit(0); if arguments.score: limits = {"rrhc": None, "mces": None}; for metric in arguments.score.split(","): if arguments.limit is not None: try: match = re.search(r"([0-9]+):([0-9]+)", arguments.limit) if match: limits["rrhc"] = int(match.group(1)); limits["mces"] = int(match.group(2)); else: if metric == "smatch": limits["rrhc"] = int(arguments.limit); else: limits["mces"] = int(arguments.limit); except: print("main.py(): invalid ‘--limit’ {}; exit." "".format(arguments.limit), file = sys.stderr); sys.exit(1); errors = dict() if arguments.errors else None; result = None; launch = time.time(), time.process_time(); if metric == "edm": result = score.edm.evaluate(gold, graphs, format = arguments.write, trace = arguments.trace); elif metric == "mrp": result = score.mces.evaluate(gold, graphs, format = arguments.write, limits = limits, cores = arguments.cores, trace = arguments.trace, errors = errors, quiet = arguments.quiet); elif metric == "sdp": result = score.sdp.evaluate(gold, graphs, format = arguments.write, trace = arguments.trace); elif metric == "smatch": result = score.smatch.evaluate(gold, graphs, format = arguments.write, limit = limits["rrhc"], values = {"tops", "labels", "properties", "anchors", "edges", "attributes"}, trace = arguments.trace); elif metric == "ucca": result = score.ucca.evaluate(gold, graphs, format = arguments.write, trace = arguments.trace); if result is not None: result["time"] = time.time() - launch[0]; result["cpu"] = time.process_time() - launch[1]; if arguments.write == "json" or True: # # _fix_me_ # we should write a genuine custom JSON encoder # print("{", file = arguments.output, end = ""); start = True; for key in result: if start: start = False; else: print(",\n ", file = arguments.output, end = ""); print("\"{}\": ".format(key), file = arguments.output, end = ""); json.dump(result[key], arguments.output, indent = None); print("}", file = arguments.output); if errors is not None: if arguments.write == "dot": for graph in gold: graph.dot(arguments.errors, ids = arguments.ids, strings = arguments.strings, errors = errors[graph.framework][graph.id]); elif arguments.write == "json" or True: json.dump(errors, arguments.errors, indent = None); sys.exit(0); for graph in graphs: if arguments.write in {"mrp", "evaluation"}: if arguments.write == "evaluation": graph.flavor = graph.framework = graph.nodes = graph.edges = None; if arguments.targets is not None: graph.targets(arguments.targets.split(",")); json.dump(graph.encode(arguments.version), arguments.output, indent = None, ensure_ascii = False); print(file = arguments.output); elif arguments.write == "dot": graph.dot(arguments.output, ids = arguments.ids, strings = arguments.strings); print(file = arguments.output); elif arguments.write == "tikz": graph.tikz(arguments.output); elif arguments.write == "displacy": graph.displacy(arguments.output); elif arguments.write == "id": print("{}".format(graph.id), file = arguments.output); elif arguments.write == "source": print("{}\t{}".format(graph.id, graph.source()), file = arguments.output); elif arguments.write == "targets": for target in graph.targets() or (""): print("{}\t{}".format(graph.id, target), file = arguments.output); elif arguments.write == "txt": print("{}\t{}".format(graph.id, graph.input), file = arguments.output); elif arguments.write == "ucca": # Prints everything to one long file. To split to separate XML files, use, e.g., # csplit -zk output.xml '/^