# ltg
# /
#
# larkkin's picture
# Add code and readme
# c45d283
# raw
# history blame
# 2.07 kB
import json;
import operator;
import os;
import sys;
from graph import Graph
def read(fp, text=None, robust=False):
    """Decode one graph per line of *fp* and yield ``(graph, None)`` pairs.

    Every line is stripped of trailing whitespace, parsed as JSON and passed
    to ``Graph.decode``.  When *text* is given, the graph is then tied to it:

    * if ``graph.input`` is found in *text*, ``text[graph.input]`` is stored
      as ``graph.id``;
    * otherwise the current ``graph.input`` is remembered,
      ``graph.add_input(text)`` is called, and every node anchor is re-aligned
      from the remembered string onto the new ``graph.input``.

    Args:
        fp: Iterable of lines, each holding one JSON-encoded graph.
        text: Optional object consulted as described above.  It is indexed
            with ``graph.input`` and handed to ``graph.add_input``; what
            ``add_input`` expects is not visible here (presumably a mapping
            -- confirm against ``Graph``).
        robust: Forwarded unchanged to ``Graph.decode``.

    Yields:
        ``(graph, None)`` for every line that was processed successfully.

    Any ``Exception`` raised while handling a line (invalid JSON, a decoding
    problem, an anchor that cannot be re-aligned, ...) is reported on
    ``sys.stderr`` together with the zero-based line index, and that line is
    skipped; the generator itself keeps going.
    """
    # Typographic characters paired with ASCII renderings they may have been
    # normalised to.  They are written as escapes so that a wrongly decoded
    # source file cannot silently corrupt the table, and kept in a tuple (not
    # a set) so the search order -- and thereby the winner among variants that
    # match at the same offset -- is identical on every run.
    variants = (
        ("\u2018", "`"), ("\u2018", "'"),        # left single quotation mark
        ("\u2019", "'"),                         # right single quotation mark
        ("`", "'"),
        ("\u201c", "\""), ("\u201d", "\""),      # left / right double quotation mark
        ("\u2013", "--"), ("\u2013", "---"),     # en dash
        ("\u2014", "---"),                       # em dash
        ("\u2026", "..."), ("\u2026", ". . ."),  # horizontal ellipsis
    )

    # State shared by the two helpers below: the string currently being
    # searched and the offset from which the next search starts.
    haystack, cursor = None, 0

    def compute(form):
        """Locate *form* in ``haystack`` at or after ``cursor``.

        Returns ``{"from": start, "to": end}`` for the match and advances
        ``cursor`` to ``end``.  If the literal form is absent, each entry of
        ``variants`` is applied (one at a time) and the earliest resulting
        match is used.  Raises ``Exception`` when nothing matches; a
        zero-length match counts as a failure as well.
        """
        nonlocal cursor
        length = None
        found = haystack.find(form, cursor)
        if found >= cursor:
            cursor, length = found, len(form)
        else:
            # ``len(haystack)`` acts as the "nothing found yet" sentinel.
            best_start, best_length = len(haystack), 0
            for old, new in variants:
                if old not in form:
                    # The replacement would leave the form unchanged, and the
                    # unchanged form is already known to be absent.
                    continue
                candidate = form.replace(old, new)
                found = haystack.find(candidate, cursor)
                if cursor <= found < best_start:
                    best_start, best_length = found, len(candidate)
            if best_start < len(haystack):
                cursor, length = best_start, best_length
        if not length:
            # Report the form that was actually requested, not whichever
            # variant happened to be tried last.
            raise Exception("failed to anchor |{}| in |{}|{}| ({})"
                            "".format(form, haystack[:cursor],
                                      haystack[cursor:], cursor))
        match = {"from": cursor, "to": cursor + length}
        cursor += length
        return match

    def anchor(graph, old, new):
        """Re-align all node anchors of *graph* from string *old* onto *new*."""
        nonlocal haystack, cursor
        # Collect the surface string behind every distinct (from, to) span.
        spans = {}
        for node in graph.nodes:
            for current in node.anchors or ():
                start, end = current["from"], current["to"]
                spans[(start, end)] = old[start:end]
        # Search left to right so each match starts where the previous ended.
        haystack, cursor = new, 0
        for key in sorted(spans):
            spans[key] = compute(spans[key])
        # Swap every anchor for its re-aligned counterpart, in place; anchors
        # with identical spans end up sharing one dictionary object.
        for node in graph.nodes:
            for j, current in enumerate(node.anchors or ()):
                node.anchors[j] = spans[(current["from"], current["to"])]

    for j, line in enumerate(fp):
        try:
            graph = Graph.decode(json.loads(line.rstrip()), robust=robust)
            if text is not None:
                if graph.input in text:
                    graph.id = text[graph.input]
                else:
                    old = graph.input
                    graph.add_input(text)
                    anchor(graph, old, graph.input)
            yield graph, None
        except Exception as error:
            # Best effort by design: one bad line must not abort the stream.
            print("codec.mrp.read(): ignoring line {}: {}"
                  "".format(j, error), file=sys.stderr)