File size: 3,585 Bytes
c45d283
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os.path;
import re;

from graph import Graph;

# One node line: `<id>:<label>[<arguments>]`.  The negative look-behinds
# keep a backslash-escaped `:`, `[` or `]` from acting as a delimiter.
EDS_MATCHER = re.compile(r'(.+?)(?<!\\):(.+)(?<!\\)\[(.*)(?<!\\)\]')
# A `{...}` block at the very end of a label; group 1 is its content.
PROPERTIES_MATCHER = re.compile(r"{(.+)}$")
# A `("...")` suffix at the very end of a label; group 1 is the quoted text.
CARG_MATCHER = re.compile(r'\(\"(.+)(?<!\\)"\)$')
# A `<from:to>` suffix of two integers at the very end of a label.
LNK_MATCHER = re.compile(r"<([0-9]+):([0-9]+)>$")

def read_instances(fp):
    """Yield one ``(sentence_id, top_handle, predicates)`` triple per graph.

    The input is consumed line by line; each line is stripped first:

    * blank lines are ignored;
    * ``#<id>`` sets the identifier (kept as a string) for the next graph;
    * a line starting with ``{`` opens a graph; the text between the brace
      and the first ``:`` is the top handle;
    * a lone ``}`` closes the graph and yields it;
    * every other line must match ``EDS_MATCHER`` and is stored as
      ``(node_id, label, arguments)``, where ``arguments`` is a list of
      tuples made by whitespace-splitting each comma-separated argument.

    If the base name of ``fp.name`` (extension removed) parses as an
    integer, that integer is the initial identifier.  After a graph has
    been yielded, further ``}`` lines are ignored until the next ``#`` line.

    Args:
        fp: iterable of text lines, typically an open file.

    Yields:
        ``(sentence_id, top_handle, predicates)`` tuples.

    Raises:
        AssertionError: if a closing line is not exactly ``}``, if a graph
            is closed without an identifier, top handle or any predicate,
            or if a node line does not match ``EDS_MATCHER``.
        ValueError: if a line starting with ``{`` contains no ``:``.
    """
    top_handle, predicates = None, []
    sentence_id = None
    # Best effort only: ``fp`` may have no ``name`` (AttributeError), a
    # non-path ``name`` such as a file descriptor (TypeError), or a
    # non-numeric base name (ValueError).  Anything else should propagate.
    try:
        sentence_id = int(os.path.splitext(os.path.basename(fp.name))[0])
    except (AttributeError, TypeError, ValueError):
        pass
    first_curly = True
    for line in fp:
        line = line.strip()
        if not line:
            continue
        if line.startswith("#"):
            sentence_id = line[1:]
            first_curly = True
        elif line.startswith("{"):
            # ``str.index`` raises ValueError itself when ':' is missing.
            top_handle = line[1:line.index(":")].strip()
        elif line.endswith("}"):
            assert len(line) == 1
            if first_curly:
                assert sentence_id is not None
                assert top_handle is not None
                assert len(predicates) > 0
                yield (sentence_id, top_handle, predicates)
                sentence_id, top_handle, predicates = None, None, []
                first_curly = False
        else:
            match = EDS_MATCHER.match(line)
            assert match is not None
            node_id, label, arguments = match.groups()
            # Skip empty and whitespace-only pieces (e.g. after a trailing
            # comma) so that no empty tuple ends up among the arguments.
            arguments = [tuple(arg.split())
                         for arg in arguments.split(',') if arg.strip()]
            predicates.append((node_id, label.strip(), arguments))

def instance2graph(instance, reify=False, text=None):
    """Convert one parsed EDS instance into a ``Graph``.

    Each predicate label is peeled from the right: first a trailing
    ``{...}`` property block, then a trailing ``("...")`` constant argument,
    then a trailing ``<from:to>`` anchor.  What remains is the node label.

    Args:
        instance: ``(sentence_id, top, predicates)`` triple; each predicate
            is ``(handle, label, arguments)`` and ``arguments`` is a list of
            ``(relation, target_handle)`` pairs.
        reify: when true, a constant argument becomes a separate node that
            shares the anchors of its owner and is attached with a ``CARG``
            edge; otherwise it is stored as a ``CARG`` property placed in
            front of the node's other properties.
        text: optional input string handed to ``graph.add_input``.

    Returns:
        The populated ``Graph``.

    Raises:
        AssertionError: if the same handle occurs twice in ``predicates``.
        KeyError: if ``top`` or an argument target is not a known handle.
    """
    sentence_id, top, predicates = instance
    graph = Graph(sentence_id, flavor=1, framework="eds")
    if text:
        graph.add_input(text)
    handle2node = {}
    for handle, label, _ in predicates:
        assert handle not in handle2node
        properties = None
        values = None
        match = PROPERTIES_MATCHER.search(label)
        if match:
            label = label[:match.start()]
            fields = match.group(1).replace(",", "").split()
            # The first field is skipped; the remaining fields alternate
            # between property name and property value.
            properties = fields[1::2]
            values = fields[2::2]
        carg = None
        match = CARG_MATCHER.search(label)
        if match:
            label = label[:match.start()]
            if reify:
                carg = match.group(1)
            else:
                # A label may carry a constant argument without any property
                # block, in which case both lists are still None here.
                properties = ["CARG"] + (properties or [])
                values = [match.group(1)] + (values or [])
        anchors = None
        match = LNK_MATCHER.search(label)
        if match:
            label = label[:match.start()]
            anchors = [{"from": int(match.group(1)),
                        "to": int(match.group(2))}]
        node = graph.add_node(label=label, properties=properties,
                              values=values, anchors=anchors)
        handle2node[handle] = node
        if carg:
            # ``carg`` is only ever set in reify mode.
            constant = graph.add_node(label=carg, anchors=anchors)
            graph.add_edge(node.id, constant.id, "CARG")
    handle2node[top].is_top = True
    # Edges are added in a second pass so that forward references to
    # handles defined later in the instance resolve.
    for src_handle, _, arguments in predicates:
        src = handle2node[src_handle].id
        for relation, tgt_handle in arguments:
            graph.add_edge(src, handle2node[tgt_handle].id, relation)
    return graph

def read(fp, reify = False, text = None):
    """Lazily yield ``(graph, None)`` for every instance read from *fp*.

    Each instance produced by ``read_instances(fp)`` is converted with
    ``instance2graph``, passing *reify* and *text* through unchanged; the
    second element of every yielded pair is always ``None``.
    """
    for parsed in read_instances(fp):
        graph = instance2graph(parsed, reify, text)
        yield graph, None