|
|
|
|
|
|
|
""" |
|
AMR (Abstract Meaning Representation) structure.

For a detailed description of AMR, see http://www.isi.edu/natural-language/amr/a.pdf
|
|
|
""" |
|
|
|
from __future__ import print_function |
|
from collections import defaultdict |
|
import sys |
|
|
|
|
|
# Stream that receives error messages printed by AMR.parse_AMR_line and by the
# command-line driver at the bottom of this file. Rebind it to redirect them.
ERROR_LOG = sys.stderr

# Stream that receives debug output printed by AMR.output_amr and by the
# command-line driver. Rebind it to redirect that output.
DEBUG_LOG = sys.stderr
|
|
|
|
|
class AMR(object):
    """
    AMR is a rooted, labeled graph to represent semantics.

    This class has the following members:
    nodes: list of node names in the graph. Its ith element is the name of the ith node, e.g. "a1", "b", "g2".
    node_values: list of node labels (values). Its ith element is the value associated with node i in the
                 nodes list. In AMR, such a value is usually a semantic concept (e.g. "boy", "want-01").
    root: root node name (the first entry of nodes), or None for an empty graph.
    relations: one list per node, parallel to nodes. relations[i] holds [relation name, target node name]
               pairs for the edges leaving node i. For example, "arg0" means that the target concept is
               the 0th argument of node i.
    attributes: one list per node, parallel to nodes. attributes[i] holds [attribute name, attribute value]
                pairs linking node i to a constant. For example, if the polarity of some node is negative,
                it has the pair ["polarity", "-"].

    """

    def __init__(self, node_list=None, node_value_list=None, relation_list=None, attribute_list=None):
        """
        node_list: names of nodes in AMR graph, e.g. "a11", "n"
        node_value_list: values of nodes in AMR graph, e.g. "group" for a node named "g"
        relation_list: per-node lists of relations between two nodes
        attribute_list: per-node lists of attributes (links between one node and one constant value)

        Every argument is shallow-copied; the inner per-node lists are shared with the caller.

        """
        self.nodes = [] if node_list is None else node_list[:]
        # The root, by default, is the first node in the list.
        self.root = self.nodes[0] if len(self.nodes) != 0 else None
        self.node_values = [] if node_value_list is None else node_value_list[:]
        self.relations = [] if relation_list is None else relation_list[:]
        self.attributes = [] if attribute_list is None else attribute_list[:]

    def rename_node(self, prefix):
        """
        Rename AMR graph nodes to prefix + node_index to avoid nodes with the same name in two different AMRs.

        The node list, the root and the target of every relation are rewritten in place.
        Attribute values are constants, not node names, so they are left untouched.

        """
        # Build the complete old-name -> new-name map first, so that a new name
        # that happens to equal some old name cannot be renamed twice.
        node_map_dict = dict((name, prefix + str(index)) for index, name in enumerate(self.nodes))

        self.nodes[:] = [node_map_dict[name] for name in self.nodes]

        # Keep the root consistent with the renamed node list; previously it
        # kept pointing at a name that no longer existed in self.nodes.
        if self.root is not None:
            self.root = node_map_dict.get(self.root, self.root)

        for node_relations in self.relations:
            for relation in node_relations:
                relation[1] = node_map_dict[relation[1]]

    def get_triples(self):
        """
        Get the triples in three lists.
        instance_triple: a triple representing an instance. E.g. instance(w, want-01)
        attribute triple: relation of attributes, e.g. polarity(w, - )
        and relation triple, e.g. arg0 (w, b)

        Returns (instance_triple, attribute_triple, relation_triple), in that order.

        """
        instance_triple = []
        relation_triple = []
        attribute_triple = []
        for index, name in enumerate(self.nodes):
            instance_triple.append(("instance", name, self.node_values[index]))
            # Each relation entry is [relation name, target node name].
            for relation in self.relations[index]:
                relation_triple.append((relation[0], name, relation[1]))
            # Each attribute entry is [attribute name, attribute value].
            for attribute in self.attributes[index]:
                attribute_triple.append((attribute[0], name, attribute[1]))
        return instance_triple, attribute_triple, relation_triple

    def get_triples2(self):
        """
        Get the triples in two lists:
        instance_triple: a triple representing an instance. E.g. instance(w, want-01)
        relation_triple: a triple representing all relations. E.g arg0 (w, b) or E.g. polarity(w, - )
        Note that we do not differentiate between attribute triple and relation triple. Both are considered as
        relation triples. For each node, its relations come first, followed by its attributes.
        All triples are represented by (triple_type, argument 1 of the triple, argument 2 of the triple)

        """
        instance_triple = []
        relation_triple = []
        for index, name in enumerate(self.nodes):
            instance_triple.append(("instance", name, self.node_values[index]))
            for relation in self.relations[index]:
                relation_triple.append((relation[0], name, relation[1]))
            for attribute in self.attributes[index]:
                relation_triple.append((attribute[0], name, attribute[1]))
        return instance_triple, relation_triple

    def __str__(self):
        """
        Generate AMR string for better readability

        """
        lines = []
        for index, name in enumerate(self.nodes):
            lines.append("Node " + str(index) + " " + name)
            lines.append("Value: " + self.node_values[index])
            lines.append("Relations:")
            for relation in self.relations[index]:
                lines.append("Node " + relation[1] + " via " + relation[0])
            for attribute in self.attributes[index]:
                lines.append("Attribute: " + attribute[0] + " value " + attribute[1])
        return "\n".join(lines)

    def __repr__(self):
        return self.__str__()

    def output_amr(self):
        """
        Output AMR string to DEBUG_LOG

        """
        print(self.__str__(), file=DEBUG_LOG)

    @staticmethod
    def get_amr_line(input_f):
        """
        Read the file containing AMRs. AMRs are separated by a blank line.
        Each call of get_amr_line() returns the next available AMR (in one-line form).
        Lines starting with "#" are skipped. An empty string is returned when no AMR is left.
        Note: this function does not verify if the AMR is valid

        """
        cur_amr = []
        for line in input_f:
            line = line.strip()
            if line == "":
                if cur_amr:
                    # A blank line after some content ends the current AMR.
                    break
                # Blank lines before the AMR starts are skipped.
                continue
            if line.startswith("#"):
                # Comment / metadata line.
                continue
            cur_amr.append(line)
        return "".join(cur_amr)

    @staticmethod
    def parse_AMR_line(line):
        """
        Parse an AMR from line representation to an AMR object.
        This parsing algorithm scans the line once and processes each character, in a shift-reduce style.

        Returns the AMR object, or None if the line is malformed or contains no node; in that case a
        message is printed to ERROR_LOG.

        Every AMR returned carries an extra ["TOP", <root node value>] attribute on its first node.

        NOTE: quote characters are not stored. The opening quote is dropped and the closing quote is
        replaced by the placeholder character appended below. Only the first whitespace-separated
        token following a relation name is kept as its value.

        """
        # "state" records the last significant symbol seen:
        #   0 - start, or ")": the current node is complete
        #   1 - "(": a node name follows
        #   2 - ":": a relation name (and possibly its value) follows
        #   3 - "/": a node value (concept) follows
        # These symbols are not significant inside a quoted string.
        state = 0
        # Names of the nodes that are currently open (innermost last).
        stack = []
        # Characters shifted since the last significant symbol.
        cur_charseq = []
        # Node name -> node value, filled in once the value has been read.
        node_dict = {}
        # Node names in order of appearance.
        node_name_list = []
        # Edges whose target was already a known node when the edge was read.
        node_relation_dict1 = defaultdict(list)
        # Edges whose target was unknown at that time: either a constant or a
        # node defined later. This is resolved after the scan.
        node_relation_dict2 = defaultdict(list)
        # Name of the relation that introduces the node about to be opened.
        cur_relation_name = ""
        in_quote = False

        def attach_to_open_node(relation_name, relation_value):
            # File the edge under the innermost open node.
            if relation_value in node_dict:
                target_dict = node_relation_dict1
            else:
                target_dict = node_relation_dict2
            target_dict[stack[-1]].append((relation_name, relation_value))

        for i, c in enumerate(line.strip()):
            if c == " ":
                # Spaces only matter after ":", where they separate the
                # relation name from its value.
                if state == 2:
                    cur_charseq.append(c)
                continue
            if c == "\"":
                # Flip the quote flag; a closing quote leaves a placeholder.
                if in_quote:
                    cur_charseq.append('¦')
                in_quote = not in_quote
            elif in_quote or c not in "(:/)":
                # Not a significant symbol, so just shift it.
                cur_charseq.append(c)
            elif c == "(":
                # E.g. ":arg0 (x ..." - what was shifted so far is "arg0".
                if state == 2:
                    # In this state the pending relation name must be empty.
                    if cur_relation_name != "":
                        print("Format error when processing ", line[0:i + 1], file=ERROR_LOG)
                        return None
                    cur_relation_name = "".join(cur_charseq).strip()
                    cur_charseq[:] = []
                state = 1
            elif c == ":":
                if state == 3:
                    # E.g. "(o2 / *OR* :mod ..." - the node value "*OR*" is complete.
                    node_dict[stack[-1]] = "".join(cur_charseq)
                    cur_charseq[:] = []
                elif state == 2:
                    # E.g. ":op1 w :quant 30" - the pair "op1 w" is complete.
                    # Whether "w" is a constant or a node is not known yet.
                    temp_attr_value = "".join(cur_charseq)
                    cur_charseq[:] = []
                    parts = temp_attr_value.split()
                    if len(parts) < 2:
                        print("Error in processing; part len < 2", line[0:i + 1], file=ERROR_LOG)
                        return None
                    relation_name = parts[0]
                    relation_value = parts[1]
                    # The edge hangs off the innermost open node, so one must exist.
                    if len(stack) == 0:
                        print("Error in processing", line[:i], relation_name, relation_value, file=ERROR_LOG)
                        return None
                    attach_to_open_node(relation_name, relation_value)
                state = 2
            elif c == "/":
                if state != 1:
                    print("Error in parsing AMR", line[0:i + 1], file=ERROR_LOG)
                    return None
                # E.g. "(d / default-01" - the node name "d" is complete.
                node_name = "".join(cur_charseq)
                cur_charseq[:] = []
                if node_name in node_dict:
                    print("Duplicate node name ", node_name, " in parsing AMR", file=ERROR_LOG)
                    return None
                stack.append(node_name)
                node_name_list.append(node_name)
                if cur_relation_name != "":
                    # E.g. ":arg1 (n / nation)" yields arg1(parent, n), where the
                    # parent is the node just below the new one on the stack.
                    if len(stack) < 2:
                        print("Error in parsing AMR: relation", cur_relation_name,
                              "has no parent node in", line[0:i + 1], file=ERROR_LOG)
                        return None
                    node_relation_dict1[stack[-2]].append((cur_relation_name, node_name))
                    cur_relation_name = ""
                state = 3
            elif c == ")":
                # There must be an open node to close.
                if len(stack) == 0:
                    print("Unmatched parenthesis at position", i, "in processing", line[0:i + 1], file=ERROR_LOG)
                    return None
                if state == 2:
                    # E.g. ':op2 "Brown")' or ":op2 w)" - finish the pending pair.
                    temp_attr_value = "".join(cur_charseq)
                    cur_charseq[:] = []
                    parts = temp_attr_value.split()
                    if len(parts) < 2:
                        print("Error processing", line[:i + 1], temp_attr_value, file=ERROR_LOG)
                        return None
                    attach_to_open_node(parts[0], parts[1])
                elif state == 3:
                    # E.g. ":arg1 (n / nation)" - the node value "nation" is complete.
                    node_dict[stack[-1]] = "".join(cur_charseq)
                    cur_charseq[:] = []
                # The current node has been fully processed.
                stack.pop()
                cur_relation_name = ""
                state = 0

        # Without any node there is no root to carry the TOP attribute.
        if not node_name_list:
            print("Error: no node found in AMR", line, file=ERROR_LOG)
            return None

        # Build the parallel per-node lists used to initialize the AMR.
        node_value_list = []
        relation_list = []
        attribute_list = []
        for v in node_name_list:
            if v not in node_dict:
                # The node was opened but its value was never completed.
                print("Error: Node name not found", v, file=ERROR_LOG)
                return None
            node_value_list.append(node_dict[v])

            node_rel_list = [[name, target] for name, target in node_relation_dict1.get(v, [])]
            node_attr_list = []
            for name, value in node_relation_dict2.get(v, []):
                if value in node_dict:
                    # The target turned out to be a node defined later in the line.
                    node_rel_list.append([name, value])
                else:
                    # Otherwise it is a constant attribute value.
                    node_attr_list.append([name, value])

            relation_list.append(node_rel_list)
            attribute_list.append(node_attr_list)

        # Add TOP as an attribute of the first node; its value is that node's value.
        attribute_list[0].append(["TOP", node_value_list[0]])
        return AMR(node_name_list, node_value_list, relation_list, attribute_list)
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Debug driver: treat every non-blank, non-comment line of the given file
    # as a one-line AMR, parse it and dump the result to DEBUG_LOG.
    if len(sys.argv) < 2:
        print("No file given", file=ERROR_LOG)
        # sys.exit is always available; the bare exit() helper is injected by
        # the site module and may be missing.
        sys.exit(1)
    amr_count = 1
    # The context manager closes the input file even if parsing raises.
    with open(sys.argv[1]) as amr_file:
        for line in amr_file:
            cur_line = line.strip()
            if cur_line == "" or cur_line.startswith("#"):
                continue
            print("AMR", amr_count, file=DEBUG_LOG)
            current = AMR.parse_AMR_line(cur_line)
            if current is None:
                # parse_AMR_line has already reported the reason; go on
                # with the next AMR instead of crashing on None.
                print("Failed to parse AMR", amr_count, file=ERROR_LOG)
            else:
                current.output_amr()
            amr_count += 1
|
|