| from collections import deque |
| import subprocess |
|
|
| from stanza.models.common.utils import misc_to_space_after |
| from stanza.models.constituency.parse_tree import Tree |
| from stanza.protobuf import DependencyGraph, FlattenedParseTree |
| from stanza.server.client import resolve_classpath |
|
|
def send_request(request, response_type, java_main, classpath=None):
    """
    Run a Java protobuf processor on the given request via a subprocess.

    The request is serialized and piped to the java program's stdin; the
    bytes the program writes to stdout are parsed as a response_type.

    Returns the protobuf response
    """
    classpath = resolve_classpath(classpath)
    if classpath is None:
        raise ValueError("Classpath is None, Perhaps you need to set the $CLASSPATH or $CORENLP_HOME environment variable to point to a CoreNLP install.")
    # check=True: raise CalledProcessError rather than parse garbage output
    proc = subprocess.run(["java", "-cp", classpath, java_main],
                          input=request.SerializeToString(),
                          stdout=subprocess.PIPE,
                          check=True)
    response = response_type()
    response.ParseFromString(proc.stdout)
    return response
|
|
def add_tree_nodes(proto_tree, tree, score):
    """
    Flatten one subtree into proto_tree.nodes, recursively.

    Emits an open marker, then the label, then the (flattened) children,
    then a close marker.  The optional score is attached to the open
    marker of this subtree only.
    """
    open_marker = proto_tree.nodes.add()
    open_marker.openNode = True
    if score is not None:
        open_marker.score = score

    label_node = proto_tree.nodes.add()
    label_node.value = tree.label

    for child in tree.children:
        if not child.is_leaf():
            # internal node: recurse, without a score
            add_tree_nodes(proto_tree, child, None)
        else:
            leaf_node = proto_tree.nodes.add()
            leaf_node.value = child.label

    close_marker = proto_tree.nodes.add()
    close_marker.closeNode = True
|
|
def build_tree(tree, score):
    """
    Builds a FlattenedParseTree from CoreNLP.proto

    Populates the value field from tree.label and iterates through the
    children via tree.children. Should work on any tree structure
    which follows that layout

    The score will be added to the top node (if it is not None)

    Operates by recursively calling add_tree_nodes
    """
    flattened = FlattenedParseTree()
    add_tree_nodes(flattened, tree, score)
    return flattened
|
|
def from_tree(proto_tree):
    """
    Convert a FlattenedParseTree back into a Tree

    returns Tree, score
    (score might be None if it is missing)
    """
    # The flattened proto is a sequence of open markers, plain values, and
    # close markers.  Replay it with a stack: opens and values are pushed;
    # a close pops everything back to the most recent open marker and
    # builds a subtree out of what was popped.
    score = None
    stack = deque()
    for node in proto_tree.nodes:
        # keep the first score found anywhere in the proto
        if node.HasField("score") and score is None:
            score = node.score
        if node.openNode:
            # two open markers in a row means the first one never got a label
            if len(stack) > 0 and isinstance(stack[-1], FlattenedParseTree.Node) and stack[-1].openNode:
                raise ValueError("Got a proto with no label on a node: {}".format(proto_tree))
            stack.append(node)
            continue
        if not node.closeNode:
            # a plain value: pushed as a Tree; whether it is a leaf or the
            # label of a subtree is resolved when the matching close arrives
            child = Tree(label=node.value)
            stack.append(child)
            continue
        # close marker: need at least an open marker plus a label below it
        if len(stack) <= 1:
            raise ValueError("Got a proto with too many close operations: {}".format(proto_tree))
        # pop Trees until we reach the open marker (a FlattenedParseTree.Node),
        # which is discarded
        children = []
        nextNode = stack.pop()
        while not isinstance(nextNode, FlattenedParseTree.Node):
            children.append(nextNode)
            nextNode = stack.pop()
        if len(children) == 0:
            raise ValueError("Got a proto with an open immediately followed by a close: {}".format(proto_tree))
        # popped in reverse order; the first item pushed was the subtree label
        children.reverse()
        label = children[0]
        children = children[1:]
        subtree = Tree(label=label.label, children=children)
        stack.append(subtree)
    if len(stack) > 1:
        raise ValueError("Got a proto which does not close all of the nodes: {}".format(proto_tree))
    tree = stack.pop()
    if not isinstance(tree, Tree):
        raise ValueError("Got a proto which was just one Open operation: {}".format(proto_tree))
    return tree, score
|
|
def add_token(token_list, word, token):
    """
    Add a token to a proto request.

    CoreNLP tokens have components of both word and token from stanza.

    We pass along "after" but not "before"

    token may be None only for "empty" words, whose id is a
    (word index, empty index) pair rather than an int.
    """
    if token is None and isinstance(word.id, int):
        raise AssertionError("Only expected word w/o token for 'extra' words")

    query_token = token_list.add()
    query_token.word = word.text
    query_token.value = word.text
    if word.lemma is not None:
        query_token.lemma = word.lemma
    if word.xpos is not None:
        query_token.pos = word.xpos
    if word.upos is not None:
        query_token.coarseTag = word.upos
    if word.feats and word.feats != "_":
        # UD features are represented as parallel key/value lists in the proto
        for feature in word.feats.split("|"):
            key, value = feature.split("=", maxsplit=1)
            query_token.conllUFeatures.key.append(key)
            query_token.conllUFeatures.value.append(value)
    if token is not None:
        if token.ner is not None:
            query_token.ner = token.ner
        if len(token.id) > 1:
            # multi-word token: record the surface text and whether this
            # word is the first piece of the MWT
            query_token.mwtText = token.text
            query_token.isMWT = True
            query_token.isFirstMWT = token.id[0] == word.id
        # only the final word of a token carries the trailing whitespace;
        # words in the middle of an MWT have nothing after them
        if token.id[-1] == word.id:
            query_token.after = token.spaces_after
        query_token.index = word.id
    else:
        # empty word: no enclosing token, so take the space from misc
        query_token.after = misc_to_space_after(word.misc)
        query_token.index = word.id[0]
        query_token.emptyIndex = word.id[1]

    if word.misc and word.misc != "_":
        query_token.conllUMisc = word.misc
    if token is not None and token.misc and token.misc != "_":
        query_token.mwtMisc = token.misc
|
|
def add_sentence(request_sentences, sentence, num_tokens):
    """
    Add the tokens for this stanza sentence to a list of protobuf sentences

    num_tokens is the number of words which came before this sentence;
    it anchors the new sentence's token offsets.
    """
    words_and_tokens = [(word, token)
                        for token in sentence.tokens
                        for word in token.words]
    request_sentence = request_sentences.add()
    request_sentence.tokenOffsetBegin = num_tokens
    request_sentence.tokenOffsetEnd = num_tokens + len(words_and_tokens)
    for word, token in words_and_tokens:
        add_token(request_sentence.token, word, token)
    return request_sentence
|
|
def add_word_to_graph(graph, word, sent_idx):
    """
    Add a node and possibly an edge for a word in a basic dependency graph.

    Words with a non-root head also get an edge from that head; a missing
    deprel is encoded as "_".
    """
    is_empty_word = not isinstance(word.id, int)

    node = graph.node.add()
    node.sentenceIndex = sent_idx + 1
    if is_empty_word:
        # empty words carry a (word index, empty index) pair
        node.index = word.id[0]
        node.emptyIndex = word.id[1]
    else:
        node.index = word.id

    # head 0 (or missing) marks the sentence root - no incoming edge
    if word.head is None or word.head == 0:
        return

    edge = graph.edge.add()
    edge.source = word.head
    if is_empty_word:
        edge.target = word.id[0]
        edge.targetEmpty = word.id[1]
    else:
        edge.target = word.id
    edge.dep = word.deprel if word.deprel is not None else "_"
|
|
def convert_networkx_graph(graph_proto, sentence, sent_idx):
    """
    Turns a networkx graph into a DependencyGraph from the proto file

    Adds tokens for the regular and empty words of the sentence, then
    one node per dependency target plus its incoming edges.
    """
    for token in sentence.tokens:
        for word in token.words:
            add_token(graph_proto.token, word, token)
    for word in sentence.empty_words:
        # empty words have no enclosing token
        add_token(graph_proto.token, word, None)
    # enhanced dependencies: keys are ints for regular words and
    # (index, empty index) pairs for empty words; 0 is the pseudo-root
    dependencies = sentence._enhanced_dependencies
    for target in dependencies:
        if target == 0:
            # the pseudo-root is not an actual word - no node for it
            continue
        for source in dependencies.predecessors(target):
            if source == 0:
                # an edge from 0 marks this word as a root of the sentence
                # NOTE(review): len(graph_proto.node) is the 0-based position
                # of the node appended below - confirm the proto consumer
                # expects a 0-based rather than 1-based root index
                graph_proto.rootNode.append(len(graph_proto.node))
                continue
            # a pair of words can be connected by several deprels
            for deprel in dependencies.get_edge_data(source, target):
                edge = graph_proto.edge.add()
                if isinstance(source, int):
                    edge.source = source
                else:
                    edge.source = source[0]
                    if source[1] != 0:
                        edge.sourceEmpty = source[1]
                if isinstance(target, int):
                    edge.target = target
                else:
                    edge.target = target[0]
                    if target[1] != 0:
                        edge.targetEmpty = target[1]
                edge.dep = deprel
        # one node per target word, added after its incoming edges
        node = graph_proto.node.add()
        node.sentenceIndex = sent_idx + 1
        if isinstance(target, int):
            node.index = target
        else:
            node.index = target[0]
            if target[1] != 0:
                node.emptyIndex = target[1]
    return graph_proto
|
|
def features_to_string(features):
    """
    Rebuild a UD feature string ("A=B|C=D") from a proto features object.

    Returns None when the features object is missing or has no keys.
    """
    if not features or len(features.key) == 0:
        return None
    pairs = ["%s=%s" % (key, value) for key, value in zip(features.key, features.value)]
    return "|".join(pairs)
|
|
def misc_space_pieces(misc):
    """
    Return only the space-related misc pieces

    None / "" / "_" are returned unchanged; otherwise returns the
    space-related pieces joined with "|", or None if there are none.
    """
    if misc is None or misc == "" or misc == "_":
        return misc
    space_keys = ("SpaceAfter", "SpacesAfter", "SpacesBefore")
    kept = [piece for piece in misc.split("|")
            if piece.split("=", maxsplit=1)[0] in space_keys]
    return "|".join(kept) if kept else None
|
|
def remove_space_misc(misc):
    """
    Remove any pieces from misc which are space-related

    None / "" / "_" are returned unchanged; otherwise returns the
    non-space pieces joined with "|", or None if nothing is left.
    """
    if misc is None or misc == "" or misc == "_":
        return misc
    space_keys = ("SpaceAfter", "SpacesAfter", "SpacesBefore")
    remaining = [piece for piece in misc.split("|")
                 if piece.split("=", maxsplit=1)[0] not in space_keys]
    return "|".join(remaining) if remaining else None
|
|
def substitute_space_misc(misc, space_misc):
    """
    Replace the space-related pieces of misc with those in space_misc.

    space_misc must contain only space-related pieces, such as those
    produced by misc_space_pieces; anything else raises AssertionError.
    Space pieces in misc with no replacement are dropped; replacement
    pieces with no counterpart in misc are appended at the end.
    Returns None if nothing is left.

    Bug fix: misc_space_pieces keeps "SpacesBefore" pieces, but this
    function previously only matched the prefix "SpaceBefore", so a
    standard CoNLL-U SpacesAfter/SpacesBefore pair could hit the
    AssertionError.  Both prefixes are now recognized.
    """
    before_prefixes = ("SpaceBefore", "SpacesBefore")
    after_prefixes = ("SpaceAfter", "SpacesAfter")

    space_misc_pieces = space_misc.split("|") if space_misc else []
    space_misc_after = None
    space_misc_before = None
    for piece in space_misc_pieces:
        if piece.startswith(before_prefixes):
            space_misc_before = piece
        elif piece.startswith(after_prefixes):
            space_misc_after = piece
        else:
            raise AssertionError("An unknown piece wound up in the misc space fields: %s" % piece)

    new_pieces = []
    for piece in misc.split("|"):
        if piece.startswith(before_prefixes):
            # substitute (or drop, if there is no replacement)
            if space_misc_before:
                new_pieces.append(space_misc_before)
                space_misc_before = None
        elif piece.startswith(after_prefixes):
            if space_misc_after:
                new_pieces.append(space_misc_after)
                space_misc_after = None
        else:
            new_pieces.append(piece)
    # replacements which had no counterpart in misc are appended
    if space_misc_after:
        new_pieces.append(space_misc_after)
    if space_misc_before:
        new_pieces.append(space_misc_before)
    if len(new_pieces) == 0:
        return None
    return "|".join(new_pieces)
|
|
class JavaProtobufContext(object):
    """
    A generic context for sending requests to a java program using protobufs in a subprocess

    Messages in both directions are framed as a 4 byte big-endian length
    followed by the serialized protobuf; a length of 0 asks the java
    process to exit.

    Bug fix: close_pipe previously raised AttributeError when called
    before open_pipe or a second time (self.pipe.poll() on None); it is
    now safe to call in any state, so __exit__ cannot mask an earlier
    exception with an AttributeError.
    """
    def __init__(self, classpath, build_response, java_main, extra_args=None):
        """
        classpath: resolved via resolve_classpath (may come from $CLASSPATH / $CORENLP_HOME)
        build_response: zero-argument factory for the response protobuf type
        java_main: fully qualified name of the java class to run
        extra_args: extra command line arguments for the java process
        """
        self.classpath = resolve_classpath(classpath)
        self.build_response = build_response
        self.java_main = java_main
        if extra_args is None:
            extra_args = []
        self.extra_args = extra_args
        self.pipe = None

    def open_pipe(self):
        """Start the java subprocess, communicating over its stdin/stdout"""
        self.pipe = subprocess.Popen(["java", "-cp", self.classpath, self.java_main, "-multiple"] + self.extra_args,
                                     stdin=subprocess.PIPE,
                                     stdout=subprocess.PIPE)

    def close_pipe(self):
        """Ask the java process to exit.  Safe to call even if not open."""
        # only signal the process if it was started and is still running
        if self.pipe is not None and self.pipe.poll() is None:
            # a 0-length frame is the shutdown signal
            self.pipe.stdin.write((0).to_bytes(4, 'big'))
            self.pipe.stdin.flush()
        self.pipe = None

    def __enter__(self):
        self.open_pipe()
        return self

    def __exit__(self, type, value, traceback):
        self.close_pipe()

    def process_request(self, request):
        """
        Send one request to the java process and return the parsed response.

        Raises RuntimeError if the pipe is not open, BrokenPipeError if
        the java process did not produce a full length header.
        """
        if self.pipe is None:
            raise RuntimeError("Pipe to java process is not open or was closed")

        text = request.SerializeToString()
        self.pipe.stdin.write(len(text).to_bytes(4, 'big'))
        self.pipe.stdin.write(text)
        self.pipe.stdin.flush()
        response_length = self.pipe.stdout.read(4)
        if len(response_length) < 4:
            raise BrokenPipeError("Could not communicate with java process!")
        response_length = int.from_bytes(response_length, "big")
        response_text = self.pipe.stdout.read(response_length)
        response = self.build_response()
        response.ParseFromString(response_text)
        return response
|
|
|
|