#!/usr/bin/env python # Compatible with Python 2.7 and 3.2+, can be used either as a module # or a standalone executable. # # Copyright 2017, 2018 Institute of Formal and Applied Linguistics (UFAL), # Faculty of Mathematics and Physics, Charles University, Czech Republic. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. # # Authors: Milan Straka, Martin Popel # # Changelog: # - [12 Apr 2018] Version 0.9: Initial release. # - [19 Apr 2018] Version 1.0: Fix bug in MLAS (duplicate entries in functional_children). # Add --counts option. # - [02 May 2018] Version 1.1: When removing spaces to match gold and system characters, # consider all Unicode characters of category Zs instead of # just ASCII space. # - [12 Sep 2020] Version 1.2: Renamed to udpipe2_eval when releasing with UDPipe 2 # Allow evaluation with underscores in HEAD. # - [26 Jul 2022] Version 1.3: Allow evaluation with multiple roots, if requested. # Command line usage # ------------------ # udpipe2_eval.py [-v] gold_conllu_file system_conllu_file # # - if no -v is given, only the official CoNLL18 UD Shared Task evaluation metrics # are printed # - if -v is given, more metrics are printed (as precision, recall, F1 score, # and in case the metric is computed on aligned words also accuracy on these): # - Tokens: how well do the gold tokens match system tokens # - Sentences: how well do the gold sentences match system sentences # - Words: how well can the gold words be aligned to system words # - UPOS: using aligned words, how well does UPOS match # - XPOS: using aligned words, how well does XPOS match # - UFeats: using aligned words, how well does universal FEATS match # - AllTags: using aligned words, how well does UPOS+XPOS+FEATS match # - Lemmas: using aligned words, how well does LEMMA match # - UAS: using aligned words, how well does HEAD match # - LAS: using aligned words, how well does HEAD+DEPREL(ignoring subtypes) match # - CLAS: using aligned words with content DEPREL, how well does # HEAD+DEPREL(ignoring subtypes) match # - MLAS: using aligned words with content DEPREL, how well does # HEAD+DEPREL(ignoring subtypes)+UPOS+UFEATS+FunctionalChildren(DEPREL+UPOS+UFEATS) match # - BLEX: using aligned words with content DEPREL, how well does # HEAD+DEPREL(ignoring subtypes)+LEMMAS match # - if -c is given, raw counts of correct/gold_total/system_total/aligned words are printed # instead of precision/recall/F1/AlignedAccuracy for all metrics. # API usage # --------- # - load_conllu(file) # - loads CoNLL-U file from given file object to an internal representation # - the file object should return str in both Python 2 and Python 3 # - raises UDError exception if the given file cannot be loaded # - evaluate(gold_ud, system_ud) # - evaluate the given gold and system CoNLL-U files (loaded with load_conllu) # - raises UDError if the concatenated tokens of gold and system file do not match # - returns a dictionary with the metrics described above, each metric having # three fields: precision, recall and f1 # Description of token matching # ----------------------------- # In order to match tokens of gold file and system file, we consider the text # resulting from concatenation of gold tokens and text resulting from # concatenation of system tokens. These texts should match -- if they do not, # the evaluation fails. # # If the texts do match, every token is represented as a range in this original # text, and tokens are equal only if their range is the same. # Description of word matching # ---------------------------- # When matching words of gold file and system file, we first match the tokens. # The words which are also tokens are matched as tokens, but words in multi-word # tokens have to be handled differently. # # To handle multi-word tokens, we start by finding "multi-word spans". # Multi-word span is a span in the original text such that # - it contains at least one multi-word token # - all multi-word tokens in the span (considering both gold and system ones) # are completely inside the span (i.e., they do not "stick out") # - the multi-word span is as small as possible # # For every multi-word span, we align the gold and system words completely # inside this span using LCS on their FORMs. The words not intersecting # (even partially) any multi-word span are then aligned as tokens. from __future__ import division from __future__ import print_function import argparse import io import sys import unicodedata import unittest __version__ = "2.1.1-dev" # CoNLL-U column names ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10) # Content and functional relations CONTENT_DEPRELS = { "nsubj", "obj", "iobj", "csubj", "ccomp", "xcomp", "obl", "vocative", "expl", "dislocated", "advcl", "advmod", "discourse", "nmod", "appos", "nummod", "acl", "amod", "conj", "fixed", "flat", "compound", "list", "parataxis", "orphan", "goeswith", "reparandum", "root", "dep" } FUNCTIONAL_DEPRELS = { "aux", "cop", "mark", "det", "clf", "case", "cc" } UNIVERSAL_FEATURES = { "PronType", "NumType", "Poss", "Reflex", "Foreign", "Abbr", "Gender", "Animacy", "Number", "Case", "Definite", "Degree", "VerbForm", "Mood", "Tense", "Aspect", "Voice", "Evident", "Polarity", "Person", "Polite" } # UD Error is used when raising exceptions in this module class UDError(Exception): pass # Load given CoNLL-U file into internal representation def load_conllu(file, single_root=1): # Internal representation classes class UDRepresentation: def __init__(self): # Characters of all the tokens in the whole file. # Whitespace between tokens is not included. self.characters = [] # List of UDSpan instances with start&end indices into `characters`. self.tokens = [] # List of UDWord instances. self.words = [] # List of UDSpan instances with start&end indices into `characters`. self.sentences = [] class UDSpan: def __init__(self, start, end): self.start = start # Note that self.end marks the first position **after the end** of span, # so we can use characters[start:end] or range(start, end). self.end = end class UDWord: def __init__(self, span, columns, is_multiword): # Span of this word (or MWT, see below) within ud_representation.characters. self.span = span # 10 columns of the CoNLL-U file: ID, FORM, LEMMA,... self.columns = columns # is_multiword==True means that this word is part of a multi-word token. # In that case, self.span marks the span of the whole multi-word token. self.is_multiword = is_multiword # Reference to the UDWord instance representing the HEAD (or None if root). self.parent = None # List of references to UDWord instances representing functional-deprel children. self.functional_children = [] # Only consider universal FEATS. self.columns[FEATS] = "|".join(sorted(feat for feat in columns[FEATS].split("|") if feat.split("=", 1)[0] in UNIVERSAL_FEATURES)) # Let's ignore language-specific deprel subtypes. self.columns[DEPREL] = columns[DEPREL].split(":")[0] # Precompute which deprels are CONTENT_DEPRELS and which FUNCTIONAL_DEPRELS self.is_content_deprel = self.columns[DEPREL] in CONTENT_DEPRELS self.is_functional_deprel = self.columns[DEPREL] in FUNCTIONAL_DEPRELS ud = UDRepresentation() # Load the CoNLL-U file index, sentence_start = 0, None while True: line = file.readline() if not line: break line = line.rstrip("\r\n") # Handle sentence start boundaries if sentence_start is None: # Skip comments if line.startswith("#"): continue # Start a new sentence ud.sentences.append(UDSpan(index, 0)) sentence_start = len(ud.words) if not line: # Add parent and children UDWord links and check there are no cycles def process_word(word): if word.parent == "remapping": raise UDError("There is a cycle in a sentence") if word.parent is None: if word.columns[HEAD] == "_": word.parent = "missing" else: head = int(word.columns[HEAD]) if head < 0 or head > len(ud.words) - sentence_start: raise UDError("HEAD '{}' points outside of the sentence".format(word.columns[HEAD])) if head: parent = ud.words[sentence_start + head - 1] word.parent = "remapping" process_word(parent) word.parent = parent for word in ud.words[sentence_start:]: process_word(word) # func_children cannot be assigned within process_word # because it is called recursively and may result in adding one child twice. for word in ud.words[sentence_start:]: if word.parent and word.is_functional_deprel: word.parent.functional_children.append(word) # Check there is a single root node if single_root: if len([word for word in ud.words[sentence_start:] if word.parent is None]) > 1: raise UDError("There are multiple roots in a sentence") # End the sentence ud.sentences[-1].end = index sentence_start = None continue # Read next token/word columns = line.split("\t") if len(columns) != 10: raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(line)) # Skip empty nodes if "." in columns[ID]: continue # Delete spaces from FORM, so gold.characters == system.characters # even if one of them tokenizes the space. Use any Unicode character # with category Zs. if sys.version_info < (3, 0) and isinstance(line, str): columns[FORM] = columns[FORM].decode("utf-8") columns[FORM] = "".join(filter(lambda c: unicodedata.category(c) != "Zs", columns[FORM])) if sys.version_info < (3, 0) and isinstance(line, str): columns[FORM] = columns[FORM].encode("utf-8") if not columns[FORM]: raise UDError("There is an empty FORM in the CoNLL-U file") # Save token ud.characters.extend(columns[FORM]) ud.tokens.append(UDSpan(index, index + len(columns[FORM]))) index += len(columns[FORM]) # Handle multi-word tokens to save word(s) if "-" in columns[ID]: try: start, end = map(int, columns[ID].split("-")) except: raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID])) for _ in range(start, end + 1): word_line = file.readline().rstrip("\r\n") word_columns = word_line.split("\t") if len(word_columns) != 10: raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(word_line)) ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True)) # Basic tokens/words else: try: word_id = int(columns[ID]) except: raise UDError("Cannot parse word ID '{}'".format(columns[ID])) if word_id != len(ud.words) - sentence_start + 1: raise UDError("Incorrect word ID '{}' for word '{}', expected '{}'".format(columns[ID], columns[FORM], len(ud.words) - sentence_start + 1)) if columns[HEAD] != "_": try: head_id = int(columns[HEAD]) except: raise UDError("Cannot parse HEAD '{}'".format(columns[HEAD])) if head_id < 0: raise UDError("HEAD cannot be negative") ud.words.append(UDWord(ud.tokens[-1], columns, is_multiword=False)) if sentence_start is not None: raise UDError("The CoNLL-U file does not end with empty line") return ud # Evaluate the gold and system treebanks (loaded using load_conllu). def evaluate(gold_ud, system_ud): class Score: def __init__(self, gold_total, system_total, correct, aligned_total=None): self.correct = correct self.gold_total = gold_total self.system_total = system_total self.aligned_total = aligned_total self.precision = correct / system_total if system_total else 0.0 self.recall = correct / gold_total if gold_total else 0.0 self.f1 = 2 * correct / (system_total + gold_total) if system_total + gold_total else 0.0 self.aligned_accuracy = correct / aligned_total if aligned_total else aligned_total class AlignmentWord: def __init__(self, gold_word, system_word): self.gold_word = gold_word self.system_word = system_word class Alignment: def __init__(self, gold_words, system_words): self.gold_words = gold_words self.system_words = system_words self.matched_words = [] self.matched_words_map = {} def append_aligned_words(self, gold_word, system_word): self.matched_words.append(AlignmentWord(gold_word, system_word)) self.matched_words_map[system_word] = gold_word def lower(text): if sys.version_info < (3, 0) and isinstance(text, str): return text.decode("utf-8").lower() return text.lower() def spans_score(gold_spans, system_spans): correct, gi, si = 0, 0, 0 while gi < len(gold_spans) and si < len(system_spans): if system_spans[si].start < gold_spans[gi].start: si += 1 elif gold_spans[gi].start < system_spans[si].start: gi += 1 else: correct += gold_spans[gi].end == system_spans[si].end si += 1 gi += 1 return Score(len(gold_spans), len(system_spans), correct) def alignment_score(alignment, key_fn=None, filter_fn=None): if filter_fn is not None: gold = sum(1 for gold in alignment.gold_words if filter_fn(gold)) system = sum(1 for system in alignment.system_words if filter_fn(system)) aligned = sum(1 for word in alignment.matched_words if filter_fn(word.gold_word)) else: gold = len(alignment.gold_words) system = len(alignment.system_words) aligned = len(alignment.matched_words) if key_fn is None: # Return score for whole aligned words return Score(gold, system, aligned) def gold_aligned_gold(word): return word def gold_aligned_system(word): return alignment.matched_words_map.get(word, "NotAligned") if word is not None else None correct = 0 for words in alignment.matched_words: if filter_fn is None or filter_fn(words.gold_word): if key_fn(words.gold_word, gold_aligned_gold) == key_fn(words.system_word, gold_aligned_system): correct += 1 return Score(gold, system, correct, aligned) def beyond_end(words, i, multiword_span_end): if i >= len(words): return True if words[i].is_multiword: return words[i].span.start >= multiword_span_end return words[i].span.end > multiword_span_end def extend_end(word, multiword_span_end): if word.is_multiword and word.span.end > multiword_span_end: return word.span.end return multiword_span_end def find_multiword_span(gold_words, system_words, gi, si): # We know gold_words[gi].is_multiword or system_words[si].is_multiword. # Find the start of the multiword span (gs, ss), so the multiword span is minimal. # Initialize multiword_span_end characters index. if gold_words[gi].is_multiword: multiword_span_end = gold_words[gi].span.end if not system_words[si].is_multiword and system_words[si].span.start < gold_words[gi].span.start: si += 1 else: # if system_words[si].is_multiword multiword_span_end = system_words[si].span.end if not gold_words[gi].is_multiword and gold_words[gi].span.start < system_words[si].span.start: gi += 1 gs, ss = gi, si # Find the end of the multiword span # (so both gi and si are pointing to the word following the multiword span end). while not beyond_end(gold_words, gi, multiword_span_end) or \ not beyond_end(system_words, si, multiword_span_end): if gi < len(gold_words) and (si >= len(system_words) or gold_words[gi].span.start <= system_words[si].span.start): multiword_span_end = extend_end(gold_words[gi], multiword_span_end) gi += 1 else: multiword_span_end = extend_end(system_words[si], multiword_span_end) si += 1 return gs, ss, gi, si def compute_lcs(gold_words, system_words, gi, si, gs, ss): lcs = [[0] * (si - ss) for i in range(gi - gs)] for g in reversed(range(gi - gs)): for s in reversed(range(si - ss)): if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]): lcs[g][s] = 1 + (lcs[g+1][s+1] if g+1 < gi-gs and s+1 < si-ss else 0) lcs[g][s] = max(lcs[g][s], lcs[g+1][s] if g+1 < gi-gs else 0) lcs[g][s] = max(lcs[g][s], lcs[g][s+1] if s+1 < si-ss else 0) return lcs def align_words(gold_words, system_words): alignment = Alignment(gold_words, system_words) gi, si = 0, 0 while gi < len(gold_words) and si < len(system_words): if gold_words[gi].is_multiword or system_words[si].is_multiword: # A: Multi-word tokens => align via LCS within the whole "multiword span". gs, ss, gi, si = find_multiword_span(gold_words, system_words, gi, si) if si > ss and gi > gs: lcs = compute_lcs(gold_words, system_words, gi, si, gs, ss) # Store aligned words s, g = 0, 0 while g < gi - gs and s < si - ss: if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]): alignment.append_aligned_words(gold_words[gs+g], system_words[ss+s]) g += 1 s += 1 elif lcs[g][s] == (lcs[g+1][s] if g+1 < gi-gs else 0): g += 1 else: s += 1 else: # B: No multi-word token => align according to spans. if (gold_words[gi].span.start, gold_words[gi].span.end) == (system_words[si].span.start, system_words[si].span.end): alignment.append_aligned_words(gold_words[gi], system_words[si]) gi += 1 si += 1 elif gold_words[gi].span.start <= system_words[si].span.start: gi += 1 else: si += 1 return alignment # Check that the underlying character sequences do match. if gold_ud.characters != system_ud.characters: index = 0 while index < len(gold_ud.characters) and index < len(system_ud.characters) and \ gold_ud.characters[index] == system_ud.characters[index]: index += 1 raise UDError( "The concatenation of tokens in gold file and in system file differ!\n" + "First 20 differing characters in gold file: '{}' and system file: '{}'".format( "".join(gold_ud.characters[index:index + 20]), "".join(system_ud.characters[index:index + 20]) ) ) # Align words alignment = align_words(gold_ud.words, system_ud.words) # Compute the F1-scores return { "Tokens": spans_score(gold_ud.tokens, system_ud.tokens), "Sentences": spans_score(gold_ud.sentences, system_ud.sentences), "Words": alignment_score(alignment), "UPOS": alignment_score(alignment, lambda w, _: w.columns[UPOS]), "XPOS": alignment_score(alignment, lambda w, _: w.columns[XPOS]), "UFeats": alignment_score(alignment, lambda w, _: w.columns[FEATS]), "AllTags": alignment_score(alignment, lambda w, _: (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])), "Lemmas": alignment_score(alignment, lambda w, ga: w.columns[LEMMA] if ga(w).columns[LEMMA] != "_" else "_"), "UAS": alignment_score(alignment, lambda w, ga: ga(w.parent)), "LAS": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL])), "CLAS": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL]), filter_fn=lambda w: w.is_content_deprel), "MLAS": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL], w.columns[UPOS], w.columns[FEATS], [(ga(c), c.columns[DEPREL], c.columns[UPOS], c.columns[FEATS]) for c in w.functional_children]), filter_fn=lambda w: w.is_content_deprel), "BLEX": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL], w.columns[LEMMA] if ga(w).columns[LEMMA] != "_" else "_"), filter_fn=lambda w: w.is_content_deprel), } def load_conllu_file(path, single_root=1): _file = open(path, mode="r", **({"encoding": "utf-8"} if sys.version_info >= (3, 0) else {})) return load_conllu(_file, single_root) def evaluate_wrapper(args): # Load CoNLL-U files gold_ud = load_conllu_file(args.gold_file, args.single_root) system_ud = load_conllu_file(args.system_file, args.single_root) return evaluate(gold_ud, system_ud) def main(): # Parse arguments parser = argparse.ArgumentParser() parser.add_argument("gold_file", type=str, help="Name of the CoNLL-U file with the gold data.") parser.add_argument("system_file", type=str, help="Name of the CoNLL-U file with the predicted data.") parser.add_argument("--verbose", "-v", default=False, action="store_true", help="Print all metrics.") parser.add_argument("--counts", "-c", default=False, action="store_true", help="Print raw counts of correct/gold/system/aligned words instead of prec/rec/F1 for all metrics.") parser.add_argument("--no_single_root", dest="single_root", default=True, action="store_false", help="Allow multiple roots in a sentence.") args = parser.parse_args() # Evaluate evaluation = evaluate_wrapper(args) # Print the evaluation if not args.verbose and not args.counts: print("LAS F1 Score: {:.2f}".format(100 * evaluation["LAS"].f1)) print("MLAS Score: {:.2f}".format(100 * evaluation["MLAS"].f1)) print("BLEX Score: {:.2f}".format(100 * evaluation["BLEX"].f1)) else: if args.counts: print("Metric | Correct | Gold | Predicted | Aligned") else: print("Metric | Precision | Recall | F1 Score | AligndAcc") print("-----------+-----------+-----------+-----------+-----------") for metric in["Tokens", "Sentences", "Words", "UPOS", "XPOS", "UFeats", "AllTags", "Lemmas", "UAS", "LAS", "CLAS", "MLAS", "BLEX"]: if args.counts: print("{:11}|{:10} |{:10} |{:10} |{:10}".format( metric, evaluation[metric].correct, evaluation[metric].gold_total, evaluation[metric].system_total, evaluation[metric].aligned_total or (evaluation[metric].correct if metric == "Words" else "") )) else: print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format( metric, 100 * evaluation[metric].precision, 100 * evaluation[metric].recall, 100 * evaluation[metric].f1, "{:10.2f}".format(100 * evaluation[metric].aligned_accuracy) if evaluation[metric].aligned_accuracy is not None else "" )) if __name__ == "__main__": main() # Tests, which can be executed with `python -m unittest udpipe2_eval`. class TestAlignment(unittest.TestCase): @staticmethod def _load_words(words): """Prepare fake CoNLL-U files with fake HEAD to prevent multiple roots errors.""" lines, num_words = [], 0 for w in words: parts = w.split(" ") if len(parts) == 1: num_words += 1 lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, parts[0], int(num_words>1))) else: lines.append("{}-{}\t{}\t_\t_\t_\t_\t_\t_\t_\t_".format(num_words + 1, num_words + len(parts) - 1, parts[0])) for part in parts[1:]: num_words += 1 lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, part, int(num_words>1))) return load_conllu((io.StringIO if sys.version_info >= (3, 0) else io.BytesIO)("\n".join(lines+["\n"]))) def _test_exception(self, gold, system): self.assertRaises(UDError, evaluate, self._load_words(gold), self._load_words(system)) def _test_ok(self, gold, system, correct): metrics = evaluate(self._load_words(gold), self._load_words(system)) gold_words = sum((max(1, len(word.split(" ")) - 1) for word in gold)) system_words = sum((max(1, len(word.split(" ")) - 1) for word in system)) self.assertEqual((metrics["Words"].precision, metrics["Words"].recall, metrics["Words"].f1), (correct / system_words, correct / gold_words, 2 * correct / (gold_words + system_words))) def test_exception(self): self._test_exception(["a"], ["b"]) def test_equal(self): self._test_ok(["a"], ["a"], 1) self._test_ok(["a", "b", "c"], ["a", "b", "c"], 3) def test_equal_with_multiword(self): self._test_ok(["abc a b c"], ["a", "b", "c"], 3) self._test_ok(["a", "bc b c", "d"], ["a", "b", "c", "d"], 4) self._test_ok(["abcd a b c d"], ["ab a b", "cd c d"], 4) self._test_ok(["abc a b c", "de d e"], ["a", "bcd b c d", "e"], 5) def test_alignment(self): self._test_ok(["abcd"], ["a", "b", "c", "d"], 0) self._test_ok(["abc", "d"], ["a", "b", "c", "d"], 1) self._test_ok(["a", "bc", "d"], ["a", "b", "c", "d"], 2) self._test_ok(["a", "bc b c", "d"], ["a", "b", "cd"], 2) self._test_ok(["abc a BX c", "def d EX f"], ["ab a b", "cd c d", "ef e f"], 4) self._test_ok(["ab a b", "cd bc d"], ["a", "bc", "d"], 2) self._test_ok(["a", "bc b c", "d"], ["ab AX BX", "cd CX a"], 1)