|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from __future__ import division |
|
from __future__ import print_function |
|
|
|
import argparse |
|
import io |
|
import sys |
|
import unicodedata |
|
import unittest |
|
|
|
__version__ = "2.1.1-dev"

# Positions of the ten tab-separated CoNLL-U columns within a split line.
(
    ID, FORM, LEMMA, UPOS, XPOS,
    FEATS, HEAD, DEPREL, DEPS, MISC,
) = range(10)

# Relations counted as "content" relations; a word's DEPREL (subtype removed)
# is tested against this set to decide whether it enters CLAS, MLAS and BLEX.
CONTENT_DEPRELS = {
    "nsubj", "obj", "iobj", "csubj", "ccomp",
    "xcomp", "obl", "vocative", "expl", "dislocated",
    "advcl", "advmod", "discourse", "nmod", "appos",
    "nummod", "acl", "amod", "conj", "fixed",
    "flat", "compound", "list", "parataxis", "orphan",
    "goeswith", "reparandum", "root", "dep",
}

# Relations counted as "functional"; words carrying one of them are recorded
# as functional children of their parent.
FUNCTIONAL_DEPRELS = {
    "aux", "cop", "mark", "det", "clf", "case", "cc",
}

# Feature names retained in the FEATS column; any other feature is dropped
# when a word is loaded.
UNIVERSAL_FEATURES = {
    "PronType", "NumType", "Poss", "Reflex", "Foreign",
    "Abbr", "Gender", "Animacy", "Number", "Case",
    "Definite", "Degree", "VerbForm", "Mood", "Tense",
    "Aspect", "Voice", "Evident", "Polarity", "Person",
    "Polite",
}
|
|
|
|
|
class UDError(Exception):
    """Error raised by this module for invalid CoNLL-U input or mismatched files."""
|
|
|
|
|
def load_conllu(file, single_root=1):
    """Load a CoNLL-U stream into the internal representation used for scoring.

    Args:
        file: Object exposing ``readline()`` that yields CoNLL-U lines; it is
            read until it is exhausted.
        single_root: When truthy, a sentence containing more than one word
            whose HEAD is 0 is rejected.

    Returns:
        A ``UDRepresentation`` instance with the lists ``characters``,
        ``tokens``, ``words`` and ``sentences``.

    Raises:
        UDError: If a line does not have 10 columns, an ID or HEAD cannot be
            parsed or is out of range, a FORM is empty, a sentence contains a
            cycle or (when ``single_root``) several roots, or the input does
            not end with an empty line.
    """

    class UDRepresentation:
        def __init__(self):
            # Characters of all token FORMs, concatenated (Zs characters removed).
            self.characters = []
            # UDSpan instances (one per token) indexing into `characters`.
            self.tokens = []
            # UDWord instances (one per syntactic word).
            self.words = []
            # UDSpan instances (one per sentence) indexing into `characters`.
            self.sentences = []

    class UDSpan:
        def __init__(self, start, end):
            self.start = start
            # `end` is exclusive: a token covers characters[start:end].
            self.end = end

    class UDWord:
        def __init__(self, span, columns, is_multiword):
            # Span of the token this word belongs to; for a word inside a
            # multi-word token this is the span of the whole token.
            self.span = span
            # The ten CoNLL-U columns; FEATS and DEPREL are normalized below.
            self.columns = columns
            # True when the word was read as part of a multi-word token.
            self.is_multiword = is_multiword
            # Parent UDWord; None for HEAD 0, "missing" for HEAD "_".
            self.parent = None
            # Children attached with a functional relation.
            self.functional_children = []

            # Keep only the universal features, in sorted order.
            self.columns[FEATS] = "|".join(sorted(
                feat for feat in columns[FEATS].split("|")
                if feat.split("=", 1)[0] in UNIVERSAL_FEATURES))
            # Drop the relation subtype (everything after the first ':').
            self.columns[DEPREL] = columns[DEPREL].split(":")[0]
            self.is_content_deprel = self.columns[DEPREL] in CONTENT_DEPRELS
            self.is_functional_deprel = self.columns[DEPREL] in FUNCTIONAL_DEPRELS

    ud = UDRepresentation()

    def link_parent(word, sentence_start):
        """Resolve ``word.parent`` (recursively for ancestors) and detect cycles."""
        # "remapping" marks a word whose ancestors are currently being resolved;
        # reaching it again means the HEAD chain loops back.
        if word.parent == "remapping":
            raise UDError("There is a cycle in a sentence")
        if word.parent is not None:
            return
        if word.columns[HEAD] == "_":
            word.parent = "missing"
            return
        try:
            head = int(word.columns[HEAD])
        except ValueError:
            # Words inside multi-word tokens are not validated when read, so a
            # malformed HEAD first surfaces here; report it as UDError.
            raise UDError("Cannot parse HEAD '{}'".format(word.columns[HEAD]))
        if head < 0 or head > len(ud.words) - sentence_start:
            raise UDError("HEAD '{}' points outside of the sentence".format(word.columns[HEAD]))
        if head:
            parent = ud.words[sentence_start + head - 1]
            word.parent = "remapping"
            link_parent(parent, sentence_start)
            word.parent = parent

    index, sentence_start = 0, None
    while True:
        line = file.readline()
        if not line:
            break
        line = line.rstrip("\r\n")

        # Sentence start: comments are skipped only before the first token line.
        if sentence_start is None:
            if line.startswith("#"):
                continue
            ud.sentences.append(UDSpan(index, 0))
            sentence_start = len(ud.words)

        # Empty line: finish the current sentence.
        if not line:
            sentence_words = ud.words[sentence_start:]
            for word in sentence_words:
                link_parent(word, sentence_start)

            # Children are registered in a separate pass because link_parent
            # may visit the same word several times through recursion.
            for word in sentence_words:
                # The parent may be the "missing" placeholder string, which
                # has no functional_children list.
                if isinstance(word.parent, UDWord) and word.is_functional_deprel:
                    word.parent.functional_children.append(word)

            if single_root:
                if sum(1 for word in sentence_words if word.parent is None) > 1:
                    raise UDError("There are multiple roots in a sentence")

            ud.sentences[-1].end = index
            sentence_start = None
            continue

        columns = line.split("\t")
        if len(columns) != 10:
            raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(line))

        # IDs containing '.' are skipped entirely.
        if "." in columns[ID]:
            continue

        # Remove characters of Unicode category Zs from FORM. On Python 2 byte
        # strings are decoded first and re-encoded afterwards.
        py2_bytes = sys.version_info < (3, 0) and isinstance(line, str)
        if py2_bytes:
            columns[FORM] = columns[FORM].decode("utf-8")
        columns[FORM] = "".join(c for c in columns[FORM] if unicodedata.category(c) != "Zs")
        if py2_bytes:
            columns[FORM] = columns[FORM].encode("utf-8")
        if not columns[FORM]:
            raise UDError("There is an empty FORM in the CoNLL-U file")

        # Record the token.
        ud.characters.extend(columns[FORM])
        ud.tokens.append(UDSpan(index, index + len(columns[FORM])))
        index += len(columns[FORM])

        if "-" in columns[ID]:
            # Multi-word token: the following lines hold its words.
            try:
                start, end = map(int, columns[ID].split("-"))
            except ValueError:
                raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID]))

            for _ in range(start, end + 1):
                word_line = file.readline().rstrip("\r\n")
                word_columns = word_line.split("\t")
                if len(word_columns) != 10:
                    raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(word_line))
                ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True))
        else:
            # Plain token that is also a single word.
            try:
                word_id = int(columns[ID])
            except ValueError:
                raise UDError("Cannot parse word ID '{}'".format(columns[ID]))
            expected_id = len(ud.words) - sentence_start + 1
            if word_id != expected_id:
                raise UDError("Incorrect word ID '{}' for word '{}', expected '{}'".format(columns[ID], columns[FORM], expected_id))

            if columns[HEAD] != "_":
                try:
                    head_id = int(columns[HEAD])
                except ValueError:
                    raise UDError("Cannot parse HEAD '{}'".format(columns[HEAD]))
                if head_id < 0:
                    raise UDError("HEAD cannot be negative")

            ud.words.append(UDWord(ud.tokens[-1], columns, is_multiword=False))

    if sentence_start is not None:
        raise UDError("The CoNLL-U file does not end with empty line")

    return ud
|
|
|
|
|
def evaluate(gold_ud, system_ud):
    """Compare a system representation with the gold one and score it.

    Args:
        gold_ud: Gold-standard representation exposing ``characters``,
            ``tokens``, ``sentences`` and ``words``.
        system_ud: System representation with the same attributes.

    Returns:
        dict mapping each metric name ("Tokens", "Sentences", "Words", "UPOS",
        "XPOS", "UFeats", "AllTags", "Lemmas", "UAS", "LAS", "CLAS", "MLAS",
        "BLEX") to a ``Score`` object.

    Raises:
        UDError: If the two character sequences are not identical.
    """

    class Score:
        def __init__(self, gold_total, system_total, correct, aligned_total=None):
            self.correct = correct
            self.gold_total = gold_total
            self.system_total = system_total
            self.aligned_total = aligned_total

            self.precision = 0.0
            if system_total:
                self.precision = correct / system_total

            self.recall = 0.0
            if gold_total:
                self.recall = correct / gold_total

            self.f1 = 0.0
            if system_total + gold_total:
                self.f1 = 2 * correct / (system_total + gold_total)

            # With no aligned words the value is aligned_total itself (None or 0).
            self.aligned_accuracy = aligned_total
            if aligned_total:
                self.aligned_accuracy = correct / aligned_total

    class AlignmentWord:
        def __init__(self, gold_word, system_word):
            self.gold_word = gold_word
            self.system_word = system_word

    class Alignment:
        def __init__(self, gold_words, system_words):
            self.gold_words = gold_words
            self.system_words = system_words
            # Aligned pairs, in alignment order.
            self.matched_words = []
            # Lookup from an aligned system word to its gold word.
            self.matched_words_map = {}

        def append_aligned_words(self, gold_word, system_word):
            self.matched_words.append(AlignmentWord(gold_word, system_word))
            self.matched_words_map[system_word] = gold_word

    def lower(text):
        # On Python 2, byte strings are decoded before lowercasing.
        if sys.version_info < (3, 0) and isinstance(text, str):
            return text.decode("utf-8").lower()
        return text.lower()

    def spans_score(gold_spans, system_spans):
        # Walk both span lists in parallel; a span is correct when start and
        # end both coincide.
        matched = 0
        g, s = 0, 0
        while g < len(gold_spans) and s < len(system_spans):
            gold_span, system_span = gold_spans[g], system_spans[s]
            if system_span.start < gold_span.start:
                s += 1
            elif gold_span.start < system_span.start:
                g += 1
            else:
                if gold_span.end == system_span.end:
                    matched += 1
                s += 1
                g += 1
        return Score(len(gold_spans), len(system_spans), matched)

    def alignment_score(alignment, key_fn=None, filter_fn=None):
        if filter_fn is None:
            gold_count = len(alignment.gold_words)
            system_count = len(alignment.system_words)
            aligned_count = len(alignment.matched_words)
        else:
            gold_count = len([w for w in alignment.gold_words if filter_fn(w)])
            system_count = len([w for w in alignment.system_words if filter_fn(w)])
            aligned_count = len([pair for pair in alignment.matched_words
                                 if filter_fn(pair.gold_word)])

        # Without a key function every aligned word counts as correct.
        if key_fn is None:
            return Score(gold_count, system_count, aligned_count)

        def gold_aligned_gold(word):
            return word

        def gold_aligned_system(word):
            # Translate a system word into its aligned gold word so that
            # parents/children can be compared across the two analyses.
            if word is None:
                return None
            return alignment.matched_words_map.get(word, "NotAligned")

        correct = 0
        for pair in alignment.matched_words:
            if filter_fn is not None and not filter_fn(pair.gold_word):
                continue
            gold_key = key_fn(pair.gold_word, gold_aligned_gold)
            system_key = key_fn(pair.system_word, gold_aligned_system)
            if gold_key == system_key:
                correct += 1

        return Score(gold_count, system_count, correct, aligned_count)

    def beyond_end(words, i, multiword_span_end):
        if i >= len(words):
            return True
        candidate = words[i]
        if candidate.is_multiword:
            return candidate.span.start >= multiword_span_end
        return candidate.span.end > multiword_span_end

    def extend_end(word, multiword_span_end):
        # Only multi-word tokens can push the span end further.
        if not word.is_multiword or word.span.end <= multiword_span_end:
            return multiword_span_end
        return word.span.end

    def find_multiword_span(gold_words, system_words, gi, si):
        # Called only when gold_words[gi] or system_words[si] is multiword.
        gold_word, system_word = gold_words[gi], system_words[si]
        if gold_word.is_multiword:
            span_end = gold_word.span.end
            if not system_word.is_multiword and system_word.span.start < gold_word.span.start:
                si += 1
        else:
            span_end = system_word.span.end
            if not gold_word.is_multiword and gold_word.span.start < system_word.span.start:
                gi += 1
        gs, ss = gi, si

        # Advance both indices until each points past the (possibly growing) span.
        while True:
            gold_done = beyond_end(gold_words, gi, span_end)
            system_done = beyond_end(system_words, si, span_end)
            if gold_done and system_done:
                break
            if gi < len(gold_words) and (si >= len(system_words) or
                                         gold_words[gi].span.start <= system_words[si].span.start):
                span_end = extend_end(gold_words[gi], span_end)
                gi += 1
            else:
                span_end = extend_end(system_words[si], span_end)
                si += 1
        return gs, ss, gi, si

    def compute_lcs(gold_words, system_words, gi, si, gs, ss):
        # Longest-common-subsequence table over lowercased FORMs, filled from
        # the end. An extra all-zero row and column stand in for the
        # "past the end" cells.
        rows, cols = gi - gs, si - ss
        table = [[0] * (cols + 1) for _ in range(rows + 1)]
        for g in range(rows - 1, -1, -1):
            gold_form = lower(gold_words[gs + g].columns[FORM])
            for s in range(cols - 1, -1, -1):
                best = max(table[g + 1][s], table[g][s + 1])
                if gold_form == lower(system_words[ss + s].columns[FORM]):
                    best = max(best, 1 + table[g + 1][s + 1])
                table[g][s] = best
        return table

    def align_words(gold_words, system_words):
        alignment = Alignment(gold_words, system_words)

        gi, si = 0, 0
        while gi < len(gold_words) and si < len(system_words):
            if not (gold_words[gi].is_multiword or system_words[si].is_multiword):
                # No multi-word token involved: align by identical spans.
                gold_span, system_span = gold_words[gi].span, system_words[si].span
                if (gold_span.start, gold_span.end) == (system_span.start, system_span.end):
                    alignment.append_aligned_words(gold_words[gi], system_words[si])
                    gi += 1
                    si += 1
                elif gold_span.start <= system_span.start:
                    gi += 1
                else:
                    si += 1
                continue

            # Multi-word token involved: align via LCS inside the whole span.
            gs, ss, gi, si = find_multiword_span(gold_words, system_words, gi, si)
            if not (si > ss and gi > gs):
                continue

            lcs = compute_lcs(gold_words, system_words, gi, si, gs, ss)
            g, s = 0, 0
            while g < gi - gs and s < si - ss:
                gold_form = lower(gold_words[gs + g].columns[FORM])
                system_form = lower(system_words[ss + s].columns[FORM])
                if gold_form == system_form:
                    alignment.append_aligned_words(gold_words[gs + g], system_words[ss + s])
                    g += 1
                    s += 1
                elif lcs[g][s] == lcs[g + 1][s]:
                    g += 1
                else:
                    s += 1

        return alignment

    # Both files must consist of exactly the same characters.
    gold_chars, system_chars = gold_ud.characters, system_ud.characters
    if gold_chars != system_chars:
        shared = min(len(gold_chars), len(system_chars))
        index = next((i for i in range(shared) if gold_chars[i] != system_chars[i]), shared)
        raise UDError(
            "The concatenation of tokens in gold file and in system file differ!\n" +
            "First 20 differing characters in gold file: '{}' and system file: '{}'".format(
                "".join(gold_chars[index:index + 20]),
                "".join(system_chars[index:index + 20])
            )
        )

    alignment = align_words(gold_ud.words, system_ud.words)

    # Key functions receive the word and a mapper that translates a word into
    # the gold word it corresponds to.
    def upos_key(word, _):
        return word.columns[UPOS]

    def xpos_key(word, _):
        return word.columns[XPOS]

    def feats_key(word, _):
        return word.columns[FEATS]

    def all_tags_key(word, _):
        return (word.columns[UPOS], word.columns[XPOS], word.columns[FEATS])

    def lemma_of(word, gold_of):
        # The lemma is not compared when the gold lemma is "_".
        if gold_of(word).columns[LEMMA] != "_":
            return word.columns[LEMMA]
        return "_"

    def uas_key(word, gold_of):
        return gold_of(word.parent)

    def las_key(word, gold_of):
        return (gold_of(word.parent), word.columns[DEPREL])

    def mlas_key(word, gold_of):
        children = [(gold_of(child), child.columns[DEPREL], child.columns[UPOS], child.columns[FEATS])
                    for child in word.functional_children]
        return (gold_of(word.parent), word.columns[DEPREL], word.columns[UPOS],
                word.columns[FEATS], children)

    def blex_key(word, gold_of):
        return (gold_of(word.parent), word.columns[DEPREL], lemma_of(word, gold_of))

    def is_content(word):
        return word.is_content_deprel

    return {
        "Tokens": spans_score(gold_ud.tokens, system_ud.tokens),
        "Sentences": spans_score(gold_ud.sentences, system_ud.sentences),
        "Words": alignment_score(alignment),
        "UPOS": alignment_score(alignment, upos_key),
        "XPOS": alignment_score(alignment, xpos_key),
        "UFeats": alignment_score(alignment, feats_key),
        "AllTags": alignment_score(alignment, all_tags_key),
        "Lemmas": alignment_score(alignment, lemma_of),
        "UAS": alignment_score(alignment, uas_key),
        "LAS": alignment_score(alignment, las_key),
        "CLAS": alignment_score(alignment, las_key, filter_fn=is_content),
        "MLAS": alignment_score(alignment, mlas_key, filter_fn=is_content),
        "BLEX": alignment_score(alignment, blex_key, filter_fn=is_content),
    }
|
|
|
|
|
def load_conllu_file(path, single_root=1):
    """Open the CoNLL-U file at ``path`` and load it with ``load_conllu``.

    Args:
        path: Path of the file to read.
        single_root: Passed through to ``load_conllu``.

    Returns:
        Whatever ``load_conllu`` returns for the file's contents.
    """
    # Python 2's built-in open() has no ``encoding`` parameter.
    open_kwargs = {"encoding": "utf-8"} if sys.version_info >= (3, 0) else {}
    # The context manager closes the handle even when loading raises; the
    # stream is fully consumed by load_conllu before it returns.
    with open(path, mode="r", **open_kwargs) as _file:
        return load_conllu(_file, single_root)
|
|
|
def evaluate_wrapper(args):
    """Load the gold and system files named in ``args`` and evaluate them.

    Args:
        args: Object with ``gold_file``, ``system_file`` and ``single_root``
            attributes (as produced by the argument parser in ``main``).

    Returns:
        The metrics dictionary returned by ``evaluate``.
    """
    # The gold file is loaded first, then the system file.
    representations = [
        load_conllu_file(path, args.single_root)
        for path in (args.gold_file, args.system_file)
    ]
    return evaluate(*representations)
|
|
|
def main():
    """Command-line entry point: parse arguments, evaluate and print the scores.

    Without ``--verbose``/``--counts`` only the LAS, MLAS and BLEX F1 scores
    are printed; otherwise a table with one row per metric is printed, holding
    either percentages or (with ``--counts``) raw counts.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("gold_file", type=str,
                        help="Name of the CoNLL-U file with the gold data.")
    parser.add_argument("system_file", type=str,
                        help="Name of the CoNLL-U file with the predicted data.")
    parser.add_argument("--verbose", "-v", default=False, action="store_true",
                        help="Print all metrics.")
    parser.add_argument("--counts", "-c", default=False, action="store_true",
                        help="Print raw counts of correct/gold/system/aligned words instead of prec/rec/F1 for all metrics.")
    parser.add_argument("--no_single_root", dest="single_root", default=True, action="store_false",
                        help="Allow multiple roots in a sentence.")
    args = parser.parse_args()

    evaluation = evaluate_wrapper(args)

    if not args.verbose and not args.counts:
        print("LAS F1 Score: {:.2f}".format(100 * evaluation["LAS"].f1))
        print("MLAS Score: {:.2f}".format(100 * evaluation["MLAS"].f1))
        print("BLEX Score: {:.2f}".format(100 * evaluation["BLEX"].f1))
        return

    # Header cells are padded to the 11-character columns used by the
    # separator line and by the row format strings below.
    if args.counts:
        print("Metric     | Correct   |      Gold | Predicted | Aligned")
    else:
        print("Metric     | Precision |    Recall |  F1 Score | AligndAcc")
    print("-----------+-----------+-----------+-----------+-----------")

    for metric in ["Tokens", "Sentences", "Words", "UPOS", "XPOS", "UFeats", "AllTags",
                   "Lemmas", "UAS", "LAS", "CLAS", "MLAS", "BLEX"]:
        score = evaluation[metric]
        if args.counts:
            # Only a missing (None) aligned count is substituted; a genuine
            # count of 0 is printed as 0 rather than as an empty cell.
            aligned = score.aligned_total
            if aligned is None:
                aligned = score.correct if metric == "Words" else ""
            print("{:11}|{:10} |{:10} |{:10} |{:10}".format(
                metric,
                score.correct,
                score.gold_total,
                score.system_total,
                aligned
            ))
        else:
            if score.aligned_accuracy is not None:
                aligned_accuracy = "{:10.2f}".format(100 * score.aligned_accuracy)
            else:
                aligned_accuracy = ""
            print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format(
                metric,
                100 * score.precision,
                100 * score.recall,
                100 * score.f1,
                aligned_accuracy
            ))
|
|
|
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|
|
|
|
|
class TestAlignment(unittest.TestCase):
    """Unit tests for word alignment, checked through the "Words" metric.

    Each test word is written either as ``"form"`` (a plain word) or as
    ``"token word1 word2 ..."`` (a multi-word token followed by its words).
    """

    @staticmethod
    def _load_words(words):
        """Prepare fake CoNLL-U files with fake HEAD to prevent multiple roots errors."""
        word_row = "{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_"
        range_row = "{}-{}\t{}\t_\t_\t_\t_\t_\t_\t_\t_"

        rows = []
        count = 0
        for spec in words:
            pieces = spec.split(" ")
            if len(pieces) == 1:
                count += 1
                # The first word is the root (HEAD 0); all others attach to word 1.
                rows.append(word_row.format(count, pieces[0], int(count > 1)))
                continue

            rows.append(range_row.format(count + 1, count + len(pieces) - 1, pieces[0]))
            for piece in pieces[1:]:
                count += 1
                rows.append(word_row.format(count, piece, int(count > 1)))

        stream_type = io.StringIO if sys.version_info >= (3, 0) else io.BytesIO
        return load_conllu(stream_type("\n".join(rows + ["\n"])))

    def _test_exception(self, gold, system):
        """Assert that evaluating ``system`` against ``gold`` raises UDError."""
        self.assertRaises(UDError, evaluate, self._load_words(gold), self._load_words(system))

    def _test_ok(self, gold, system, correct):
        """Assert precision/recall/F1 of "Words" given ``correct`` aligned words."""
        def count_words(specs):
            # A multi-word spec contributes its words, a plain spec one word.
            return sum(max(1, len(spec.split(" ")) - 1) for spec in specs)

        metrics = evaluate(self._load_words(gold), self._load_words(system))
        gold_words = count_words(gold)
        system_words = count_words(system)
        words = metrics["Words"]
        self.assertEqual((words.precision, words.recall, words.f1),
                         (correct / system_words, correct / gold_words, 2 * correct / (gold_words + system_words)))

    def test_exception(self):
        self._test_exception(["a"], ["b"])

    def test_equal(self):
        self._test_ok(["a"], ["a"], 1)
        self._test_ok(["a", "b", "c"], ["a", "b", "c"], 3)

    def test_equal_with_multiword(self):
        self._test_ok(["abc a b c"], ["a", "b", "c"], 3)
        self._test_ok(["a", "bc b c", "d"], ["a", "b", "c", "d"], 4)
        self._test_ok(["abcd a b c d"], ["ab a b", "cd c d"], 4)
        self._test_ok(["abc a b c", "de d e"], ["a", "bcd b c d", "e"], 5)

    def test_alignment(self):
        self._test_ok(["abcd"], ["a", "b", "c", "d"], 0)
        self._test_ok(["abc", "d"], ["a", "b", "c", "d"], 1)
        self._test_ok(["a", "bc", "d"], ["a", "b", "c", "d"], 2)
        self._test_ok(["a", "bc b c", "d"], ["a", "b", "cd"], 2)
        self._test_ok(["abc a BX c", "def d EX f"], ["ab a b", "cd c d", "ef e f"], 4)
        self._test_ok(["ab a b", "cd bc d"], ["a", "bc", "d"], 2)
        self._test_ok(["a", "bc b c", "d"], ["ab AX BX", "cd CX a"], 1)
|
|