# Copyright 2020 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" CoVal metric. """
import coval  # noqa: F401  # From: git+https://github.com/ns-moosavi/coval.git
import datasets
from coval.conll import reader, util
from coval.eval import evaluator

import evaluate


logger = evaluate.logging.get_logger(__name__)


_CITATION = """\
@InProceedings{moosavi2019minimum,
    author = {Nafise Sadat Moosavi and Leo Born and Massimo Poesio and Michael Strube},
    title = {Using Automatically Extracted Minimum Spans to Disentangle Coreference Evaluation from Boundary Detection},
    year = {2019},
    booktitle = {Proceedings of the 57th Annual Meeting of
        the Association for Computational Linguistics (Volume 1: Long Papers)},
    publisher = {Association for Computational Linguistics},
    address = {Florence, Italy},
}

@inproceedings{10.3115/1072399.1072405,
author = {Vilain, Marc and Burger, John and Aberdeen, John and Connolly, Dennis and Hirschman, Lynette},
title = {A Model-Theoretic Coreference Scoring Scheme},
year = {1995},
isbn = {1558604022},
publisher = {Association for Computational Linguistics},
address = {USA},
url = {https://doi.org/10.3115/1072399.1072405},
doi = {10.3115/1072399.1072405},
booktitle = {Proceedings of the 6th Conference on Message Understanding},
pages = {45--52},
numpages = {8},
location = {Columbia, Maryland},
series = {MUC6 ’95}
}

@INPROCEEDINGS{Bagga98algorithmsfor,
    author = {Amit Bagga and Breck Baldwin},
    title = {Algorithms for Scoring Coreference Chains},
    booktitle = {The First International Conference on Language Resources and Evaluation Workshop on Linguistics Coreference},
    year = {1998},
    pages = {563--566}
}

@INPROCEEDINGS{Luo05oncoreference,
    author = {Xiaoqiang Luo},
    title = {On coreference resolution performance metrics},
    booktitle = {Proceedings of HLT/EMNLP},
    year = {2005},
    pages = {25--32}
}

@inproceedings{moosavi-strube-2016-coreference,
    title = "Which Coreference Evaluation Metric Do You Trust? A Proposal for a Link-based Entity Aware Metric",
    author = "Moosavi, Nafise Sadat  and
      Strube, Michael",
    booktitle = "Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = aug,
    year = "2016",
    address = "Berlin, Germany",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/P16-1060",
    doi = "10.18653/v1/P16-1060",
    pages = "632--642",
}

"""

_DESCRIPTION = """\
CoVal is a coreference evaluation tool for the CoNLL and ARRAU datasets which
implements the common evaluation metrics, including MUC [Vilain et al., 1995],
B-cubed [Bagga and Baldwin, 1998], CEAFe [Luo, 2005],
LEA [Moosavi and Strube, 2016] and the averaged CoNLL score
(the average of the F1 values of MUC, B-cubed and CEAFe)
[Denis and Baldridge, 2009a; Pradhan et al., 2011].

This wrapper of CoVal currently only works with the CoNLL line format:
the CoNLL format has one word per line, with all the annotations for this word in columns separated by spaces:

Column  Type                   Description
1       Document ID            This is a variation on the document filename.
2       Part number            Some files are divided into multiple parts numbered as 000, 001, 002, ... etc.
3       Word number
4       Word itself            This is the token as segmented/tokenized in the Treebank. Initially the *_skel file contains the placeholder [WORD], which gets replaced by the actual token from the Treebank (part of the OntoNotes release).
5       Part-of-Speech
6       Parse bit              This is the bracketed structure broken before the first open parenthesis in the parse, with the word/part-of-speech leaf replaced by a *. The full parse can be created by substituting the asterisk with the "([pos] [word])" string (or leaf) and concatenating the items in the rows of that column.
7       Predicate lemma        The predicate lemma is given for the rows that have semantic role information; all other rows are marked with a "-".
8       Predicate Frameset ID  This is the PropBank frameset ID of the predicate in Column 7.
9       Word sense             This is the word sense of the word in Column 4.
10      Speaker/Author         This is the speaker or author name, where available. Mostly in Broadcast Conversation and Web Log data.
11      Named Entities         Identifies the spans representing various named entities.
12:N    Predicate Arguments    One column of predicate-argument structure information for each predicate mentioned in Column 7.
N       Coreference            Coreference chain information encoded in a parenthesis structure.

More information about the format can be found here (section "*_conll File Format"): http://www.conll.cemantix.org/2012/data.html
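
For example, a single word line (taken from the "Examples" section below; the final column "(116)" is the coreference annotation) looks like:

    bc/cctv/00/cctv_0005   0   1   you   PRP   (NP*)   -   -   -   Xu_li   *   (ARG1*)   (ARG0*)   (116)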

Details on the evaluation on CoNLL can be found here: https://github.com/ns-moosavi/coval/blob/master/conll/README.md

CoVal code was written by @ns-moosavi.
Some parts are borrowed from https://github.com/clarkkev/deep-coref/blob/master/evaluation.py
The test suite is taken from https://github.com/conll/reference-coreference-scorers/
Mention evaluation and the test suite are added by @andreasvc.
Parsing of CoNLL files was developed by Leo Born.
"""

_KWARGS_DESCRIPTION = """
Calculates coreference evaluation metrics.
Args:
    predictions: list of sentences. Each sentence is a list of word predictions to score in the CoNLL format.
        Each prediction is a word with its annotations as a string made of columns joined with spaces.
        Only columns 4, 5, 6 and the last column are used (word, part-of-speech, parse bit and coreference annotation).
        See the details on the format in the description of the metric.
    references: list of sentences. Each sentence is a list of word references to score in the CoNLL format.
        Each reference is a word with its annotations as a string made of columns joined with spaces.
        Only columns 4, 5, 6 and the last column are used (word, part-of-speech, parse bit and coreference annotation).
        See the details on the format in the description of the metric.
    keep_singletons: After extracting all mentions from the key or system files, mentions whose
        corresponding coreference chain is of size one are considered singletons. The default
        evaluation mode includes singletons in the evaluation if they are present in the key or
        the system files. Setting 'keep_singletons=False' excludes all singletons in the key and
        system files from the evaluation.
    NP_only: Most recent coreference resolvers only resolve NP mentions and leave out the
        resolution of VPs. Setting the 'NP_only' option makes the scorer evaluate only the resolution of NPs.
    min_span: Setting 'min_span' makes the scorer report results based on automatically detected
        minimum spans. Minimum spans are determined using the MINA algorithm.

Returns:
    'mentions': mention detection scores
    'muc': MUC metric [Vilain et al., 1995]
    'bcub': B-cubed [Bagga and Baldwin, 1998]
    'ceafe': CEAFe [Luo, 2005]
    'lea': LEA [Moosavi and Strube, 2016]
    'conll_score': averaged CoNLL score (the average of the F1 values of MUC, B-cubed and CEAFe)
    Each of 'mentions', 'muc', 'bcub', 'ceafe' and 'lea' is reported as
    '<name>/recall', '<name>/precision' and '<name>/f1' in the returned dictionary.

Examples:

    >>> coval = evaluate.load('coval')
    >>> words = ['bc/cctv/00/cctv_0005   0   0       Thank   VBP  (TOP(S(VP*    thank  01   1    Xu_li  *           (V*)        *       -',
    ... 'bc/cctv/00/cctv_0005   0   1         you   PRP        (NP*)      -    -   -    Xu_li  *        (ARG1*)   (ARG0*)   (116)',
    ... 'bc/cctv/00/cctv_0005   0   2    everyone    NN        (NP*)      -    -   -    Xu_li  *    (ARGM-DIS*)        *    (116)',
    ... 'bc/cctv/00/cctv_0005   0   3         for    IN        (PP*       -    -   -    Xu_li  *        (ARG2*         *       -',
    ... 'bc/cctv/00/cctv_0005   0   4    watching   VBG   (S(VP*))))   watch  01   1    Xu_li  *             *)      (V*)      -',
    ... 'bc/cctv/00/cctv_0005   0   5           .     .          *))      -    -   -    Xu_li  *             *         *       -']
    >>> references = [words]
    >>> predictions = [words]
    >>> results = coval.compute(predictions=predictions, references=references)
    >>> print(results) # doctest:+ELLIPSIS
    {'mentions/recall': 1.0,[...] 'conll_score': 100.0}
"""


def get_coref_infos(
    key_lines, sys_lines, NP_only=False, remove_nested=False, keep_singletons=True, min_span=False, doc="dummy_doc"
):
    key_doc_lines = {doc: key_lines}
    sys_doc_lines = {doc: sys_lines}

    doc_coref_infos = {}

    key_nested_coref_num = 0
    sys_nested_coref_num = 0
    key_removed_nested_clusters = 0
    sys_removed_nested_clusters = 0
    key_singletons_num = 0
    sys_singletons_num = 0

    key_clusters, singletons_num = reader.get_doc_mentions(doc, key_doc_lines[doc], keep_singletons)
    key_singletons_num += singletons_num

    if NP_only or min_span:
        key_clusters = reader.set_annotated_parse_trees(key_clusters, key_doc_lines[doc], NP_only, min_span)

    sys_clusters, singletons_num = reader.get_doc_mentions(doc, sys_doc_lines[doc], keep_singletons)
    sys_singletons_num += singletons_num

    if NP_only or min_span:
        # Note: parse trees for the system mentions are also read from the key
        # (gold) lines, as minimum spans are based on the gold parse annotation.
        sys_clusters = reader.set_annotated_parse_trees(sys_clusters, key_doc_lines[doc], NP_only, min_span)

    if remove_nested:
        nested_mentions, removed_clusters = reader.remove_nested_coref_mentions(key_clusters, keep_singletons)
        key_nested_coref_num += nested_mentions
        key_removed_nested_clusters += removed_clusters

        nested_mentions, removed_clusters = reader.remove_nested_coref_mentions(sys_clusters, keep_singletons)
        sys_nested_coref_num += nested_mentions
        sys_removed_nested_clusters += removed_clusters

    sys_mention_key_cluster = reader.get_mention_assignments(sys_clusters, key_clusters)
    key_mention_sys_cluster = reader.get_mention_assignments(key_clusters, sys_clusters)

    doc_coref_infos[doc] = (key_clusters, sys_clusters, key_mention_sys_cluster, sys_mention_key_cluster)

    if remove_nested:
        logger.info(
            "Number of removed nested coreferring mentions in the key "
            f"annotation: {key_nested_coref_num}; and system annotation: {sys_nested_coref_num}"
        )
        logger.info(
            "Number of resulting singleton clusters in the key "
            f"annotation: {key_removed_nested_clusters}; and system annotation: {sys_removed_nested_clusters}"
        )

    if not keep_singletons:
        logger.info(
            f"{key_singletons_num:d} and {sys_singletons_num:d} singletons are removed from the key and system "
            "files, respectively"
        )

    return doc_coref_infos
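
# A minimal sketch of driving `get_coref_infos` directly (assuming `key_lines`
# and `sys_lines` carry CoNLL-formatted strings, such as the `words` example in
# the docstring above):
#
#     infos = get_coref_infos(key_lines, sys_lines, keep_singletons=False)
#     key_clusters, sys_clusters, _, _ = infos["dummy_doc"]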


def compute_score(key_lines, sys_lines, metrics, NP_only, remove_nested, keep_singletons, min_span):
    doc_coref_infos = get_coref_infos(key_lines, sys_lines, NP_only, remove_nested, keep_singletons, min_span)

    output_scores = {}
    conll = 0
    conll_subparts_num = 0

    for name, metric in metrics:
        recall, precision, f1 = evaluator.evaluate_documents(doc_coref_infos, metric, beta=1)
        if name in ["muc", "bcub", "ceafe"]:
            conll += f1
            conll_subparts_num += 1
        output_scores.update({f"{name}/recall": recall, f"{name}/precision": precision, f"{name}/f1": f1})

        logger.info(
            f"{name.ljust(10)} Recall: {recall * 100:.2f}"
            f" Precision: {precision * 100:.2f}"
            f" F1: {f1 * 100:.2f}"
        )

    if conll_subparts_num == 3:
        conll = (conll / 3) * 100
        logger.info(f"CoNLL score: {conll:.2f}")
        output_scores.update({"conll_score": conll})

    return output_scores
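
# With the default metric list used in `Coval._compute` below, `compute_score`
# returns a flat dictionary of the form (a sketch; values are placeholders):
#
#     {"mentions/recall": ..., "mentions/precision": ..., "mentions/f1": ...,
#      "muc/recall": ..., ..., "lea/f1": ..., "conll_score": ...}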


def check_gold_parse_annotation(key_lines):
    """Check whether the key (gold) lines carry gold parse annotation.

    Looks at the parse bit (column 6) of the first non-comment data line:
    a "-" there means no gold parse is available.
    """
    has_gold_parse = False
    for line in key_lines:
        if not line.startswith("#"):
            if len(line.split()) > 6:
                parse_col = line.split()[5]
                if not parse_col == "-":
                    has_gold_parse = True
                break
    return has_gold_parse


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Coval(evaluate.EvaluationModule):
    def _info(self):
        return evaluate.EvaluationModuleInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Sequence(datasets.Value("string")),
                    "references": datasets.Sequence(datasets.Value("string")),
                }
            ),
            codebase_urls=["https://github.com/ns-moosavi/coval"],
            reference_urls=[
                "https://github.com/ns-moosavi/coval",
                "https://www.aclweb.org/anthology/P16-1060",
                "http://www.conll.cemantix.org/2012/data.html",
            ],
        )

    def _compute(
        self, predictions, references, keep_singletons=True, NP_only=False, min_span=False, remove_nested=False
    ):
        allmetrics = [
            ("mentions", evaluator.mentions),
            ("muc", evaluator.muc),
            ("bcub", evaluator.b_cubed),
            ("ceafe", evaluator.ceafe),
            ("lea", evaluator.lea),
        ]

        if min_span:
            has_gold_parse = util.check_gold_parse_annotation(references)
            if not has_gold_parse:
                raise NotImplementedError("References should have gold parse annotation to use 'min_span'.")
                # util.parse_key_file(key_file)
                # key_file = key_file + ".parsed"

        score = compute_score(
            key_lines=references,
            sys_lines=predictions,
            metrics=allmetrics,
            NP_only=NP_only,
            remove_nested=remove_nested,
            keep_singletons=keep_singletons,
            min_span=min_span,
        )

        return score
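

# A minimal smoke test (a sketch, not part of the metric's public API): running
# this file directly scores the docstring example document against itself,
# which should give F1 = 1.0 for each metric and a CoNLL score of 100.0.
if __name__ == "__main__":
    demo_words = [
        "bc/cctv/00/cctv_0005   0   0       Thank   VBP  (TOP(S(VP*    thank  01   1    Xu_li  *           (V*)        *       -",
        "bc/cctv/00/cctv_0005   0   1         you   PRP        (NP*)      -    -   -    Xu_li  *        (ARG1*)   (ARG0*)   (116)",
        "bc/cctv/00/cctv_0005   0   2    everyone    NN        (NP*)      -    -   -    Xu_li  *    (ARGM-DIS*)        *    (116)",
        "bc/cctv/00/cctv_0005   0   3         for    IN        (PP*       -    -   -    Xu_li  *        (ARG2*         *       -",
        "bc/cctv/00/cctv_0005   0   4    watching   VBG   (S(VP*))))   watch  01   1    Xu_li  *             *)      (V*)      -",
        "bc/cctv/00/cctv_0005   0   5           .     .          *))      -    -   -    Xu_li  *             *         *       -",
    ]
    # Mirror the structure `_compute` receives: a list of sentences, each a
    # list of CoNLL word lines.
    demo_scores = compute_score(
        key_lines=[demo_words],
        sys_lines=[demo_words],
        metrics=[("muc", evaluator.muc), ("bcub", evaluator.b_cubed), ("ceafe", evaluator.ceafe)],
        NP_only=False,
        remove_nested=False,
        keep_singletons=True,
        min_span=False,
    )
    print(demo_scores)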