lvwerra committed
Commit
99711f3
1 Parent(s): b7ae728

Update Space (evaluate main: 2253a6e1)

Files changed (5)
  1. README.md +75 -5
  2. app.py +6 -0
  3. nist_mt.py +132 -0
  4. requirements.txt +2 -0
  5. tests.py +34 -0
README.md CHANGED
@@ -1,12 +1,82 @@
---
- title: Nist Mt
- emoji:
- colorFrom: pink
+ title: NIST_MT
+ emoji: 🤗
+ colorFrom: purple
colorTo: red
sdk: gradio
- sdk_version: 3.12.0
+ sdk_version: 3.0.2
app_file: app.py
pinned: false
+ tags:
+ - evaluate
+ - metric
+ - machine-translation
+ description:
+   DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU score.
---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Metric Card for NIST's MT metric
+
+
+ ## Metric Description
+ DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU
+ score. The official script used by NIST to compute BLEU and NIST score is
+ mteval-14.pl. The main differences are:
+
+ - BLEU uses geometric mean of the ngram precisions, NIST uses arithmetic mean.
+ - NIST has a different brevity penalty
+ - NIST score from mteval-14.pl has a self-contained tokenizer (in the Hugging Face implementation we rely on NLTK's
+   implementation of the NIST-specific tokenizer)
+
+ ## Intended Uses
+ NIST was developed for machine translation evaluation.
+
+ ## How to Use
+
+ ```python
+ import evaluate
+ nist_mt = evaluate.load("nist_mt")
+ hypothesis1 = "It is a guide to action which ensures that the military always obeys the commands of the party"
+ reference1 = "It is a guide to action that ensures that the military will forever heed Party commands"
+ reference2 = "It is the guiding principle which guarantees the military forces always being under the command of the Party"
+ nist_mt.compute(predictions=[hypothesis1], references=[[reference1, reference2]])
+ # {'nist_mt': 3.3709935957649324}
+ ```
+
+ ### Inputs
+ - **predictions** (`str` or `list` of `str`): predictions to score. A single sentence for sentence-level NIST;
+   a list of sentences for corpus-level NIST. Tokenization is handled internally by NLTK's NIST tokenizer.
+ - **references** (`list` of `str` or `list` of `list` of `str`): potentially multiple references for each
+   prediction. For corpus-level NIST, one list of reference sentences per prediction.
+ - **n** (`int`, defaults to `5`): highest n-gram order
+ - **lowercase** (`bool`, defaults to `False`): whether to lowercase the data (only if `western_lang` is `True`)
+ - **western_lang** (`bool`, defaults to `True`): whether the language is a Western language, enabling some specific tokenization rules for, e.g., punctuation
+
+ ### Output Values
+ - **nist_mt** (`float`): NIST score
+
+ Output Example:
+ ```python
+ {'nist_mt': 3.3709935957649324}
+ ```
+
+
+ ## Citation
+ ```bibtex
+ @inproceedings{10.5555/1289189.1289273,
+     author = {Doddington, George},
+     title = {Automatic Evaluation of Machine Translation Quality Using N-Gram Co-Occurrence Statistics},
+     year = {2002},
+     publisher = {Morgan Kaufmann Publishers Inc.},
+     address = {San Francisco, CA, USA},
+     booktitle = {Proceedings of the Second International Conference on Human Language Technology Research},
+     pages = {138–145},
+     numpages = {8},
+     location = {San Diego, California},
+     series = {HLT '02}
+ }
+ ```
+
+ ## Further References
+
+ This Hugging Face implementation uses [the NLTK implementation](https://github.com/nltk/nltk/blob/develop/nltk/translate/nist_score.py).
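As an aside on the metric description above: the first listed difference (arithmetic vs. geometric mean) can be seen in a toy sketch like the following. It is not part of the commit, the precision values are invented, and real NIST additionally weights each n-gram by its information gain rather than averaging raw precisions.

```python
# Toy illustration: arithmetic mean (NIST-style aggregation) vs geometric mean
# (BLEU-style aggregation) over hypothetical per-order n-gram precisions.
precisions = [0.8, 0.5, 0.25, 0.1]  # made-up precisions for n = 1..4

arithmetic_mean = sum(precisions) / len(precisions)  # 0.4125
geometric_mean = 1.0
for p in precisions:
    geometric_mean *= p
geometric_mean **= 1.0 / len(precisions)  # ~0.316

# The geometric mean is dragged down by the weak 4-gram precision,
# while the arithmetic mean is more forgiving of it.
print(arithmetic_mean, geometric_mean)
```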
app.py ADDED
@@ -0,0 +1,6 @@
+ import evaluate
+ from evaluate.utils import launch_gradio_widget
+
+
+ module = evaluate.load("nist_mt")
+ launch_gradio_widget(module)
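For context: `launch_gradio_widget` renders the standard demo widget for a loaded evaluate module, so running `python app.py` locally (assuming `gradio` and the pinned `evaluate` revision from requirements.txt are installed) should reproduce the hosted Space UI.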
nist_mt.py ADDED
@@ -0,0 +1,132 @@
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """NLTK's NIST implementation on both the sentence and corpus level"""
+ from typing import Dict, Optional
+
+ import datasets
+ import nltk
+ from datasets import Sequence, Value
+
+
+ try:
+     nltk.data.find("perluniprops")
+ except LookupError:
+     nltk.download("perluniprops", quiet=True)  # NISTTokenizer requirement
+
+ from nltk.tokenize.nist import NISTTokenizer
+ from nltk.translate.nist_score import corpus_nist, sentence_nist
+
+ import evaluate
+
+
+ _CITATION = """\
+ @inproceedings{10.5555/1289189.1289273,
+     author = {Doddington, George},
+     title = {Automatic Evaluation of Machine Translation Quality Using N-Gram Co-Occurrence Statistics},
+     year = {2002},
+     publisher = {Morgan Kaufmann Publishers Inc.},
+     address = {San Francisco, CA, USA},
+     booktitle = {Proceedings of the Second International Conference on Human Language Technology Research},
+     pages = {138–145},
+     numpages = {8},
+     location = {San Diego, California},
+     series = {HLT '02}
+ }
+ """
+
+ _DESCRIPTION = """\
+ DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU
+ score. The official script used by NIST to compute BLEU and NIST score is
+ mteval-14.pl. The main differences are:
+
+  - BLEU uses geometric mean of the ngram precisions, NIST uses arithmetic mean.
+  - NIST has a different brevity penalty
+  - NIST score from mteval-14.pl has a self-contained tokenizer (in the Hugging Face implementation we rely on NLTK's
+ implementation of the NIST-specific tokenizer)
+ """
+
+
+ _KWARGS_DESCRIPTION = """
+ Computes NIST score of translated segments against one or more references.
+ Args:
+     predictions: predictions to score (list of str)
+     references: potentially multiple references for each prediction (list of str or list of list of str)
+     n: highest n-gram order
+     lowercase: whether to lowercase the data (only applicable if 'western_lang' is True)
+     western_lang: whether the current language is a Western language, which will enable some specific tokenization
+         rules with respect to, e.g., punctuation
+
+ Returns:
+     'nist_mt': nist_mt score
+ Examples:
+     >>> nist_mt = evaluate.load("nist_mt")
+     >>> hypothesis = "It is a guide to action which ensures that the military always obeys the commands of the party"
+     >>> reference1 = "It is a guide to action that ensures that the military will forever heed Party commands"
+     >>> reference2 = "It is the guiding principle which guarantees the military forces always being under the command of the Party"
+     >>> reference3 = "It is the practical guide for the army always to heed the directions of the party"
+     >>> nist_mt.compute(predictions=[hypothesis], references=[[reference1, reference2, reference3]])
+     {'nist_mt': 3.3709935957649324}
+     >>> nist_mt.compute(predictions=[hypothesis], references=[reference1])
+     {'nist_mt': 2.4477124183006533}
+ """
+
+
+ @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+ class NistMt(evaluate.Metric):
+     """A wrapper around NLTK's NIST implementation."""
+
+     def _info(self):
+         return evaluate.MetricInfo(
+             module_type="metric",
+             description=_DESCRIPTION,
+             citation=_CITATION,
+             inputs_description=_KWARGS_DESCRIPTION,
+             features=[
+                 datasets.Features(
+                     {
+                         "predictions": Value("string", id="prediction"),
+                         "references": Sequence(Value("string", id="reference"), id="references"),
+                     }
+                 ),
+                 datasets.Features(
+                     {
+                         "predictions": Value("string", id="prediction"),
+                         "references": Value("string", id="reference"),
+                     }
+                 ),
+             ],
+             homepage="https://www.nltk.org/api/nltk.translate.nist_score.html",
+             codebase_urls=["https://github.com/nltk/nltk/blob/develop/nltk/translate/nist_score.py"],
+             reference_urls=["https://en.wikipedia.org/wiki/NIST_(metric)"],
+         )
+
+     def _compute(self, predictions, references, n: int = 5, lowercase=False, western_lang=True):
+         tokenizer = NISTTokenizer()
+
+         # Account for single reference cases: references always need to have one more dimension than predictions
+         if isinstance(references[0], str):
+             references = [[ref] for ref in references]
+
+         predictions = [
+             tokenizer.tokenize(pred, return_str=False, lowercase=lowercase, western_lang=western_lang)
+             for pred in predictions
+         ]
+         references = [
+             [
+                 tokenizer.tokenize(ref, return_str=False, lowercase=lowercase, western_lang=western_lang)
+                 for ref in ref_sentences
+             ]
+             for ref_sentences in references
+         ]
+         return {"nist_mt": corpus_nist(list_of_references=references, hypotheses=predictions, n=n)}
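The `_compute` method above boils down to NIST tokenization plus NLTK's `corpus_nist`. A minimal standalone sketch of the same single-reference path (not part of the commit; it assumes `nltk` with the `perluniprops` resource is installed):

```python
from nltk.tokenize.nist import NISTTokenizer
from nltk.translate.nist_score import corpus_nist

tokenizer = NISTTokenizer()
hypothesis = "It is a guide to action which ensures that the military always obeys the commands of the party"
reference = "It is a guide to action that ensures that the military will forever heed Party commands"

# Tokenize raw strings, as _compute does before scoring
hyp_tokens = tokenizer.tokenize(hypothesis, return_str=False, lowercase=False, western_lang=True)
ref_tokens = tokenizer.tokenize(reference, return_str=False, lowercase=False, western_lang=True)

# References need one more nesting level than hypotheses:
# one list of (possibly several) references per hypothesis.
score = corpus_nist(list_of_references=[[ref_tokens]], hypotheses=[hyp_tokens], n=5)
print(score)  # ~2.4477, matching the second docstring example above
```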
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ git+https://github.com/huggingface/evaluate@2253a6e12a4b4c2c05ef77b84ea6c0f1188ac926
+ nltk
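The first line pins `evaluate` to the exact main-branch commit named in the commit message (2253a6e1), so the Space runs against the same library revision that generated it; `pip install -r requirements.txt` (with git available) installs both dependencies.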
tests.py ADDED
@@ -0,0 +1,34 @@
+ from _pytest.fixtures import fixture
+ from nist_mt import NistMt
+
+
+ nist = NistMt()
+
+
+ @fixture
+ def hypothesis_sent():
+     return "It is a guide to action which ensures that the military always obeys the commands of the party"
+
+
+ @fixture
+ def reference_sent1():
+     return "It is a guide to action that ensures that the military will forever heed Party commands"
+
+
+ @fixture
+ def reference_sent2():
+     return (
+         "It is the guiding principle which guarantees the military forces always being under the command of the Party"
+     )
+
+
+ @fixture
+ def reference_sent3():
+     return "It is the practical guide for the army always to heed the directions of the party"
+
+
+ def test_nist_sentence(hypothesis_sent, reference_sent1, reference_sent2, reference_sent3):
+     nist_score = nist.compute(
+         predictions=[hypothesis_sent], references=[[reference_sent1, reference_sent2, reference_sent3]]
+     )
+     assert abs(nist_score["nist_mt"] - 3.3709935957649324) < 1e-6
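A natural companion test (hypothetical, not part of this commit) would exercise the single-string-reference path that `_compute` expands to `[[ref], ...]`; the expected value is taken from the second docstring example in nist_mt.py:

```python
def test_nist_sentence_single_reference(hypothesis_sent, reference_sent1):
    # One plain-string reference per prediction; _compute wraps it to [[ref]]
    nist_score = nist.compute(predictions=[hypothesis_sent], references=[reference_sent1])
    assert abs(nist_score["nist_mt"] - 2.4477124183006533) < 1e-6
```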