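"""Tests for text-generation evaluation metrics.

Most metrics are loaded through the HuggingFace ``evaluate`` library; QuIP is
a local metric from ``metrics.quip``. Each test runs in a forked subprocess
via ``wrap_test_forked``, presumably to isolate heavyweight metric/model state
between tests.
"""
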
from tests.utils import wrap_test_forked


@wrap_test_forked
def test_bleurt():
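    # BLEURT is a learned (regression-based) metric; its scores are not bounded
    # to [0, 1], so identical strings can score slightly above 1
    # (checkpoint-dependent), as the expected values below show.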
    predictions = ["hello there", "general kenobi"]
    references = ["hello there", "general kenobi"]
    import evaluate
    bleurt = evaluate.load("bleurt")
    results = bleurt.compute(predictions=predictions, references=references)
    assert [round(v, 2) for v in results["scores"]] == [1.03, 1.04]


@wrap_test_forked
def test_sacrebleu():
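    # SacreBLEU reports corpus-level BLEU on a 0-100 scale and accepts
    # multiple references per prediction.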
    predictions = ["hello there general kenobi", "foo bar foobar"]
    references = [["hello there general kenobi", "hello there !"], ["foo bar foobar", "foo bar foobar"]]
    import evaluate
    sacrebleu = evaluate.load("sacrebleu")
    results = sacrebleu.compute(predictions=predictions, references=references)

    assert list(results.keys()) == ['score', 'counts', 'totals', 'precisions', 'bp', 'sys_len', 'ref_len']
    assert round(results["score"], 1) == 100.0

    predictions = ["hello there general kenobi", "on our way to ankh morpork"]
    references = [["hello there general kenobi", "hello there !"], ["goodbye ankh morpork", "ankh morpork"]]
    results = sacrebleu.compute(predictions=predictions, references=references)
    assert list(results.keys()) == ['score', 'counts', 'totals', 'precisions', 'bp', 'sys_len', 'ref_len']
    assert round(results["score"], 1) == 39.8


@wrap_test_forked
def test_bleu():
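    # HF `bleu` reports BLEU on a 0-1 scale (unlike sacrebleu's 0-100).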
    predictions = ["hello there general kenobi", "foo bar foobar"]
    references = [
        ["hello there general kenobi", "hello there!"],
        ["foo bar foobar"]
    ]
    import evaluate
    bleu = evaluate.load("bleu")
    results = bleu.compute(predictions=predictions, references=references)
    assert results["bleu"] == 1.0


@wrap_test_forked
def test_squad_v1():
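    # The SQuAD v1 metric reports exact-match and F1 as percentages.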
    predictions = [{'prediction_text': '1976', 'id': '56e10a3be3433e1400422b22'}]
    references = [{'answers': {'answer_start': [97], 'text': ['1976']}, 'id': '56e10a3be3433e1400422b22'}]
    import evaluate
    squad_metric = evaluate.load("squad")
    results = squad_metric.compute(predictions=predictions, references=references)
    assert results == {'exact_match': 100.0, 'f1': 100.0}


@wrap_test_forked
def test_squad_v2():
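    # SQuAD v2 adds unanswerable questions; no_answer_probability=0.0 marks
    # the prediction as confidently answerable.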
    predictions = [{'prediction_text': '1976', 'id': '56e10a3be3433e1400422b22', 'no_answer_probability': 0.}]
    references = [{'answers': {'answer_start': [97], 'text': ['1976']}, 'id': '56e10a3be3433e1400422b22'}]
    import evaluate
    squad_v2_metric = evaluate.load("squad_v2")
    results = squad_v2_metric.compute(predictions=predictions, references=references)
    assert results == {'exact': 100.0, 'f1': 100.0, 'total': 1, 'HasAns_exact': 100.0, 'HasAns_f1': 100.0,
                       'HasAns_total': 1, 'best_exact': 100.0, 'best_exact_thresh': 0.0, 'best_f1': 100.0,
                       'best_f1_thresh': 0.0}


@wrap_test_forked
def test_rouge():
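    # HF `rouge` returns aggregated F-measures in [0, 1] for each ROUGE variant.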
    import evaluate
    rouge = evaluate.load('rouge')
    predictions = ["hello there", "general kenobi"]
    references = ["hello there", "general kenobi"]
    results = rouge.compute(predictions=predictions, references=references)
    assert results == {'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}


@wrap_test_forked
def test_bertscore():
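    # BERTScore compares predictions and references with contextual token
    # embeddings; lang="en" selects the default English model.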
    predictions = ["hello there", "general kenobi"]
    references = ["hello there", "general kenobi"]
    import evaluate
    bertscore = evaluate.load("bertscore")
    results = bertscore.compute(predictions=predictions, references=references, lang="en")
    assert [round(v, 2) for v in results["f1"]] == [1.0, 1.0]


@wrap_test_forked
def test_chrf():
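    # chrF is a character n-gram F-score on a 0-100 scale (char_order=6 and
    # beta=2 by default; word_order=0 means no word n-grams).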
    prediction = ["The relationship between cats and dogs is not exactly friendly.",
                  "a good bookshop is just a genteel black hole that knows how to read."]
    reference = [["The relationship between dogs and cats is not exactly friendly.", ],
                 ["A good bookshop is just a genteel Black Hole that knows how to read."]]
    import evaluate
    chrf = evaluate.load("chrf")
    results = chrf.compute(predictions=prediction, references=reference)
    assert results == {'score': 84.64214891738334, 'char_order': 6, 'word_order': 0, 'beta': 2}


@wrap_test_forked
def test_chrfpp():
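    # chrF++ is chrF with word n-grams added via word_order=2.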
    prediction = ["The relationship between cats and dogs is not exactly friendly.",
                  "a good bookshop is just a genteel black hole that knows how to read."]
    reference = [["The relationship between dogs and cats is not exactly friendly.", ],
                 ["A good bookshop is just a genteel Black Hole that knows how to read."]]
    import evaluate
    chrf = evaluate.load("chrf")
    results = chrf.compute(predictions=prediction, references=reference, word_order=2)
    assert results == {'beta': 2, 'char_order': 6, 'score': 82.87263732906315, 'word_order': 2}


@wrap_test_forked
def test_wiki_split():
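    # wiki_split combines SARI, sacreBLEU, and exact match for text
    # simplification; it needs the source sentences in addition to
    # predictions and references.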
    sources = ["About 95 species are currently accepted ."]
    predictions = ["About 95 you now get in ."]
    references = [["About 95 species are currently known ."]]
    import evaluate
    wiki_split = evaluate.load("wiki_split")
    results = wiki_split.compute(sources=sources, predictions=predictions, references=references)
    assert results == {'sari': 21.805555555555557, 'sacrebleu': 14.535768424205482, 'exact': 0.0}


@wrap_test_forked
def test_super_glue():
    from evaluate import load
    # https://huggingface.co/datasets/boolq
    # passage, question, answer (answers are boolean only, but an LLM can be
    # prompted to reply with just true or false)
    super_glue_metric = load('super_glue', 'boolq')  # any of ["copa", "rte", "wic", "wsc", "wsc.fixed", "boolq", "axg"]
    predictions = [0, 1]
    references = [0, 1]
    results = super_glue_metric.compute(predictions=predictions, references=references)
    assert results == {'accuracy': 1.0}


@wrap_test_forked
def test_quip():
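    # QuIP is a local metric (metrics.quip): it appears to measure how much of
    # the prediction is quoted verbatim (n-gram overlap) from the reference
    # passages. min_len seems to set the shortest n-gram counted, and the
    # return_* flags switch the output to a raw match count or a fraction of
    # the prediction length.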
    from metrics.quip import Quip
    quip = Quip()

    predictions = ["Kathy's hair is green according to the first passage."]
    references = [["Kathy's hair is green.", "Bob is eating a sandwich.", "The sky is red with polka dots.",
                   "Alice went to the county fair.", "George is reading a newspaper."]]
    results = quip.compute(predictions=predictions, references=references)
    print(results)
    assert results == 0.16666666666666663

    predictions = ["How much wood would a woodchuck chuck if a woodchuck could chuck wood?"]
    references = [["Kathy's hair is green.", "Bob is eating a sandwich.", "The sky is red with polka dots.",
                   "Alice went to the county fair.", "George is reading a newspaper."]]
    results = quip.compute(predictions=predictions, references=references)
    print(results)
    assert results == 0.0

    predictions = ["How much wood would a woodchuck chuck if a woodchuck could chuck wood?"]
    references = [["chuck", "wood"]]
    results = quip.compute(predictions=predictions, references=references)
    print(results)
    assert results == 0.0

    predictions = ["How much wood would a woodchuck chuck if a woodchuck could chuck wood?"]
    references = [["chuck", "woodchuck"]]
    results = quip.compute(predictions=predictions, references=references)
    print(results)
    assert results == 0.0

    predictions = ["How much wood would a woodchuck chuck if a woodchuck could chuck wood?"]
    references = [["chuck", "woodchuck"]]
    results = quip.compute(predictions=predictions, references=references, min_len=1)
    print(results)
    assert results == 0.09523809523809523

    predictions = ["How much wood would a woodchuck chuck if a woodchuck could chuck wood?"]
    references = [["woodchuck chuck", "chuck"]]
    results = quip.compute(predictions=predictions, references=references)
    print(results)
    assert results == 0.05882352941176472

    predictions = ["The current goodwill balance is $25,173 million as of December 31, 2022."]
    references = [[
                      "Table 7.3: Goodwill (in millions) Consumer Banking and Lending Commercial Banking Corporate and Investment Banking Wealth and Investment Management Corporate Consolidated Company December 31, 2020 $ 16,418 3,018 5,375 1,276 305 26,392 Foreign currency translation β€” β€” β€” β€” β€” β€” Transfers of goodwill β€” (80) β€” (932) 1,012 β€” Divestitures β€” β€” β€” β€” (1,212) (1,212) December 31, 2021 $ 16,418 2,938 5,375 344 105 25,180 Foreign currency translation β€” (7) β€” β€” β€” (7) December 31, 2022 $ 16,418 2,931 5,375 344 105 25,173 Table 7.4 presents the components of other assets."]]
    results = quip.compute(predictions=predictions, references=references, min_len=1)
    print(results)
    assert results == 0.33333333333333337

    predictions = ["The current goodwill balance is $25,173 million as of December 31, 2022."]
    references = [[
                      "Table 7.3: Goodwill (in millions) Consumer Banking and Lending Commercial Banking Corporate and Investment Banking Wealth and Investment Management Corporate Consolidated Company December 31, 2020 $ 16,418 3,018 5,375 1,276 305 26,392 Foreign currency translation β€” β€” β€” β€” β€” β€” Transfers of goodwill β€” (80) β€” (932) 1,012 β€” Divestitures β€” β€” β€” β€” (1,212) (1,212) December 31, 2021 $ 16,418 2,938 5,375 344 105 25,180 Foreign currency translation β€” (7) β€” β€” β€” (7) December 31, 2022 $ 16,418 2,931 5,375 344 105 25,173 Table 7.4 presents the components of other assets."]]
    results = quip.compute(predictions=predictions, references=references, return_match_count=True)
    print(results)
    assert results == 4

    predictions = ["The current goodwill balance is $25,173 million as of December 31, 2022."]
    references = [[
                      "Table 7.3: Goodwill (in millions) Consumer Banking and Lending Commercial Banking Corporate and Investment Banking Wealth and Investment Management Corporate Consolidated Company December 31, 2020 $ 16,418 3,018 5,375 1,276 305 26,392 Foreign currency translation β€” β€” β€” β€” β€” β€” Transfers of goodwill β€” (80) β€” (932) 1,012 β€” Divestitures β€” β€” β€” β€” (1,212) (1,212) December 31, 2021 $ 16,418 2,938 5,375 344 105 25,180 Foreign currency translation β€” (7) β€” β€” β€” (7) December 31, 2022 $ 16,418 2,931 5,375 344 105 25,173 Table 7.4 presents the components of other assets."]]
    results = quip.compute(predictions=predictions, references=references, return_match_fraction_by_pred_length=True)
    print(results)
    assert results == 0.5

    predictions = ["How much wood would a woodchuck chuck if a woodchuck could chuck wood?"]
    references = [[
                      "Table 7.3: Goodwill (in millions) Consumer Banking and Lending Commercial Banking Corporate and Investment Banking Wealth and Investment Management Corporate Consolidated Company December 31, 2020 $ 16,418 3,018 5,375 1,276 305 26,392 Foreign currency translation β€” β€” β€” β€” β€” β€” Transfers of goodwill β€” (80) β€” (932) 1,012 β€” Divestitures β€” β€” β€” β€” (1,212) (1,212) December 31, 2021 $ 16,418 2,938 5,375 344 105 25,180 Foreign currency translation β€” (7) β€” β€” β€” (7) December 31, 2022 $ 16,418 2,931 5,375 344 105 25,173 Table 7.4 presents the components of other assets."]]
    results = quip.compute(predictions=predictions, references=references, return_match_fraction_by_pred_length=True)
    print(results)
    assert results == 0.0


@wrap_test_forked
def test_glue():
    # entailment
    """
    E.g. for qnli:
    The Stanford Question Answering Dataset is a question-answering dataset consisting of question-paragraph pairs,
    where one of the sentences in the paragraph (drawn from Wikipedia) contains the answer to the corresponding
    question (written by an annotator). The authors of the benchmark convert the task into sentence pair
    classification by forming a pair between each question and each sentence in the corresponding context,
    and filtering out pairs with low lexical overlap between the question and the context sentence.

    The task is to determine whether the context sentence contains the answer to the question.
    This modified version of the original task removes the requirement that the model select the exact answer,
    but also removes the simplifying assumptions that the answer is always present in the input
    and that lexical overlap is a reliable cue.
    """
    from evaluate import load
    glue_metric = load('glue', 'qnli')
    references = [0, 1]
    predictions = [1, 1]
    results = glue_metric.compute(predictions=predictions, references=references)
    assert results == {'accuracy': 0.5}


@wrap_test_forked
def test_google_bleu():
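    # Google BLEU (GLEU) is a sentence-level BLEU variant scored in [0, 1];
    # min_len/max_len bound the n-gram lengths that are counted.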
    sentence1 = "the cat sat on the mat"
    sentence2 = "the cat ate the mat"
    import evaluate
    google_bleu = evaluate.load("google_bleu")
    result = google_bleu.compute(predictions=[sentence1], references=[[sentence2]])
    assert result == {'google_bleu': 0.3333333333333333}

    predictions = ['It is a guide to action which ensures that the rubber duck always disobeys the commands of the cat',
                   'he read the book because he was interested in world history']
    references = [
        ['It is the guiding principle which guarantees the rubber duck forces never being under the command of the cat',
         'It is a guide to action that ensures that the rubber duck will never heed the cat commands',
         'It is the practical guide for the rubber duck army never to heed the directions of the cat'],
        ['he was interested in world history because he read the book']]
    results = google_bleu.compute(predictions=predictions, references=references, min_len=2, max_len=6)
    assert round(results["google_bleu"], 2) == 0.4


@wrap_test_forked
def test_meteor():
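    # METEOR aligns unigrams with stemming and synonym matching and weights
    # recall over precision; scores are in [0, 1].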
    import evaluate
    meteor = evaluate.load('meteor')
    predictions = ["It is a guide to action which ensures that the military always obeys the commands of the party"]
    references = [['It is a guide to action that ensures that the military will forever heed Party commands',
                   'It is the guiding principle which guarantees the military forces always being under the command of the Party',
                   'It is the practical guide for the army always to heed the directions of the party']]
    results = meteor.compute(predictions=predictions, references=references)
    assert round(results['meteor'], 2) == 0.69

    predictions = ["Kathy's hair is green according to the first passage."]
    references = [["Kathy's hair is green.", "Bob is eating a sandwich.", "The sky is red with polka dots.",
                   "Alice went to the county fair.", "George is reading a newspaper."]]
    results = meteor.compute(predictions=predictions, references=references)
    print(results)
    assert results == {'meteor': 0.9059829059829061}