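# Smoke tests for reference-based text metrics, mostly loaded from the
# HuggingFace `evaluate` hub. Note that BLEURT is a learned (BERT-based)
# regression metric, so its scores are not bounded to [0, 1]: identical
# prediction/reference pairs can legitimately score slightly above 1,
# as asserted below.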
from tests.utils import wrap_test_forked


@wrap_test_forked
def test_bleurt():
    predictions = ["hello there", "general kenobi"]
    references = ["hello there", "general kenobi"]
    import evaluate
    bleurt = evaluate.load("bleurt")
    results = bleurt.compute(predictions=predictions, references=references)
    assert [round(v, 2) for v in results["scores"]] == [1.03, 1.04]
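

# sacrebleu is the standardized BLEU implementation (fixed tokenization and
# smoothing), reported on a 0-100 scale; each prediction may carry several
# references, and the corpus-level "score" is what papers usually quote.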
@wrap_test_forked
def test_sacrebleu():
    predictions = ["hello there general kenobi", "foo bar foobar"]
    references = [["hello there general kenobi", "hello there !"], ["foo bar foobar", "foo bar foobar"]]
    import evaluate
    sacrebleu = evaluate.load("sacrebleu")
    results = sacrebleu.compute(predictions=predictions, references=references)
    assert list(results.keys()) == ['score', 'counts', 'totals', 'precisions', 'bp', 'sys_len', 'ref_len']
    assert round(results["score"], 1) == 100.0

    predictions = ["hello there general kenobi", "on our way to ankh morpork"]
    references = [["hello there general kenobi", "hello there !"], ["goodbye ankh morpork", "ankh morpork"]]
    sacrebleu = evaluate.load("sacrebleu")
    results = sacrebleu.compute(predictions=predictions, references=references)
    assert list(results.keys()) == ['score', 'counts', 'totals', 'precisions', 'bp', 'sys_len', 'ref_len']
    assert round(results["score"], 1) == 39.8
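

# Plain BLEU: modified n-gram precisions (n = 1..4) combined by a geometric
# mean and scaled by a brevity penalty, on a 0-1 scale here. Unlike sacrebleu,
# the per-prediction reference lists may have different lengths.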
@wrap_test_forked
def test_bleu():
    predictions = ["hello there general kenobi", "foo bar foobar"]
    references = [
        ["hello there general kenobi", "hello there!"],
        ["foo bar foobar"]
    ]
    import evaluate
    bleu = evaluate.load("bleu")
    results = bleu.compute(predictions=predictions, references=references)
    assert results["bleu"] == 1.0
@wrap_test_forked
def test_squad_v1():
    predictions = [{'prediction_text': '1976', 'id': '56e10a3be3433e1400422b22'}]
    references = [{'answers': {'answer_start': [97], 'text': ['1976']}, 'id': '56e10a3be3433e1400422b22'}]
    import evaluate
    squad_metric = evaluate.load("squad")
    results = squad_metric.compute(predictions=predictions, references=references)
    assert results == {'exact_match': 100.0, 'f1': 100.0}
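

# SQuAD v2 adds unanswerable questions, so each prediction carries a
# no_answer_probability, and the result includes threshold-tuned best_exact /
# best_f1 alongside the HasAns breakdown.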
@wrap_test_forked
def test_squad_v2():
    predictions = [{'prediction_text': '1976', 'id': '56e10a3be3433e1400422b22', 'no_answer_probability': 0.}]
    references = [{'answers': {'answer_start': [97], 'text': ['1976']}, 'id': '56e10a3be3433e1400422b22'}]
    import evaluate
    squad_v2_metric = evaluate.load("squad_v2")
    results = squad_v2_metric.compute(predictions=predictions, references=references)
    assert results == {'exact': 100.0, 'f1': 100.0, 'total': 1, 'HasAns_exact': 100.0, 'HasAns_f1': 100.0,
                       'HasAns_total': 1, 'best_exact': 100.0, 'best_exact_thresh': 0.0, 'best_f1': 100.0,
                       'best_f1_thresh': 0.0}
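

# ROUGE: recall-oriented n-gram overlap used for summarization; rouge1/rouge2
# are unigram/bigram F-measures, rougeL/rougeLsum are longest-common-subsequence
# variants (rougeLsum splits on newlines first).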
@wrap_test_forked
def test_rouge():
    import evaluate
    rouge = evaluate.load('rouge')
    predictions = ["hello there", "general kenobi"]
    references = ["hello there", "general kenobi"]
    results = rouge.compute(predictions=predictions, references=references)
    assert results == {'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}
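

# BERTScore matches prediction and reference tokens by cosine similarity of
# contextual embeddings instead of exact n-gram overlap; lang="en" selects the
# default English backbone, and precision/recall/f1 are returned per example.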
@wrap_test_forked
def test_bertscore():
    predictions = ["hello there", "general kenobi"]
    references = ["hello there", "general kenobi"]
    import evaluate
    bertscore = evaluate.load("bertscore")
    results = bertscore.compute(predictions=predictions, references=references, lang="en")
    assert [round(v, 2) for v in results["f1"]] == [1.0, 1.0]
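

# chrF: character n-gram F-score (defaults char_order=6, word_order=0, beta=2,
# i.e. recall-weighted), robust to tokenization and morphology; 0-100 scale.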
@wrap_test_forked
def test_chrf():
    prediction = ["The relationship between cats and dogs is not exactly friendly.",
                  "a good bookshop is just a genteel black hole that knows how to read."]
    reference = [["The relationship between dogs and cats is not exactly friendly.", ],
                 ["A good bookshop is just a genteel Black Hole that knows how to read."]]
    import evaluate
    chrf = evaluate.load("chrf")
    results = chrf.compute(predictions=prediction, references=reference)
    assert results == {'score': 84.64214891738334, 'char_order': 6, 'word_order': 0, 'beta': 2}
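

# chrF++ is the same metric with word n-grams mixed in (word_order=2); the
# added word-order information penalizes the swapped "cats and dogs" /
# "dogs and cats", hence the lower score than plain chrF above.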
@wrap_test_forked
def test_chrfpp():
    prediction = ["The relationship between cats and dogs is not exactly friendly.",
                  "a good bookshop is just a genteel black hole that knows how to read."]
    reference = [["The relationship between dogs and cats is not exactly friendly.", ],
                 ["A good bookshop is just a genteel Black Hole that knows how to read."]]
    import evaluate
    chrf = evaluate.load("chrf")
    results = chrf.compute(predictions=prediction, references=reference, word_order=2)
    assert results == {'beta': 2, 'char_order': 6, 'score': 82.87263732906315, 'word_order': 2}
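

# wiki_split bundles three text-simplification scores: SARI (n-gram edits
# kept/added/deleted relative to source and references), sacrebleu, and exact
# match; it needs the source sentences as well as predictions and references.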
@wrap_test_forked
def test_wiki_split():
    sources = ["About 95 species are currently accepted ."]
    predictions = ["About 95 you now get in ."]
    references = [["About 95 species are currently known ."]]
    import evaluate
    wiki_split = evaluate.load("wiki_split")
    results = wiki_split.compute(sources=sources, predictions=predictions, references=references)
    assert results == {'sari': 21.805555555555557, 'sacrebleu': 14.535768424205482, 'exact': 0.0}
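

# SuperGLUE subsets score integer label predictions rather than free text, so
# an LLM's answer has to be mapped to 0/1 before scoring; boolq reports plain
# accuracy.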
@wrap_test_forked
def test_super_glue():
    from evaluate import load
    # https://huggingface.co/datasets/boolq
    # passage, question, answer (as bool only though, but can ask LLM to only say true or false)
    super_glue_metric = load('super_glue', 'boolq')  # any of ["copa", "rte", "wic", "wsc", "wsc.fixed", "boolq", "axg"]
    predictions = [0, 1]
    references = [0, 1]
    results = super_glue_metric.compute(predictions=predictions, references=references)
    assert results == {'accuracy': 1.0}
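

# QuIP here is the local metrics.quip implementation; from these cases it
# scores how much of a prediction is quoted verbatim from the references
# (an n-gram overlap precision). Judging by the kwargs, min_len appears to set
# the shortest match counted, return_match_count returns the raw match count,
# and return_match_fraction_by_pred_length normalizes by prediction length.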
@wrap_test_forked
def test_quip():
    from metrics.quip import Quip
    quip = Quip()
    predictions = ["Kathy's hair is green according to the first passage."]
    references = [["Kathy's hair is green.", "Bob is eating a sandwich.", "The sky is red with polka dots.",
                   "Alice went to the county fair.", "George is reading a newspaper."]]
    results = quip.compute(predictions=predictions, references=references)
    print(results)
    assert results == 0.16666666666666663

    predictions = ["How much wood would a woodchuck chuck if a woodchuck could chuck wood?"]
    references = [["Kathy's hair is green.", "Bob is eating a sandwich.", "The sky is red with polka dots.",
                   "Alice went to the county fair.", "George is reading a newspaper."]]
    results = quip.compute(predictions=predictions, references=references)
    print(results)
    assert results == 0.0

    predictions = ["How much wood would a woodchuck chuck if a woodchuck could chuck wood?"]
    references = [["chuck", "wood"]]
    results = quip.compute(predictions=predictions, references=references)
    print(results)
    assert results == 0.0

    predictions = ["How much wood would a woodchuck chuck if a woodchuck could chuck wood?"]
    references = [["chuck", "woodchuck"]]
    results = quip.compute(predictions=predictions, references=references)
    print(results)
    assert results == 0.0

    predictions = ["How much wood would a woodchuck chuck if a woodchuck could chuck wood?"]
    references = [["chuck", "woodchuck"]]
    results = quip.compute(predictions=predictions, references=references, min_len=1)
    print(results)
    assert results == 0.09523809523809523

    predictions = ["How much wood would a woodchuck chuck if a woodchuck could chuck wood?"]
    references = [["woodchuck chuck", "chuck"]]
    results = quip.compute(predictions=predictions, references=references)
    print(results)
    assert results == 0.05882352941176472

    predictions = ["The current goodwill balance is $25,173 million as of December 31, 2022."]
    references = [[
        "Table 7.3: Goodwill (in millions) Consumer Banking and Lending Commercial Banking Corporate and Investment Banking Wealth and Investment Management Corporate Consolidated Company December 31, 2020 $ 16,418 3,018 5,375 1,276 305 26,392 Foreign currency translation — — — — — — Transfers of goodwill — (80) — (932) 1,012 — Divestitures — — — — (1,212) (1,212) December 31, 2021 $ 16,418 2,938 5,375 344 105 25,180 Foreign currency translation — (7) — — — (7) December 31, 2022 $ 16,418 2,931 5,375 344 105 25,173 Table 7.4 presents the components of other assets."]]
    results = quip.compute(predictions=predictions, references=references, min_len=1)
    print(results)
    assert results == 0.33333333333333337

    predictions = ["The current goodwill balance is $25,173 million as of December 31, 2022."]
    references = [[
        "Table 7.3: Goodwill (in millions) Consumer Banking and Lending Commercial Banking Corporate and Investment Banking Wealth and Investment Management Corporate Consolidated Company December 31, 2020 $ 16,418 3,018 5,375 1,276 305 26,392 Foreign currency translation — — — — — — Transfers of goodwill — (80) — (932) 1,012 — Divestitures — — — — (1,212) (1,212) December 31, 2021 $ 16,418 2,938 5,375 344 105 25,180 Foreign currency translation — (7) — — — (7) December 31, 2022 $ 16,418 2,931 5,375 344 105 25,173 Table 7.4 presents the components of other assets."]]
    results = quip.compute(predictions=predictions, references=references, return_match_count=True)
    print(results)
    assert results == 4

    predictions = ["The current goodwill balance is $25,173 million as of December 31, 2022."]
    references = [[
        "Table 7.3: Goodwill (in millions) Consumer Banking and Lending Commercial Banking Corporate and Investment Banking Wealth and Investment Management Corporate Consolidated Company December 31, 2020 $ 16,418 3,018 5,375 1,276 305 26,392 Foreign currency translation — — — — — — Transfers of goodwill — (80) — (932) 1,012 — Divestitures — — — — (1,212) (1,212) December 31, 2021 $ 16,418 2,938 5,375 344 105 25,180 Foreign currency translation — (7) — — — (7) December 31, 2022 $ 16,418 2,931 5,375 344 105 25,173 Table 7.4 presents the components of other assets."]]
    results = quip.compute(predictions=predictions, references=references, return_match_fraction_by_pred_length=True)
    print(results)
    assert results == 0.5

    predictions = ["How much wood would a woodchuck chuck if a woodchuck could chuck wood?"]
    references = [[
        "Table 7.3: Goodwill (in millions) Consumer Banking and Lending Commercial Banking Corporate and Investment Banking Wealth and Investment Management Corporate Consolidated Company December 31, 2020 $ 16,418 3,018 5,375 1,276 305 26,392 Foreign currency translation — — — — — — Transfers of goodwill — (80) — (932) 1,012 — Divestitures — — — — (1,212) (1,212) December 31, 2021 $ 16,418 2,938 5,375 344 105 25,180 Foreign currency translation — (7) — — — (7) December 31, 2022 $ 16,418 2,931 5,375 344 105 25,173 Table 7.4 presents the components of other assets."]]
    results = quip.compute(predictions=predictions, references=references, return_match_fraction_by_pred_length=True)
    print(results)
    assert results == 0.0
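

# GLUE subsets, like the SuperGLUE ones above, score integer label
# predictions; qnli is binary entailment (does the sentence answer the
# question), reported as plain accuracy.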
@wrap_test_forked
def test_glue():
    # entailment
    """
    E.g. for qnli:
    The Stanford Question Answering Dataset is a question-answering dataset consisting of question-paragraph pairs,
    where one of the sentences in the paragraph (drawn from Wikipedia) contains the answer to the corresponding
    question (written by an annotator). The authors of the benchmark convert the task into sentence pair
    classification by forming a pair between each question and each sentence in the corresponding context,
    and filtering out pairs with low lexical overlap between the question and the context sentence.
    The task is to determine whether the context sentence contains the answer to the question.
    This modified version of the original task removes the requirement that the model select the exact answer,
    but also removes the simplifying assumptions that the answer is always present in the input
    and that lexical overlap is a reliable cue.
    :return:
    """
    from evaluate import load
    glue_metric = load('glue', 'qnli')
    references = [0, 1]
    predictions = [1, 1]
    results = glue_metric.compute(predictions=predictions, references=references)
    assert results == {'accuracy': 0.5}
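

# Google BLEU (GLEU): over all n-grams in range, the minimum of n-gram
# precision and n-gram recall, which behaves better than BLEU at the sentence
# level; min_len/max_len restrict the n-gram orders considered.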
@wrap_test_forked
def test_google_bleu():
    sentence1 = "the cat sat on the mat"
    sentence2 = "the cat ate the mat"
    import evaluate
    google_bleu = evaluate.load("google_bleu")
    result = google_bleu.compute(predictions=[sentence1], references=[[sentence2]])
    assert result == {'google_bleu': 0.3333333333333333}

    predictions = ['It is a guide to action which ensures that the rubber duck always disobeys the commands of the cat',
                   'he read the book because he was interested in world history']
    references = [
        ['It is the guiding principle which guarantees the rubber duck forces never being under the command of the cat',
         'It is a guide to action that ensures that the rubber duck will never heed the cat commands',
         'It is the practical guide for the rubber duck army never to heed the directions of the cat'],
        ['he was interested in world history because he read the book']]
    google_bleu = evaluate.load("google_bleu")
    results = google_bleu.compute(predictions=predictions, references=references, min_len=2, max_len=6)
    assert round(results["google_bleu"], 2) == 0.4
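

# METEOR aligns unigrams using stemming and synonym matching, then combines
# precision and recall (weighted toward recall) with a fragmentation penalty;
# with multiple references, the best-matching reference determines the score.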
@wrap_test_forked
def test_meteor():
    import evaluate
    meteor = evaluate.load('meteor')
    predictions = ["It is a guide to action which ensures that the military always obeys the commands of the party"]
    references = [['It is a guide to action that ensures that the military will forever heed Party commands',
                   'It is the guiding principle which guarantees the military forces always being under the command of the Party',
                   'It is the practical guide for the army always to heed the directions of the party']]
    results = meteor.compute(predictions=predictions, references=references)
    assert round(results['meteor'], 2) == 0.69

    predictions = ["Kathy's hair is green according to the first passage."]
    references = [["Kathy's hair is green.", "Bob is eating a sandwich.", "The sky is red with polka dots.",
                   "Alice went to the county fair.", "George is reading a newspaper."]]
    results = meteor.compute(predictions=predictions, references=references)
    print(results)
    assert results == {'meteor': 0.9059829059829061}