File size: 3,155 Bytes
83dd7d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81dbb49
83dd7d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import datasets
import evaluate


_CITATION = """\
tba
"""

_DESCRIPTION = """\
Negation-aware version of BLEURT metric.
BLEURT a learnt evaluation metric for Natural Language Generation. It is built using multiple phases of transfer learning starting from a pretrained BERT model (Devlin et al. 2018) and then employing another pre-training phrase using synthetic data. Finally it is trained on WMT human annotations and the CANNOT negation awareness dataset.
"""

_KWARGS_DESCRIPTION = """
Calculates the NegBLEURT scores between references and predictions
Args:
    predictions: list of predictions to score. Each prediction should be a string.
    references: single reference or list of references for each prediction. If only one reference is given, all predictions will be scored against the same reference
    batch_size: batch_size for model inference. Default is 16
Returns:
    negBLEURT: List of NegBLEURT scores for all predictions
Examples:
    >>> negBLEURT = evaluate.load('MiriUll/negbleurt')
    >>> predictions = ["Ray Charles is a legend.", "Ray Charles isn’t legendary."]
    >>> reference = "Ray Charles is legendary."
    >>> results = rouge.compute(predictions=predictions, references=reference)
    >>> print(results)
    {'negBLERUT': [0.8409, 0.2601]}
"""

@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class NegBLEURT(evaluate.Metric):
    """Negation-aware BLEURT metric backed by the tum-nlp/NegBLEURT checkpoint."""

    def _info(self):
        """Return metric metadata.

        Two alternative feature schemas are declared so callers may pass
        either one reference string per prediction or a list of references
        per prediction.
        """
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=[
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Sequence(datasets.Value("string", id="sequence")),
                    }
                ),
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Value("string", id="sequence"),
                    }
                ),
            ],
            codebase_urls=["https://github.com/MiriUll/negation_aware_evaluation"],
        )

    def _download_and_prepare(self, dl_manager):
        """Load the fine-tuned NegBLEURT tokenizer and model once per metric instance."""
        model_name = "tum-nlp/NegBLEURT"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

    def _compute(self, predictions, references, batch_size=16):
        """Score each prediction against its reference with the NegBLEURT model.

        Args:
            predictions: list of prediction strings.
            references: a single reference string (broadcast to every
                prediction) or a sequence with one reference per prediction.
            batch_size: number of (reference, prediction) pairs per model
                forward pass. Default 16.

        Returns:
            dict mapping 'negBLEURT' to the list of regression scores, one
            per prediction, in input order.
        """
        # A lone reference string is scored against every prediction.
        if isinstance(references, str):
            references = [references] * len(predictions)

        scores_negbleurt = []
        for start in range(0, len(references), batch_size):
            end = start + batch_size
            # BUG FIX: the original sliced an undefined name `candidates`
            # (NameError) and wrapped the range in an unimported `tqdm`.
            # The model expects (reference, prediction) sentence pairs.
            tokenized = self.tokenizer(
                references[start:end],
                predictions[start:end],
                return_tensors="pt",
                padding=True,
                max_length=512,
                truncation=True,
            )
            scores_negbleurt += self.model(**tokenized).logits.flatten().tolist()
        return {"negBLEURT": scores_negbleurt}