import datasets
import evaluate
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer


_CITATION = """\
tba
"""

_DESCRIPTION = """\
Negation-aware version of BLEURT metric.
BLEURT a learnt evaluation metric for Natural Language Generation. It is built using multiple phases of transfer learning starting from a pretrained BERT model (Devlin et al. 2018)
and then employing another pre-training phrase using synthetic data. Finally it is trained on WMT human annotations and the CANNOT negation awareness dataset.
"""

_KWARGS_DESCRIPTION = """
Calculates the NegBLEURT scores between references and predictions
Args:
    predictions: list of predictions to score. Each prediction should be a string.
    references: single reference or list of references for each prediction. If only one reference is given, all predictions will be scored against the same reference
    batch_size: batch_size for model inference. Default is 16
Returns:
    negBLEURT: List of NegBLEURT scores for all predictions
Examples:
    >>> negBLEURT = evaluate.load('MiriUll/negbleurt')
    >>> predictions = ["Ray Charles is a legend.", "Ray Charles isn’t legendary."]
    >>> reference = "Ray Charles is legendary."
    >>> results = negBLEURT.compute(predictions=predictions, references=reference)
    >>> print(results)
    {'negBLEURT': [0.8409, 0.2601]}
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class NegBLEURT(evaluate.Metric):
    """Metric that scores predictions against references with the NegBLEURT model."""

    def _info(self):
        """Return the metric metadata and the accepted input schemas.

        Two schemas are declared: one where ``references`` is a sequence of
        strings per prediction, and one where it is a single string.
        """
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=[
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Sequence(datasets.Value("string", id="sequence")),
                    }
                ),
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Value("string", id="sequence"),
                    }
                ),
            ],
            codebase_urls=["https://github.com/MiriUll/negation_aware_evaluation"],
        )

    def _download_and_prepare(self, dl_manager):
        """Load the NegBLEURT tokenizer and model and store them on the instance.

        ``dl_manager`` is part of the hook signature but is not used here; both
        artifacts are fetched through ``from_pretrained``.
        """
        model_name = "tum-nlp/NegBLEURT"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

    def _compute(self, predictions, references, batch_size=16):
        """Score every prediction against its reference.

        Args:
            predictions: list of candidate strings to score.
            references: either one reference string, which is then reused for
                every prediction, or a list with one entry per prediction.
            batch_size: number of (reference, prediction) pairs sent through
                the model per forward pass. Must be at least 1.

        Returns:
            A dict ``{'negBLEURT': scores}`` where ``scores`` is the flattened
            list of model logits, in input order.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1 or if the number of
                references does not match the number of predictions.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        # A single reference string is broadcast to all predictions.
        if isinstance(references, str):
            references = [references] * len(predictions)

        if len(references) != len(predictions):
            raise ValueError(
                "predictions and references must have the same length, got "
                f"{len(predictions)} predictions and {len(references)} references"
            )

        scores_negbleurt = []
        # Pure inference: no gradients are needed, so skip building the autograd graph.
        with torch.no_grad():
            for start in range(0, len(references), batch_size):
                stop = start + batch_size
                # The reference is the first segment and the prediction the second.
                tokenized = self.tokenizer(
                    references[start:stop],
                    predictions[start:stop],
                    return_tensors="pt",
                    padding=True,
                    max_length=512,
                    truncation=True,
                )
                scores_negbleurt += self.model(**tokenized).logits.flatten().tolist()
        return {"negBLEURT": scores_negbleurt}