Spaces:

ronaldahmed
/

nwentfaithfulness

Runtime error

App Files Files Community

ronald cardenas acosta committed on Oct 9, 2022

Commit

141eb78

•

1 Parent(s): 40166c5

batching

Browse files

Files changed (2) hide show

app.py +2 -2
nwentfaithfulness.py +70 -21

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import evaluate
 from evaluate.utils import launch_gradio_widget
-module = evaluate.load("ronaldahmed/nwentfaithfulness")
 launch_gradio_widget(module)

 import evaluate
 from evaluate.utils import launch_gradio_widget
+METRICS_CACHE_DIR="/gfs/team/nlp/users/rcardena/tools/huggingface/evaluate"
+module = evaluate.load("nwentfaithfulness",module_type="metric",cache_dir=METRICS_CACHE_DIR)
 launch_gradio_widget(module)

nwentfaithfulness.py CHANGED Viewed

@@ -15,20 +15,25 @@
 import evaluate
 import datasets
 # TODO: Add BibTeX citation
 _CITATION = """\
-@InProceedings{huggingface:module,
-title = {A great new module},
-authors={huggingface, Inc.},
-year={2020}
 }
 """
 # TODO: Add description of the module here
 _DESCRIPTION = """\
-This new module is designed to solve this great ML task and is crafted with a lot of care.
 """
@@ -36,13 +41,12 @@ This new module is designed to solve this great ML task and is crafted with a lo
 _KWARGS_DESCRIPTION = """
 Calculates how good are predictions given some references, using certain scores
 Args:
-    predictions: list of predictions to score. Each predictions
-        should be a string with tokens separated by spaces.
-    references: list of reference for each prediction. Each
-        reference should be a string with tokens separated by spaces.
 Returns:
-    accuracy: description of the first score,
-    another_score: description of the second score,
 Examples:
     Examples should be written in doctest format, and should illustrate how
     to use the function.
@@ -50,11 +54,11 @@ Examples:
     >>> my_new_module = evaluate.load("my_new_module")
     >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
     >>> print(results)
-    {'accuracy': 1.0}
 """
 # TODO: Define external resources urls if needed
-BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
@@ -75,7 +79,7 @@ class NwEntFaithfulness(evaluate.Metric):
                 'references': datasets.Value('int64'),
             }),
             # Homepage of the module for documentation
-            homepage="http://module.homepage",
             # Additional links to the codebase or references
             codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
             reference_urls=["http://path.to.reference.url/new_module"]
@@ -86,10 +90,55 @@ class NwEntFaithfulness(evaluate.Metric):
         # TODO: Download external resources if needed
         pass
-    def _compute(self, predictions, references):
-        """Returns the scores"""
-        # TODO: Compute the different scores of the module
-        accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
-        return {
-            "accuracy": accuracy,
-        }

 import evaluate
 import datasets
+import numpy as np
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+import evaluate
+from evaluate import logging
 # TODO: Add BibTeX citation
 _CITATION = """\
 }
 """
 # TODO: Add description of the module here
 _DESCRIPTION = """\
+This metric quantifies the faithfulness of a summary with respect to a source document,
+as given by the probability that the document is entailed by the summary.
+This metric uses pretrained models apt for the Newswire domain (see ScEntFaithfulness
+for a version in scientific domain).
 """
 _KWARGS_DESCRIPTION = """
 Calculates how good are predictions given some references, using certain scores
 Args:
+    predictions: list of predictions to score. Each prediction represents a summary and
+        should be a string with tokens separated by spaces
+    references: list of references for each prediction. Each
+        reference represents the input document and should be a string with tokens separated by spaces.
 Returns:
+    ent-faith: list with one entailment probability per (prediction, reference) pair,
+    mean_ent-faith: mean of the `ent-faith` probabilities,
 Examples:
     Examples should be written in doctest format, and should illustrate how
     to use the function.
     >>> my_new_module = evaluate.load("my_new_module")
     >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
     >>> print(results)
+    {'ent-faith': 1.0}
 """
 # TODO: Define external resources urls if needed
+# BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
                 'references': datasets.Value('int64'),
             }),
             # Homepage of the module for documentation
+            homepage="https://huggingface.co/spaces/ronaldahmed/nwentfaithfulness",
             # Additional links to the codebase or references
             codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
             reference_urls=["http://path.to.reference.url/new_module"]
         # TODO: Download external resources if needed
         pass
+    # original: references
+    def _compute(self, predictions, documents,
+                    batch_size: int = 16, device=None):
+        MODEL_CACHE_DIR="/gfs/team/nlp/users/rcardena/tools/huggingface"
+        if device is not None:
+            assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
+            if device == "gpu":
+                device = "cuda"
+        else:
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+        model = AutoModelForSequenceClassification.from_pretrained(
+                        "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli",
+                        cache_dir=MODEL_CACHE_DIR)
+        model = model.to(device)
+        tokenizer = AutoTokenizer.from_pretrained(
+                            "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli",
+                            cache_dir=MODEL_CACHE_DIR)
+        max_tokenized_len = model.config.max_length | 256
+        encoded_texts = []
+        attn_masks = []
+        tok_types = []
+        for pred,doc in zip(predictions,documents):
+            enc = tokenizer.encode_plus(pred, doc,
+                                        max_length=max_tokenized_len,
+                                        padding=True,
+                                        truncation=True,
+                                        return_token_type_ids=True,
+                                        return_attention_mask=True)
+            encoded_texts.append(enc["input_ids"])
+            attn_masks.append(enc["attention_mask"])
+            tok_types.append(enc["token_type_ids"])
+        enf_fs = []
+        for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)):
+            end_index = min(start_index + batch_size, len(encoded_texts))
+            encoded_batch = torch.Long(encoded_texts[start_index:end_index]).to(device)
+            attn_mask = torch.Long(attn_masks[start_index:end_index]).to(device)
+            token_type = torch.Long(tok_types[start_index:end_index]).to(device)
+            with torch.no_grad():
+                outputs = model(encoded_batch,
+                                attention_mask=attn_mask,
+                                token_type_ids=token_type,
+                                labels=None)[0]
+                probs = torch.softmax(outputs,dim=1)[:,0].tolist()
+            enf_fs += probs
+        return {"ent-faith": enf_fs, "mean_ent-faith": np.mean(enf_fs)}