Spaces:

evaluate-metric
/

perplexity

Running

App Files Community

perplexity

by awais126 - opened Aug 17, 2022

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+13

-15

This PR is in draft mode

Files changed (3) hide show

README.md +2 -2
perplexity.py +10 -12
requirements.txt +1 -1

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 🤗
 colorFrom: blue
 colorTo: red
 sdk: gradio
-sdk_version: 3.19.1
 app_file: app.py
 pinned: false
 tags:
@@ -73,7 +73,7 @@ results = perplexity.compute(model_id='gpt2',
 print(list(results.keys()))
 >>>['perplexities', 'mean_perplexity']
 print(round(results["mean_perplexity"], 2))
->>>646.75
 print(round(results["perplexities"][0], 2))
 >>>32.25
 ```

 colorFrom: blue
 colorTo: red
 sdk: gradio
+sdk_version: 3.0.2
 app_file: app.py
 pinned: false
 tags:
 print(list(results.keys()))
 >>>['perplexities', 'mean_perplexity']
 print(round(results["mean_perplexity"], 2))
+>>>646.74
 print(round(results["perplexities"][0], 2))
 >>>32.25
 ```

perplexity.py CHANGED Viewed

@@ -63,10 +63,10 @@ Examples:
         ...                              predictions=input_texts) # doctest:+ELLIPSIS
         >>> print(list(results.keys()))
         ['perplexities', 'mean_perplexity']
-        >>> print(round(results["mean_perplexity"], 0))
-        647.0
-        >>> print(round(results["perplexities"][0], 0))
-        32.0
     Example 2:
         >>> from datasets import load_dataset
@@ -100,9 +100,7 @@ class Perplexity(evaluate.Metric):
             reference_urls=["https://huggingface.co/docs/transformers/perplexity"],
         )
-    def _compute(
-        self, predictions, model_id, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None
-    ):
         if device is not None:
             assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
@@ -128,20 +126,20 @@ class Perplexity(evaluate.Metric):
             # assign one of the special tokens to also be the pad token
             tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})
-        if add_start_token and max_length:
             # leave room for <BOS> token to be added:
             assert (
                 tokenizer.bos_token is not None
             ), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False"
-            max_tokenized_len = max_length - 1
         else:
-            max_tokenized_len = max_length
         encodings = tokenizer(
             predictions,
             add_special_tokens=False,
             padding=True,
-            truncation=True if max_tokenized_len else False,
             max_length=max_tokenized_len,
             return_tensors="pt",
             return_attention_mask=True,
@@ -166,7 +164,7 @@ class Perplexity(evaluate.Metric):
             encoded_batch = encoded_texts[start_index:end_index]
             attn_mask = attn_masks[start_index:end_index]
-            if add_start_token and tokenizer.bos_token_id is not None:
                 bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_batch.size(dim=0)).to(device)
                 encoded_batch = torch.cat([bos_tokens_tensor, encoded_batch], dim=1)
                 attn_mask = torch.cat(

         ...                              predictions=input_texts) # doctest:+ELLIPSIS
         >>> print(list(results.keys()))
         ['perplexities', 'mean_perplexity']
+        >>> print(round(results["mean_perplexity"], 2))
+        78.22
+        >>> print(round(results["perplexities"][0], 2))
+        11.11
     Example 2:
         >>> from datasets import load_dataset
             reference_urls=["https://huggingface.co/docs/transformers/perplexity"],
         )
+    def _compute(self, predictions, model_id, batch_size: int = 16, add_start_token: bool = True, device=None):
         if device is not None:
             assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
             # assign one of the special tokens to also be the pad token
             tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})
+        if add_start_token:
             # leave room for <BOS> token to be added:
             assert (
                 tokenizer.bos_token is not None
             ), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False"
+            max_tokenized_len = model.config.max_length - 1
         else:
+            max_tokenized_len = model.config.max_length
         encodings = tokenizer(
             predictions,
             add_special_tokens=False,
             padding=True,
+            truncation=True,
             max_length=max_tokenized_len,
             return_tensors="pt",
             return_attention_mask=True,
             encoded_batch = encoded_texts[start_index:end_index]
             attn_mask = attn_masks[start_index:end_index]
+            if add_start_token:
                 bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_batch.size(dim=0)).to(device)
                 encoded_batch = torch.cat([bos_tokens_tensor, encoded_batch], dim=1)
                 attn_mask = torch.cat(

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-git+https://github.com/huggingface/evaluate@7c4656a407213b71cb7e6f6634b7935c18f5140d
 torch
 torch
 transformers

+git+https://github.com/huggingface/evaluate@4487d9d1e65216a36b4aa94e3396a570f44a1525
 torch
 torch
 transformers