lvwerra (HF staff) committed
Commit 17c20e5
Parent: c715b54

Update Space (evaluate main: c447fc8e)

Files changed (2):
  1. perplexity.py +10 -29
  2. requirements.txt +1 -1
perplexity.py CHANGED

@@ -13,9 +13,6 @@
 # limitations under the License.
 """Perplexity Metric."""
 
-from dataclasses import dataclass
-from typing import Optional
-
 import datasets
 import numpy as np
 import torch
@@ -87,29 +84,14 @@ Examples:
 """
 
 
-@dataclass
-class PerplexityConfig(evaluate.info.Config):
-
-    name: str = "default"
-
-    batch_size: int = 16
-    model_id: str = "gpt2"
-    add_start_token: bool = True
-    device: Optional[str] = None
-
-
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class Perplexity(evaluate.Measurement):
-    CONFIG_CLASS = PerplexityConfig
-    ALLOWED_CONFIG_NAMES = ["default"]
-
-    def _info(self, config):
+    def _info(self):
         return evaluate.MeasurementInfo(
             module_type="measurement",
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
-            config=config,
             features=datasets.Features(
                 {
                     "data": datasets.Value("string"),
@@ -118,9 +100,8 @@ class Perplexity(evaluate.Measurement):
             reference_urls=["https://huggingface.co/docs/transformers/perplexity"],
         )
 
-    def _compute(self, data):
+    def _compute(self, data, model_id, batch_size: int = 16, add_start_token: bool = True, device=None):
 
-        device = self.config.device
         if device is not None:
             assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
             if device == "gpu":
@@ -128,15 +109,15 @@ class Perplexity(evaluate.Measurement):
         else:
             device = "cuda" if torch.cuda.is_available() else "cpu"
 
-        model = AutoModelForCausalLM.from_pretrained(self.config.model_id)
+        model = AutoModelForCausalLM.from_pretrained(model_id)
         model = model.to(device)
 
-        tokenizer = AutoTokenizer.from_pretrained(self.config.model_id)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
 
         # if batch_size > 1 (which generally leads to padding being required), and
         # if there is not an already assigned pad_token, assign an existing
         # special token to also be the padding token
-        if tokenizer.pad_token is None and self.config.batch_size > 1:
+        if tokenizer.pad_token is None and batch_size > 1:
             existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())
             # check that the model already has at least one special token defined
             assert (
@@ -145,7 +126,7 @@ class Perplexity(evaluate.Measurement):
             # assign one of the special tokens to also be the pad token
             tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})
 
-        if self.config.add_start_token:
+        if add_start_token:
             # leave room for <BOS> token to be added:
             assert (
                 tokenizer.bos_token is not None
@@ -168,7 +149,7 @@ class Perplexity(evaluate.Measurement):
         attn_masks = encodings["attention_mask"]
 
         # check that each input is long enough:
-        if self.config.add_start_token:
+        if add_start_token:
             assert torch.all(torch.ge(attn_masks.sum(1), 1)), "Each input text must be at least one token long."
         else:
             assert torch.all(
@@ -178,12 +159,12 @@ class Perplexity(evaluate.Measurement):
         ppls = []
         loss_fct = CrossEntropyLoss(reduction="none")
 
-        for start_index in logging.tqdm(range(0, len(encoded_texts), self.config.batch_size)):
-            end_index = min(start_index + self.config.batch_size, len(encoded_texts))
+        for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)):
+            end_index = min(start_index + batch_size, len(encoded_texts))
             encoded_batch = encoded_texts[start_index:end_index]
             attn_mask = attn_masks[start_index:end_index]
 
-            if self.config.add_start_token:
+            if add_start_token:
                 bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_batch.size(dim=0)).to(device)
                 encoded_batch = torch.cat([bos_tokens_tensor, encoded_batch], dim=1)
                 attn_mask = torch.cat(
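
Note: the last hunk is cut off by the diff context at `attn_mask = torch.cat(`. For orientation only, here is a hedged, self-contained sketch of the computation the batch loop is building toward: the per-token `CrossEntropyLoss(reduction="none")` from the hunk is masked with the attention mask, averaged per sequence, and exponentiated. The helper name `perplexity_from_logits` is mine and is not part of this commit.

    import torch
    from torch.nn import CrossEntropyLoss

    def perplexity_from_logits(logits: torch.Tensor, labels: torch.Tensor, attn_mask: torch.Tensor) -> torch.Tensor:
        """Per-sequence perplexity: exp of the attention-masked mean token cross-entropy.

        logits: (batch, seq_len, vocab); labels and attn_mask: (batch, seq_len).
        """
        loss_fct = CrossEntropyLoss(reduction="none")
        # shift so that tokens < n predict token n
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        shift_mask = attn_mask[..., 1:].contiguous()
        # mask padded positions, average the negative log-likelihood per sequence, exponentiate
        nll = (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_mask).sum(1) / shift_mask.sum(1)
        return torch.exp(nll)
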
requirements.txt CHANGED

@@ -1,3 +1,3 @@
-git+https://github.com/huggingface/evaluate@e4a2724377909fe2aeb4357e3971e5a569673b39
+git+https://github.com/huggingface/evaluate@c447fc8eda9c62af501bfdc6988919571050d950
 torch
 transformers
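
For context on the interface change above, a minimal usage sketch of the new kwargs-based signature, assuming the pinned evaluate revision is installed; the input texts and `gpt2` model are illustrative, and the result keys are assumed rather than shown in this diff.

    import evaluate

    # Load the perplexity measurement module.
    perplexity = evaluate.load("perplexity", module_type="measurement")

    results = perplexity.compute(
        data=["lorem ipsum", "Happy Birthday!", "Bienvenue"],
        model_id="gpt2",        # previously PerplexityConfig.model_id
        batch_size=16,          # previously PerplexityConfig.batch_size
        add_start_token=True,   # previously PerplexityConfig.add_start_token
    )
    print(results)  # assumed keys: "perplexities" (one per input) and "mean_perplexity"
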