Elron committed
Commit 5c531b1
1 Parent(s): 7e18aa8

Upload metrics.py with huggingface_hub

Files changed (1):
  metrics.py (+111, -34)
metrics.py CHANGED
@@ -1879,6 +1879,7 @@ class BertScore(HuggingfaceBulkMetric):
     hf_metric_fields = ["f1", "precision", "recall"]
     ci_scores = ["f1", "precision", "recall"]
     model_name: str
+    model_layer: int = None

     prediction_type = "str"

@@ -1886,7 +1887,9 @@ class BertScore(HuggingfaceBulkMetric):

     def prepare(self):
         super().prepare()
-        self.hf_compute_args = {"model_type": self.model_name, "batch_size": 16}
+        self.hf_compute_args = {"model_type": self.model_name, "batch_size": 32}
+        if self.model_layer:
+            self.hf_compute_args["num_layers"] = self.model_layer


 class SentenceBert(BulkInstanceMetric):
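BertScore's `hf_compute_args` are handed to the underlying Hugging Face `evaluate` metric, so the change above amounts to a larger default batch plus an optional layer override. A minimal standalone sketch of the equivalent direct call (assuming the `evaluate` library; the model and layer values are arbitrary examples, not taken from this commit):

    import evaluate

    bertscore = evaluate.load("bertscore")
    results = bertscore.compute(
        predictions=["the cat sat on the mat"],
        references=["a cat was sitting on the mat"],
        model_type="microsoft/deberta-xlarge-mnli",  # plays the role of model_name
        num_layers=40,  # plays the role of model_layer, passed only when set
        batch_size=32,
    )
    print(results["f1"], results["precision"], results["recall"])  # lists, one entry per pair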
@@ -1947,6 +1950,9 @@ class Reward(BulkInstanceMetric):

     model_name: str

+    prediction_type = "str"
+    single_reference_per_prediction = True
+
     _requirements_list: List[str] = ["transformers", "torch"]

     def prepare(self):
@@ -2141,9 +2147,13 @@ class Perplexity(BulkInstanceMetric):
     reduction_map = {"mean": ["perplexity"]}
     prediction_type = "str"

-    perplexity_prompt: str
+    source_template: str
+    target_template: str
     batch_size: int = 32
     model_name: str
+    single_token_mode: bool = False
+
+    lm = None

     _requirements_list: List[str] = ["transformers", "torch"]
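The two template fields replace the single `perplexity_prompt` and are filled per instance by the `Template.apply` helper added further down. A hypothetical configuration sketch (the keyword-style construction and the concrete wording are illustrative assumptions, not taken from this commit):

    # Sketch only: the {reference}/{prediction} placeholders follow Template.apply below.
    perplexity = Perplexity(
        model_name="google/flan-t5-small",
        source_template="Generate text based on the following topic: {reference}",
        target_template="{prediction}",
        single_token_mode=False,  # keep special tokens on the target side
    )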
 
@@ -2160,24 +2170,41 @@ class Perplexity(BulkInstanceMetric):

         :return: the likelihood of generating text Y_i after each text X_i_j = P(Y_i|X_i_1), ..., P(Y_i|X_i_n) for every i.
         """
+        if self.lm is None:
+            from transformers import AutoConfig
+
+            config = AutoConfig.from_pretrained(self.model_name, trust_remote_code=True)
+            self.lm = (
+                self.EncoderDecoderLM(
+                    model_name=self.model_name, single_token_mode=self.single_token_mode
+                )
+                if config.is_encoder_decoder is True
+                else self.DecoderOnlyLM(
+                    model_name=self.model_name, single_token_mode=self.single_token_mode
+                )
+            )
+
         sources = []
         targets = []
         for prediction, instance_references in zip(predictions, references):
             for instance_reference in instance_references:
-                sources.append(f"{self.perplexity_prompt} {instance_reference}")
-                targets.append(prediction)
-
-        from transformers import AutoConfig
-
-        config = AutoConfig.from_pretrained(self.model_name, trust_remote_code=True)
-        lm = (
-            self.EncoderDecoderLM(model_name=self.model_name)
-            if config.is_encoder_decoder is True
-            else self.DecoderOnlyLM(model_name=self.model_name)
-        )
+                sources.append(
+                    self.Template.apply(
+                        self.source_template,
+                        prediction=prediction,
+                        reference=instance_reference,
+                    )
+                )
+                targets.append(
+                    self.Template.apply(
+                        self.target_template,
+                        prediction=prediction,
+                        reference=instance_reference,
+                    )
+                )

         # compute P(Q|P) and store in queue
-        scores = lm.compute_lm(
+        scores = self.lm.compute_lm(
             source=sources, target=targets, batch_size=self.batch_size
         )
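The lazy `self.lm` initialization picks the wrapper class from the model's configuration alone. A small standalone sketch of that check (model names are arbitrary examples):

    from transformers import AutoConfig

    # AutoConfig fetches only the config file, so this probe is cheap.
    for name in ["google/flan-t5-small", "gpt2"]:
        config = AutoConfig.from_pretrained(name)
        kind = "encoder-decoder" if config.is_encoder_decoder else "decoder-only"
        print(name, "->", kind)  # flan-t5 -> encoder-decoder, gpt2 -> decoder-only

Caching the instance on `self.lm` also means repeated `compute` calls reuse the loaded model instead of reloading it each time.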
 
@@ -2200,8 +2227,25 @@ class Perplexity(BulkInstanceMetric):

         return all_instances_scores

+    class Template:
+        regex = re.compile(r"\{(\w+)}")
+
+        @classmethod
+        def apply(cls, template, **kwargs):
+            matches = Perplexity.Template.regex.finditer(template)
+            output = []
+            cursor = 0
+            for match in matches:
+                start = match.start()
+                end = match.end()
+                output.append(template[cursor:start])
+                output.append(kwargs[match.group(1)])
+                cursor = end
+            output.append(template[cursor:])
+            return "".join(output)
+
     class AbstractLM(ABC):
-        def __init__(self, model_name):
+        def __init__(self, model_name, single_token_mode):
             import torch
             from transformers import AutoTokenizer
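`Template.apply` is plain regex splicing: each `{name}` placeholder is replaced by the matching keyword argument, extra kwargs are ignored, and a placeholder with no matching kwarg raises `KeyError`. For example:

    filled = Perplexity.Template.apply(
        "Generate a question about: {reference}",
        reference="the French Revolution",
        prediction="unused here",  # kwargs without a placeholder are dropped silently
    )
    assert filled == "Generate a question about: the French Revolution"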
 
@@ -2211,6 +2255,7 @@ class Perplexity(BulkInstanceMetric):
                 self.model_class().from_pretrained(self.model_name).to(self.device)
             )
             self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+            self.single_token_mode = single_token_mode

         def compute_lm(
             self, source: List[str], target: List[str], batch_size: int
@@ -2232,7 +2277,10 @@ class Perplexity(BulkInstanceMetric):
                         batch_source, padding=True, return_tensors="pt"
                     )
                     tokens_target = self.tokenizer(
-                        batch_target, padding=True, return_tensors="pt"
+                        batch_target,
+                        padding=True,
+                        return_tensors="pt",
+                        add_special_tokens=not self.single_token_mode,
                     )

                     # compute the logits
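In `single_token_mode`, target tokenization skips special tokens so that a one-word target really is a single token. A sketch of the difference (the model is an arbitrary example; exact token ids depend on the tokenizer):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("google/flan-t5-small")
    tok("yes", add_special_tokens=True).input_ids   # word id plus a trailing </s> id
    tok("yes", add_special_tokens=False).input_ids  # word id only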
@@ -3353,7 +3401,7 @@ class BinaryMaxAccuracy(GlobalMetric):
     def compute(
         self,
         references: List[List[str]],
-        predictions: List[List[str]],
+        predictions: List[str],
         task_data: List[Dict],
     ) -> dict:
         float_predictions = [to_float_or_default(p) for p in predictions]
@@ -3361,24 +3409,53 @@ class BinaryMaxAccuracy(GlobalMetric):
             ["1"] if r[0].lower() in self.pos_classes else ["0"] for r in references
         ]

-        best_thr = -1
-        best_acc = -1
-        for thr in set(float_predictions):
-            new_predictions = [
-                "1" if float_prediction >= thr else "0"
-                for float_prediction in float_predictions
-            ]
-            acc = np.mean(
-                [
-                    [prediction] == reference
-                    for prediction, reference in zip(new_predictions, references)
-                ]
-            )
-            if acc > best_acc:
-                best_acc = acc
-                best_thr = thr
+        # Sticking to the test >= thr, the accuracy induced by threshold thr is the number of float
+        # predictions that pass the test (are >= thr) and are paired with reference "1", plus the number
+        # of float predictions that fail the test (are < thr) and are paired with reference "0".
+        # A given threshold thr induces the same partition of the float predictions into passing and
+        # failing as threshold thr' does, with thr' being the smallest prediction passing the test of thr.
+        # Hence, we only need to review thresholds that are float predictions, plus one threshold larger
+        # than the largest float prediction, to induce the partition into all-failing, none-passing.
+
+        fp = [
+            (float_predictions[i], i, -1 if references[i][0] == "1" else +1)
+            for i in range(len(float_predictions))
+        ]
+        fp.sort()
+        # each triplet above: float-prediction f; f's ordinal position in float_predictions, which is
+        # also a means to obtain distinct triplets; and: the change in the number of predictions that the
+        # test sends to the reference they are paired with, a change implied by a move of thr that
+        # transfers f from the set passing the test to the set failing it.
+
+        rightmost_thr = 1.0 if fp[-1][0] < 1 else fp[-1][0] + 0.01
+        # trying to be esthetic, have the threshold within [0,1], although this is not a requirement,
+        # and even the float predictions are not guaranteed to be within the range [0,1]
+
+        current_thr = fp[0][0]
+        # partition float_predictions into all-passing, none-failing
+        current_acc = sum(r[0] == "1" for r in references)
+        # number of predictions that thr sends to the reference they are paired with
+
+        best_acc = current_acc
+        best_thr = current_thr
+
+        i = 0
+        while (i < len(predictions)) and (best_acc < len(predictions)):
+            # best_acc cannot exceed len(predictions)
+            delta = fp[i][2]
+            i += 1
+            while i < len(predictions) and fp[i][0] <= fp[i - 1][0]:
+                delta += fp[i][2]
+                i += 1
+            current_acc += delta
+            if current_acc > best_acc:
+                best_acc = current_acc
+                best_thr = fp[i][0] if i < len(predictions) else rightmost_thr

-        return {self.main_score: best_acc, "best_thr_max_acc": best_thr}
+        return {
+            self.main_score: float(best_acc) / len(predictions),
+            "best_thr_max_acc": best_thr,
+        }


 ######################
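The rewritten sweep sorts once and updates the accuracy incrementally while walking the distinct prediction values, replacing the old quadratic loop that re-scored every candidate threshold; the `best_acc < len(predictions)` guard also stops early once a perfect split is found. A toy brute-force check of the same maximization (data invented for illustration):

    preds = [0.2, 0.8, 0.4, 0.9]
    refs = ["0", "1", "0", "1"]

    # try every candidate threshold the old way: O(n^2)
    best_acc, best_thr = max(
        (sum(("1" if p >= thr else "0") == r for p, r in zip(preds, refs)), thr)
        for thr in set(preds)
    )
    assert (best_acc / len(preds), best_thr) == (1.0, 0.8)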
 