Elron committed on
Commit
2a86d9a
1 Parent(s): 3a3d1b3

Upload metrics.py with huggingface_hub

Files changed (1)
  1. metrics.py +35 -5
metrics.py CHANGED
@@ -4,7 +4,6 @@ import uuid
 from abc import ABC, abstractmethod
 from collections import Counter
 from dataclasses import field
-from statistics import mean
 from typing import Any, Dict, Generator, List, Optional, Tuple
 
 import evaluate
@@ -1360,9 +1359,12 @@ class Perplexity(BulkInstanceMetric):
                 instance_scores_list.append(scores[index])
                 index += 1
             instance_scores["reference_scores"] = instance_scores_list
-            instance_scores[self.main_score] = mean(instance_scores_list)
 
-            instance_scores[self.main_score] = mean(instance_scores_list)
+            # max seems more useful than mean for common use cases like
+            # context relevance, where what we want to know is if there
+            # is at least one good result in the context. Using mean will
+            # bring the score down due to bad contexts at the tail.
+            instance_scores[self.main_score] = max(instance_scores_list)
             all_instances_scores.append(instance_scores)
 
         return all_instances_scores
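A minimal sketch of the aggregation change above, with hypothetical per-reference scores (the values are illustrative, not from this commit): max preserves the signal of a single good context that mean would dilute.

```python
from statistics import mean

# Hypothetical per-reference scores: one strong context, two weak ones.
instance_scores_list = [0.9, 0.1, 0.05]

print(mean(instance_scores_list))  # ~0.35: the weak tail drags the score down
print(max(instance_scores_list))   # 0.9: "is there at least one good context?"
```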
@@ -1405,11 +1407,18 @@ class Perplexity(BulkInstanceMetric):
                     tokens_source, tokens_target
                 )
 
+                # logits is a tensor of size: batch_size * len(target) * vocab_size
+                # because for each example in the batch, the model predicted the
+                # logit at every position in the target, for every vocab item.
+
                 # the model returns mean over all batch. We run the CE again without reduction
-                # and extarct the mean for each document
+                # and extract the mean for each document
                 loss_fct = torch.nn.CrossEntropyLoss(
                     ignore_index=-100, reduction="none"
                 )
+
+                # logits.size(-1) = the dimension of the vocabulary
+                # labels.view(-1) = flattens the labels tensor to 1d
                 loss = loss_fct(
                     logits.view(-1, logits.size(-1)), labels.view(-1)
                 )
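The reduction="none" pattern in this hunk can be sketched in isolation. The shapes, the ignore_index=-100 padding convention, and the final division are assumptions for illustration, not the repository's exact tensors:

```python
import torch

batch_size, seq_len, vocab_size = 2, 7, 50
logits = torch.randn(batch_size, seq_len, vocab_size)   # per-position logits
labels = torch.randint(0, vocab_size, (batch_size, seq_len))
labels[0, 5:] = -100  # hypothetical padding positions, ignored by the loss

# reduction="none" keeps one loss value per token instead of a single batch mean
loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction="none")
loss = loss_fct(
    logits.view(-1, logits.size(-1)),  # (batch*seq, vocab)
    labels.view(-1),                   # (batch*seq,)
).view(batch_size, seq_len)

# ignored positions contribute 0, so a per-document mean divides by the
# count of real (non-padding) tokens in each row
batch_loss = loss.sum(dim=1) / (labels != -100).sum(dim=1)
```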
@@ -1420,8 +1429,29 @@ class Perplexity(BulkInstanceMetric):
                     labels > 0, dim=1
                 )
 
+                # e^-average(cross-entropy-loss(logits)) == geometric mean of the probabilities
+                # proof:
+                # * CE-loss of logits is computed by transforming the logits to
+                #   probabilities by softmax, and then -log(p) is returned, where
+                #   p is the probability of the gold label.
+                # * Averaging the CE loss is computed by summing over -log(p) and
+                #   then dividing by the length of the gold labels.
+                # * Thus, pr_score = (-log(p_1) + ... + -log(p_n)) / n
+                #                  = -log(p_1 * ... * p_n) * 1/n
+                # * Therefore,
+                #   e^(-pr_score) = e^(log(p_1 * ... * p_n) * 1/n)
+                #                 = (e^(log(p_1 * ... * p_n))) ^ 1/n
+                #                 = (p_1 * ... * p_n) ^ 1/n
+                #                 = geometric mean of [p_1, ..., p_n]
+                #
+                # in principle we could have computed the geometric mean directly over the
+                # probabilities instead of e^(average cross entropy loss of the logits),
+                # but the current approach is more stable numerically. See for example:
+                # https://stackoverflow.com/questions/59722983/how-to-calculate-geometric-mean-in-a-differentiable-way
+                geometric_mean = (-batch_loss).exp()
+
                 # append the batch scores to the list of all scores
-                scores.append(batch_loss)
+                scores.append(geometric_mean)
 
             return torch.cat(scores, dim=0).tolist()
 
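The geometric-mean identity in the comment block is easy to verify numerically. A self-contained check (all names and sizes here are illustrative): exp of the negative mean cross-entropy equals the geometric mean of the gold-token probabilities.

```python
import torch

n, vocab_size = 5, 10
logits = torch.randn(n, vocab_size)
labels = torch.randint(0, vocab_size, (n,))

# mean cross-entropy = (1/n) * sum_i -log(p_i)
ce = torch.nn.functional.cross_entropy(logits, labels)

# p_i = softmax probability assigned to the gold label at position i
probs = logits.softmax(dim=-1)[torch.arange(n), labels]

assert torch.allclose((-ce).exp(), probs.prod() ** (1.0 / n))
```

For long sequences the direct product underflows toward zero, which is why staying in log space, as the diff does, is the numerically stable route.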
 
 