r"""
This module is a collection of metrics commonly used during pretraining and
downstream evaluation. Two main classes here are:
- :class:`TopkAccuracy` used for ImageNet linear classification evaluation.
- :class:`CocoCaptionsEvaluator` used for caption evaluation (CIDEr and SPICE).
Parts of this module (:func:`tokenize`, :func:`cider` and :func:`spice`) are
adapted from `coco-captions evaluation code <https://github.com/tylin/coco-caption>`_.
"""
from collections import defaultdict
import json
import os
from subprocess import Popen, PIPE, check_call
import tempfile
from typing import Any, Dict, List
import numpy as np
import torch
class TopkAccuracy(object):
r"""
An accumulator for Top-K classification accuracy. This accumulates per-batch
    accuracy during training/validation, which can be retrieved at the end. Assumes
integer labels and predictions.
.. note::
If used in :class:`~torch.nn.parallel.DistributedDataParallel`, results
need to be aggregated across GPU processes outside this class.
Parameters
----------
top_k: int, optional (default = 1)
``k`` for computing Top-K accuracy.
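    Examples
    --------
    A minimal usage sketch; the tensors below are illustrative::

        acc = TopkAccuracy(top_k=5)
        logits = torch.randn(4, 10)           # (batch_size, num_classes)
        labels = torch.randint(0, 10, (4, ))  # (batch_size, )
        acc(logits, labels)
        top5_accuracy = acc.get_metric(reset=True)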
"""
def __init__(self, top_k: int = 1):
self._top_k = top_k
self.reset()
def reset(self):
r"""Reset counters; to be used at the start of new epoch/validation."""
self.num_total = 0.0
self.num_correct = 0.0
def __call__(self, predictions: torch.Tensor, ground_truth: torch.Tensor):
r"""
Update accumulated accuracy using the current batch.
Parameters
----------
        predictions: torch.Tensor
            Predicted logits or log-probabilities of shape
            ``(batch_size, num_classes)``.
        ground_truth: torch.Tensor
            A tensor of shape ``(batch_size, )``, an integer label per example.
"""
if self._top_k == 1:
top_k = predictions.max(-1)[1].unsqueeze(-1)
else:
top_k = predictions.topk(min(self._top_k, predictions.shape[-1]), -1)[1]
correct = top_k.eq(ground_truth.unsqueeze(-1)).float()
self.num_total += ground_truth.numel()
self.num_correct += correct.sum()
def get_metric(self, reset: bool = False):
r"""Get accumulated accuracy so far (and optionally reset counters)."""
if self.num_total > 1e-12:
accuracy = float(self.num_correct) / float(self.num_total)
else:
accuracy = 0.0
if reset:
self.reset()
return accuracy
class CocoCaptionsEvaluator(object):
r"""A helper class to evaluate caption predictions in COCO format. This uses
    :func:`cider` and :func:`spice`, which exactly follow the original COCO
    Captions evaluation protocol.
Parameters
----------
gt_annotations_path: str
Path to ground truth annotations in COCO format (typically this would
be COCO Captions ``val2017`` split).
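    Examples
    --------
    A hypothetical sketch; the annotation path and prediction are illustrative::

        evaluator = CocoCaptionsEvaluator("annotations/captions_val2017.json")
        predictions = [{"image_id": 42, "caption": "a dog runs on the beach"}]
        metrics = evaluator.evaluate(predictions)  # {"CIDEr": ..., "SPICE": ...}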
"""
def __init__(self, gt_annotations_path: str):
gt_annotations = json.load(open(gt_annotations_path))["annotations"]
# Keep a mapping from image id to a list of captions.
self.ground_truth: Dict[int, List[str]] = defaultdict(list)
for ann in gt_annotations:
self.ground_truth[ann["image_id"]].append(ann["caption"])
self.ground_truth = tokenize(self.ground_truth)
def evaluate(self, preds: List[Dict[str, Any]]) -> Dict[str, float]:
r"""Compute CIDEr and SPICE scores for predictions.
Parameters
----------
        preds: List[Dict[str, Any]]
            List of per-instance predictions in COCO Captions format:
            ``[ {"image_id": int, "caption": str} ...]``. A path to a JSON
            file containing such a list is also accepted.
Returns
-------
Dict[str, float]
Computed metrics; a dict with keys ``{"CIDEr", "SPICE"}``.
"""
if isinstance(preds, str):
preds = json.load(open(preds))
res = {ann["image_id"]: [ann["caption"]] for ann in preds}
res = tokenize(res)
# Remove IDs from predictions which are not in GT.
common_image_ids = self.ground_truth.keys() & res.keys()
res = {k: v for k, v in res.items() if k in common_image_ids}
# Add dummy entries for IDs absent in preds, but present in GT.
for k in self.ground_truth:
res[k] = res.get(k, [""])
cider_score = cider(res, self.ground_truth)
spice_score = spice(res, self.ground_truth)
return {"CIDEr": 100 * cider_score, "SPICE": 100 * spice_score}
def tokenize(image_id_to_captions: Dict[int, List[str]]) -> Dict[int, List[str]]:
r"""
    Given a mapping of image id to a list of corresponding captions, tokenize
    all captions according to the Penn Treebank Tokenizer and return a new
    mapping of the same structure. This method assumes the Stanford CoreNLP
    JAR file is present under the ``assets`` directory next to this module.
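    For example (illustrative), ``{42: ["A man riding a horse."]}`` would map
    to ``{42: ["a man riding a horse"]}`` after lowercasing and removal of
    PTB-style punctuation tokens.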
"""
# Path to the Stanford CoreNLP JAR file.
CORENLP_JAR = (
"assets/stanford-corenlp-full-2014-08-27/stanford-corenlp-3.4.1.jar"
)
# Prepare data for Tokenizer: write captions to a text file, one per line.
image_ids = [k for k, v in image_id_to_captions.items() for _ in range(len(v))]
sentences = "\n".join(
[c.replace("\n", " ") for k, v in image_id_to_captions.items() for c in v]
)
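    # The i-th entry of ``image_ids`` corresponds to the i-th caption in
    # ``sentences``, so the tokenizer output (one line per caption) can be
    # zipped back with ``image_ids`` by position.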
tmp_file = tempfile.NamedTemporaryFile(delete=False)
tmp_file.write(sentences.encode())
tmp_file.close()
# fmt: off
# Tokenize sentences. We use the JAR file for tokenization.
command = [
"java", "-cp", CORENLP_JAR, "edu.stanford.nlp.process.PTBTokenizer",
"-preserveLines", "-lowerCase", tmp_file.name
]
tokenized_captions = (
Popen(command, cwd=os.path.dirname(os.path.abspath(__file__)), stdout=PIPE)
.communicate(input=sentences.rstrip())[0]
.decode()
.split("\n")
)
# fmt: on
os.remove(tmp_file.name)
# Map tokenized captions back to their image IDs.
    # Punctuation tokens to be removed from the sentences (PTB style).
# fmt: off
PUNCTS = [
"''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", ".", "?",
"!", ",", ":", "-", "--", "...", ";",
]
# fmt: on
image_id_to_tokenized_captions: Dict[int, List[str]] = defaultdict(list)
for image_id, caption in zip(image_ids, tokenized_captions):
image_id_to_tokenized_captions[image_id].append(
" ".join([w for w in caption.rstrip().split(" ") if w not in PUNCTS])
)
return image_id_to_tokenized_captions
def cider(
predictions: Dict[int, List[str]],
ground_truth: Dict[int, List[str]],
n: int = 4,
sigma: float = 6.0,
) -> float:
r"""Compute CIDEr score given ground truth captions and predictions."""
# -------------------------------------------------------------------------
def to_ngrams(sentence: str, n: int = 4):
r"""Convert a sentence into n-grams and their counts."""
words = sentence.split()
counts = defaultdict(int) # type: ignore
for k in range(1, n + 1):
for i in range(len(words) - k + 1):
ngram = tuple(words[i : i + k])
counts[ngram] += 1
return counts
def counts2vec(cnts, document_frequency, log_reference_length):
r"""Function maps counts of ngram to vector of tfidf weights."""
vec = [defaultdict(float) for _ in range(n)]
length = 0
norm = [0.0 for _ in range(n)]
for (ngram, term_freq) in cnts.items():
df = np.log(max(1.0, document_frequency[ngram]))
# tf (term_freq) * idf (precomputed idf) for n-grams
vec[len(ngram) - 1][ngram] = float(term_freq) * (
log_reference_length - df
)
# Compute norm for the vector: will be used for computing similarity
norm[len(ngram) - 1] += pow(vec[len(ngram) - 1][ngram], 2)
if len(ngram) == 2:
length += term_freq
norm = [np.sqrt(nn) for nn in norm]
return vec, norm, length
def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
r"""Compute the cosine similarity of two vectors."""
delta = float(length_hyp - length_ref)
val = np.array([0.0 for _ in range(n)])
for nn in range(n):
for (ngram, count) in vec_hyp[nn].items():
val[nn] += (
min(vec_hyp[nn][ngram], vec_ref[nn][ngram]) * vec_ref[nn][ngram]
)
val[nn] /= (norm_hyp[nn] * norm_ref[nn]) or 1
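            # Penalize large length differences between candidate and
            # reference with a gaussian factor.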
val[nn] *= np.e ** (-(delta ** 2) / (2 * sigma ** 2))
return val
# -------------------------------------------------------------------------
ctest = [to_ngrams(predictions[image_id][0]) for image_id in ground_truth]
crefs = [
[to_ngrams(gt) for gt in ground_truth[image_id]] for image_id in ground_truth
]
# Build document frequency and compute IDF.
document_frequency = defaultdict(float)
for refs in crefs:
# refs, k ref captions of one image
for ngram in set([ngram for ref in refs for (ngram, count) in ref.items()]):
document_frequency[ngram] += 1
# Compute log reference length.
log_reference_length = np.log(float(len(crefs)))
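    # The IDF of an n-gram is ``log_reference_length - log(document_frequency)``;
    # it is applied inside ``counts2vec`` above.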
scores = []
for test, refs in zip(ctest, crefs):
# Compute vector for test captions.
vec, norm, length = counts2vec(
test, document_frequency, log_reference_length
)
# Compute vector for ref captions.
score = np.array([0.0 for _ in range(n)])
for ref in refs:
vec_ref, norm_ref, length_ref = counts2vec(
ref, document_frequency, log_reference_length
)
score += sim(vec, vec_ref, norm, norm_ref, length, length_ref)
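        # Average over n-gram orders and reference captions, then scale by 10
        # following the original CIDEr implementation.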
score_avg = np.mean(score)
score_avg /= len(refs)
score_avg *= 10.0
scores.append(score_avg)
return np.mean(scores)
def spice(
predictions: Dict[int, List[str]], ground_truth: Dict[int, List[str]]
) -> float:
r"""Compute SPICE score given ground truth captions and predictions."""
# Prepare temporary input file for the SPICE scorer.
input_data = [
{
"image_id": image_id,
"test": predictions[image_id][0],
"refs": ground_truth[image_id],
}
for image_id in ground_truth
]
# Create a temporary directory and dump input file to SPICE.
temp_dir = tempfile.mkdtemp()
INPUT_PATH = os.path.join(temp_dir, "input_file.json")
OUTPUT_PATH = os.path.join(temp_dir, "output_file.json")
json.dump(input_data, open(INPUT_PATH, "w"))
# fmt: off
# Run the command to execute SPICE jar.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
SPICE_JAR = f"{CURRENT_DIR}/assets/SPICE-1.0/spice-1.0.jar"
CACHE_DIR = f"{CURRENT_DIR}/assets/cache"
os.makedirs(CACHE_DIR, exist_ok=True)
spice_cmd = [
"java", "-jar", "-Xmx8G", SPICE_JAR, INPUT_PATH,
"-cache", CACHE_DIR, "-out", OUTPUT_PATH, "-subset", "-silent",
]
check_call(spice_cmd, cwd=CURRENT_DIR)
# fmt: on
# Read and process results
results = json.load(open(OUTPUT_PATH))
image_id_to_scores = {item["image_id"]: item["scores"] for item in results}
spice_scores = [
np.array(item["scores"]["All"]["f"]).astype(float) for item in results
]
return np.mean(spice_scores)