from contextlib import suppress
import torch
import torch.nn.functional as F
from tqdm import tqdm
def evaluate(model, dataloader, tokenizer, device, amp=True, recall_k_list=[5]):
    """
    Evaluate the model on the given dataset.

    Parameters
    ----------
    model: torch.nn.Module
        CLIP-like model with `encode_image` and `encode_text`
    dataloader: torch.utils.data.DataLoader
        dataloader to use for evaluation
    tokenizer:
        text tokenizer, i.e. converts a list of strings to a torch.Tensor of integer token ids
    device: cpu/cuda
    amp: whether to use automatic mixed precision
    recall_k_list: list of int
        recall@k k's to use

    Returns
    -------
    dict of retrieval metrics
    """
    # list of batches of image embeddings
    batch_images_emb_list = []
    # list of batches of text embeddings
    batch_texts_emb_list = []
    # for each text, we collect the corresponding image index, as each image can have multiple corresponding texts
    texts_image_index = []
    dataloader = dataloader_with_indices(dataloader)
    # contextlib.suppress() with no arguments is a no-op context manager, so the
    # same `with ... autocast():` line works whether or not AMP is enabled
    autocast = torch.cuda.amp.autocast if amp else suppress
    for batch_images, batch_texts, inds in tqdm(dataloader):
        batch_images = batch_images.to(device)
        # tokenize all texts in the batch
        batch_texts_tok = tokenizer([text for texts in batch_texts for text in texts]).to(device)
        # store the index of the image for each text
        batch_texts_image_index = [ind for ind, texts in zip(inds, batch_texts) for text in texts]
        # compute the embeddings of images and texts
        with torch.no_grad(), autocast():
            batch_images_emb = F.normalize(model.encode_image(batch_images), dim=-1)
            batch_texts_emb = F.normalize(model.encode_text(batch_texts_tok), dim=-1)
        batch_images_emb_list.append(batch_images_emb.cpu())
        batch_texts_emb_list.append(batch_texts_emb.cpu())
        texts_image_index.extend(batch_texts_image_index)

    # use the first batch's size as the chunk size for the batchified metric computation
    batch_size = len(batch_images_emb_list[0])

    # concatenate all embeddings
    images_emb = torch.cat(batch_images_emb_list)
    texts_emb = torch.cat(batch_texts_emb_list)

    # get the score for each text and image pair
    scores = texts_emb @ images_emb.t()

    # construct the positive-pair matrix, which tells whether each text-image pair is a positive or not
    positive_pairs = torch.zeros_like(scores, dtype=torch.bool)
    positive_pairs[torch.arange(len(scores)), texts_image_index] = True

    metrics = {}
    for recall_k in recall_k_list:
        # Note that recall_at_k computes the **actual** recall, i.e. nb_true_positives / nb_positives, where the
        # number of true positives, e.g. for text retrieval, is, for each image, the number of retrieved texts
        # matching that image among the top-k. The number of positives is the total number of texts matching
        # the image in the dataset; since each image has a set of captions, that number can be greater than 1
        # for text retrieval.
        # However, recall@k for image/text retrieval, as reported in CLIP-like papers, is a bit different:
        # there it is, for each image, either 1 or 0 -- 1 if at least one text matches the image among the top-k.
        # We can easily derive that from the actual recall by checking whether there is at least one true
        # positive, i.e. whether the actual recall is greater than 0. Once we have that per image (or per text),
        # we average it over the dataset. (See the worked example after `recall_at_k` below.)
        metrics[f"image_retrieval_recall@{recall_k}"] = (batchify(recall_at_k, scores, positive_pairs, batch_size, device, k=recall_k) > 0).float().mean().item()
        metrics[f"text_retrieval_recall@{recall_k}"] = (batchify(recall_at_k, scores.T, positive_pairs.T, batch_size, device, k=recall_k) > 0).float().mean().item()
    return metrics
def dataloader_with_indices(dataloader):
    """Wrap a dataloader so each batch also yields the global sample indices of its items."""
    start = 0
    for x, y in dataloader:
        end = start + len(x)
        inds = torch.arange(start, end)
        yield x, y, inds
        start = end
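
# For example, wrapping a dataloader that yields batches of 2 samples produces
# index tensors tensor([0, 1]), tensor([2, 3]), ...; `evaluate` uses these global
# image indices to map every caption back to its image, since one image can have
# several captions.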
def recall_at_k(scores, positive_pairs, k):
    """
    Compute the recall at k for each sample
    :param scores: compatibility score between text and image embeddings (nb texts, nb images)
    :param k: number of images to consider per text, for retrieval
    :param positive_pairs: boolean matrix of positive pairs (nb texts, nb images)
    :return: recall at k for each text, shape (nb texts,)
    """
    nb_texts, nb_images = scores.shape
    # for each text, find the k images with the highest scores
    topk_indices = torch.topk(scores, k, dim=1)[1]
    # compute the number of positives for each text
    nb_positive = positive_pairs.sum(dim=1)
    # one-hot encode the top-k indices: shape (nb_texts, k, nb_images)
    topk_indices_onehot = torch.nn.functional.one_hot(topk_indices, num_classes=nb_images)
    # broadcast the positive pairs against the one-hot top-k
    positive_pairs_reshaped = positive_pairs.view(nb_texts, 1, nb_images)
    # a true positive is a positive that appears among the top-k
    nb_true_positive = (topk_indices_onehot * positive_pairs_reshaped).sum(dim=(1, 2))
    # recall at k = fraction of each text's positives retrieved in the top-k
    return nb_true_positive / nb_positive
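
# Worked example (illustrative only, not part of the evaluation API): two images
# and three captions, where captions 0 and 1 describe image 0 and caption 2
# describes image 1. With k=1, `recall_at_k` returns the *actual* per-text
# recall, which `evaluate` then thresholds (> 0) and averages to get CLIP-style
# recall@k:
#
#   scores = torch.tensor([[0.9, 0.1],
#                          [0.2, 0.8],   # caption 1 ranks the wrong image first
#                          [0.3, 0.7]])
#   positive_pairs = torch.tensor([[True, False],
#                                  [True, False],
#                                  [False, True]])
#   recall_at_k(scores, positive_pairs, k=1)                        # tensor([1., 0., 1.])
#   (recall_at_k(scores, positive_pairs, k=1) > 0).float().mean()   # 2/3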
def batchify(func, X, Y, batch_size, device, *args, **kwargs):
    """Apply `func` to (X, Y) in chunks of `batch_size`, moving each chunk to `device`, and concatenate the results on CPU."""
    results = []
    for start in range(0, len(X), batch_size):
        end = start + batch_size
        x = X[start:end].to(device)
        y = Y[start:end].to(device)
        result = func(x, y, *args, **kwargs).cpu()
        results.append(result)
    return torch.cat(results)
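
if __name__ == "__main__":
    # Minimal usage sketch, not a definitive runner. It assumes an `open_clip`
    # model and a retrieval-style dataloader whose batches are
    # (batch_images, batch_texts), where batch_texts[i] is the list of captions
    # for image i (e.g. COCO or Flickr30k); building that dataloader is
    # dataset-specific and left out here.
    import open_clip  # assumed dependency for this sketch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, _, preprocess = open_clip.create_model_and_transforms(
        "ViT-B-32", pretrained="laion2b_s34b_b79k"
    )
    tokenizer = open_clip.get_tokenizer("ViT-B-32")
    model = model.to(device).eval()

    # dataloader = ...  # yields (images, list-of-caption-lists) batches
    # metrics = evaluate(model, dataloader, tokenizer, device, recall_k_list=[1, 5, 10])
    # print(metrics)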