# CLAP training / evaluation loop.
# Provenance: duplicated from AIFILMS/audioldm-text-to-audio-generation (commit ddc593e).
import json
import logging
import math
import os
import time
from contextlib import suppress
import numpy as np
import torch
import torch.nn.functional as F
try:
import wandb
except ImportError:
wandb = None
from open_clip import ClipLoss, gather_features
from .distributed import is_master
from .zero_shot import zero_shot_eval
class AverageMeter(object):
    """Track the most recent value and a running (weighted) average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all accumulated statistics back to zero."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record `val` observed `n` times and refresh the running average."""
        self.val = val
        self.count = self.count + n
        self.sum = self.sum + val * n
        self.avg = self.sum / self.count
def unwrap_model(model):
    """Return the wrapped module when `model` is a DDP-style wrapper, else `model` itself."""
    return getattr(model, "module", model)
def train_one_epoch(
    model, data, epoch, optimizer, scaler, scheduler, args, tb_writer=None
):
    """Train `model` for one epoch over data["train"].

    Supports AMP (when args.precision == "amp"), Horovod, and either a single
    optimizer/scheduler or a dict of them (one entry per parameter group).
    Progress, loss, and logit-scale values are logged from the master process
    every 100 batches (and on the final batch) to logging/TensorBoard/wandb.

    Args:
        model: CLAP model; called as model(audios, texts, device) and expected
            to return audio/text features, their MLP-branch variants, and the
            two learnable logit scales.
        data: dict of dataset wrappers; only data["train"] is used here.
        epoch: 0-based epoch index (seeds the distributed sampler and offsets
            the global step).
        optimizer: torch optimizer, or dict name -> optimizer.
        scaler: torch.cuda.amp.GradScaler, or None for the non-AMP path.
        scheduler: callable taking the global step, or dict of such callables.
        args: experiment arguments (device, precision, distributed flags, ...).
        tb_writer: optional TensorBoard SummaryWriter (master only).
    """
    device = torch.device(args.device)
    autocast = torch.cuda.amp.autocast if args.precision == "amp" else suppress
    model.train()
    loss = ClipLoss(
        local_loss=args.local_loss,
        gather_with_grad=args.gather_with_grad,
        cache_labels=True,
        rank=args.rank,
        world_size=args.world_size,
        use_horovod=args.horovod,
        mlp_loss=args.clap_mlploss,
        weight_loss_kappa=args.kappa,
    )

    dataloader, sampler = data["train"].dataloader, data["train"].sampler
    if args.distributed and sampler is not None:
        # Reshuffle shards differently each epoch under DDP.
        sampler.set_epoch(epoch)
    num_batches_per_epoch = dataloader.num_batches
    # Digit width used to right-align sample counts in the progress log line.
    sample_digits = math.ceil(math.log(dataloader.num_samples + 1, 10))

    # for toy dataset
    if args.dataset_type == "toy":
        dataloader.dataset.generate_queue()

    loss_m = AverageMeter()
    batch_time_m = AverageMeter()
    data_time_m = AverageMeter()
    end = time.time()

    for i, batch in enumerate(dataloader):
        # logging.info(f"batch {i} of {num_batches_per_epoch}")
        # Global step drives the LR schedule(s).
        step = num_batches_per_epoch * epoch + i
        if isinstance(scheduler, dict):
            for s in scheduler.values():
                s(step)
        else:
            scheduler(step)
        audios = batch  # contains mel_spec, wavform, and longer list
        texts = batch["text"]
        # audios = audios.to(device=device, non_blocking=True)
        # texts = texts.to(device=device, non_blocking=True)

        data_time_m.update(time.time() - end)
        if isinstance(optimizer, dict):
            for o_ in optimizer.values():
                o_.zero_grad()
        else:
            optimizer.zero_grad()

        with autocast():
            (
                audio_features,
                text_features,
                audio_features_mlp,
                text_features_mlp,
                logit_scale_a,
                logit_scale_t,
            ) = model(audios, texts, device)

            if args.clap_mlploss:
                # Four-term 2x2 loss over raw and MLP-projected features.
                total_loss = loss(
                    audio_features=audio_features,
                    text_features=text_features,
                    logit_scale_a=logit_scale_a,
                    logit_scale_t=logit_scale_t,
                    audio_features_mlp=audio_features_mlp,
                    text_features_mlp=text_features_mlp,
                )
            else:
                total_loss = loss(
                    audio_features=audio_features,
                    text_features=text_features,
                    logit_scale_a=logit_scale_a,
                )
        if isinstance(optimizer, dict):
            if scaler is not None:
                scaler.scale(total_loss).backward()
                for o_ in optimizer.values():
                    if args.horovod:
                        # Horovod requires synchronize() before unscaling, and
                        # skip_synchronize() around step() to avoid doing it twice.
                        o_.synchronize()
                        scaler.unscale_(o_)
                        with o_.skip_synchronize():
                            scaler.step(o_)
                    else:
                        scaler.step(o_)
                scaler.update()
            else:
                total_loss.backward()
                for o_ in optimizer.values():
                    o_.step()
        else:
            if scaler is not None:
                scaler.scale(total_loss).backward()
                if args.horovod:
                    optimizer.synchronize()
                    scaler.unscale_(optimizer)
                    with optimizer.skip_synchronize():
                        scaler.step(optimizer)
                else:
                    scaler.step(optimizer)
                scaler.update()
            else:
                total_loss.backward()
                optimizer.step()

        # Note: we clamp to 4.6052 = ln(100), as in the original paper.
        with torch.no_grad():
            unwrap_model(model).logit_scale_a.clamp_(0, math.log(100))
            if args.clap_mlploss:
                unwrap_model(model).logit_scale_t.clamp_(0, math.log(100))

        batch_time_m.update(time.time() - end)
        end = time.time()
        batch_count = i + 1
        if is_master(args) and (i % 100 == 0 or batch_count == num_batches_per_epoch):
            if isinstance(audios, dict):
                batch_size = len(audios["waveform"])
            else:
                batch_size = len(audios)
            num_samples = batch_count * batch_size * args.world_size
            samples_per_epoch = dataloader.num_samples
            percent_complete = 100.0 * batch_count / num_batches_per_epoch

            # NOTE loss is coarsely sampled, just master node and per log update
            loss_m.update(total_loss.item(), batch_size)
            logit_scale_scalar_a = logit_scale_a.item()
            logit_scale_scalar_t = logit_scale_t.item()
            # The four branches below differ only in whether LR comes from one
            # optimizer or several, and whether the text logit scale is logged.
            if isinstance(optimizer, dict):
                if args.clap_mlploss:
                    logging.info(
                        f"Train Epoch: {epoch} [{num_samples:>{sample_digits}}/{samples_per_epoch} ({percent_complete:.0f}%)] "
                        f"Loss: {loss_m.val:#.5g} ({loss_m.avg:#.4g}) "
                        f"Data (t): {data_time_m.avg:.3f} "
                        f"Batch (t): {batch_time_m.avg:.3f} "
                        f"LR: {[o_.param_groups[0]['lr'] for o_ in optimizer.values()]} "
                        f"Logit Scale Audio: {logit_scale_scalar_a:.3f}"
                        f"Logit Scale Text: {logit_scale_scalar_t:.3f}"
                    )
                    log_data = {
                        "loss": loss_m.val,
                        "data_time": data_time_m.val,
                        "batch_time": batch_time_m.val,
                        "scale_audio": logit_scale_scalar_a,
                        "scale_text": logit_scale_scalar_t,
                        "lr": [o_.param_groups[0]["lr"] for o_ in optimizer.values()],
                    }
                else:
                    logging.info(
                        f"Train Epoch: {epoch} [{num_samples:>{sample_digits}}/{samples_per_epoch} ({percent_complete:.0f}%)] "
                        f"Loss: {loss_m.val:#.5g} ({loss_m.avg:#.4g}) "
                        f"Data (t): {data_time_m.avg:.3f} "
                        f"Batch (t): {batch_time_m.avg:.3f} "
                        f"LR: {[o_.param_groups[0]['lr'] for o_ in optimizer.values()]} "
                        f"Logit Scale Audio: {logit_scale_scalar_a:.3f}"
                    )
                    log_data = {
                        "loss": loss_m.val,
                        "data_time": data_time_m.val,
                        "batch_time": batch_time_m.val,
                        "scale_audio": logit_scale_scalar_a,
                        "lr": [o_.param_groups[0]["lr"] for o_ in optimizer.values()],
                    }
            else:
                if args.clap_mlploss:
                    logging.info(
                        f"Train Epoch: {epoch} [{num_samples:>{sample_digits}}/{samples_per_epoch} ({percent_complete:.0f}%)] "
                        f"Loss: {loss_m.val:#.5g} ({loss_m.avg:#.4g}) "
                        f"Data (t): {data_time_m.avg:.3f} "
                        f"Batch (t): {batch_time_m.avg:.3f} "
                        f"LR: {optimizer.param_groups[0]['lr']:5f} "
                        f"Logit Scale Audio: {logit_scale_scalar_a:.3f}"
                        f"Logit Scale Text: {logit_scale_scalar_t:.3f}"
                    )
                    # Save train loss / etc. Using non avg meter values as loggers have their own smoothing
                    log_data = {
                        "loss": loss_m.val,
                        "data_time": data_time_m.val,
                        "batch_time": batch_time_m.val,
                        "scale_audio": logit_scale_scalar_a,
                        "scale_text": logit_scale_scalar_t,
                        "lr": optimizer.param_groups[0]["lr"],
                    }
                else:
                    logging.info(
                        f"Train Epoch: {epoch} [{num_samples:>{sample_digits}}/{samples_per_epoch} ({percent_complete:.0f}%)] "
                        f"Loss: {loss_m.val:#.5g} ({loss_m.avg:#.4g}) "
                        f"Data (t): {data_time_m.avg:.3f} "
                        f"Batch (t): {batch_time_m.avg:.3f} "
                        f"LR: {optimizer.param_groups[0]['lr']:5f} "
                        f"Logit Scale Audio: {logit_scale_scalar_a:.3f}"
                    )
                    # Save train loss / etc. Using non avg meter values as loggers have their own smoothing
                    log_data = {
                        "loss": loss_m.val,
                        "data_time": data_time_m.val,
                        "batch_time": batch_time_m.val,
                        "scale_audio": logit_scale_scalar_a,
                        "lr": optimizer.param_groups[0]["lr"],
                    }

            for name, val in log_data.items():
                name = "train/" + name
                if tb_writer is not None:
                    tb_writer.add_scalar(name, val, step)
                if args.wandb:
                    assert wandb is not None, "Please install wandb."
                    wandb.log({name: val, "step": step})

            # resetting batch / data time meters per log window
            batch_time_m.reset()
            data_time_m.reset()
    # end for
def evaluate(model, data, epoch, args, tb_writer=None):
    """Run validation for `epoch` and return a flat dict of metrics.

    Two paths:
      * If args.val_dataset_names == ["Clotho", "audiocaps"], delegate to
        evaluate_clotho_audiocaps (5 captions per audio) and track the best
        epoch via select_top_metric_clotho_audiocaps.
      * Otherwise, accumulate audio/text features over data["val"] (per shard
        name and in aggregate under the key "all") and score them with
        get_metrics.

    Non-master processes return an empty dict unless args.parallel_eval is
    set, in which case every rank runs the forward pass but only the master
    aggregates features and computes metrics.
    """
    metrics = {}
    if not args.parallel_eval:
        # Single-process eval: every other rank bails out immediately.
        if not is_master(args):
            return metrics
    device = torch.device(args.device)
    model.eval()

    # CHANGE
    # zero_shot_metrics = zero_shot_eval(model, data, epoch, args)
    # metrics.update(zero_shot_metrics)
    if is_master(args):
        print("Evaluating...")
    autocast = torch.cuda.amp.autocast if args.precision == "amp" else suppress
    if args.val_dataset_names == ["Clotho", "audiocaps"]:
        # if only clotho and audiocaps are used, then we will use a different evaluation function.
        # This is because in the Clotho and audiocaps valid and test set, there are 5 text for 1 audio.
        if args.parallel_eval:
            # (yusong): just a hack here. Don't use parallel eval when evaluating only clotho and audiocaps.
            raise NotImplementedError(
                "Parallel evaluation not supported for eval only Clotho and audiocaps."
            )
        val_metrics_per_dataset = evaluate_clotho_audiocaps(
            model, data, epoch, args, autocast, device, tb_writer
        )
        for m in val_metrics_per_dataset.values():
            metrics.update(m)
        if "epoch" not in metrics.keys():
            metrics.update({"epoch": epoch})
        metrics = select_top_metric_clotho_audiocaps(
            metrics, val_metrics_per_dataset, args
        )
    elif "val" in data and (
        args.val_frequency
        and ((epoch % args.val_frequency) == 0 or epoch == args.epochs)
    ):
        dataloader = data["val"].dataloader
        num_samples = 0
        samples_per_val = dataloader.num_samples

        # FIXME this does not scale past small eval datasets
        # all_audio_features @ all_text_features will blow up memory and compute very quickly
        # eval_info maps dataset/shard name -> accumulated CPU feature lists;
        # the "all" entry aggregates every sample regardless of shard.
        eval_info = {}
        if args.clap_mlploss:
            eval_info["all"] = {
                "cumulative_loss": 0.0,
                "num_samples": 0,
                "all_audio_features": [],
                "all_text_features": [],
                "all_audio_features_mlp": [],
                "all_text_features_mlp": [],
            }  # cumulative_loss = 0.0
        else:
            eval_info["all"] = {
                "cumulative_loss": 0.0,
                "num_samples": 0,
                "all_audio_features": [],
                "all_text_features": [],
            }  # cumu
        # all_audio_features, all_text_features, all_audio_features_mlp, all_text_features_mlp = [], [], [], []
        with torch.no_grad():
            for i, batch in enumerate(dataloader):
                audios = batch  # contains mel_spec, wavform, and longer list
                texts = batch["text"]
                # audios = audios.to(device=device, non_blocking=True)

                # Dataset names are derived from the webdataset shard URLs
                # (second- and third-to-last path components joined by "-").
                all_names = list(
                    set(["-".join(b.split("/")[-3:-1]) for b in batch["__url__"]])
                )
                for name in all_names:
                    if name not in eval_info.keys():
                        if args.clap_mlploss:
                            eval_info[name] = {
                                "cumulative_loss": 0.0,
                                "num_samples": 0,
                                "all_audio_features": [],
                                "all_text_features": [],
                                "all_audio_features_mlp": [],
                                "all_text_features_mlp": [],
                            }
                        else:
                            eval_info[name] = {
                                "cumulative_loss": 0.0,
                                "num_samples": 0,
                                "all_audio_features": [],
                                "all_text_features": [],
                            }
                with autocast():
                    (
                        audio_features,
                        text_features,
                        audio_features_mlp,
                        text_features_mlp,
                        logit_scale_a,
                        logit_scale_t,
                    ) = model(audios, texts, device)

                    if args.parallel_eval:
                        # multi-GPU eval
                        if args.clap_mlploss:
                            (
                                audio_features,
                                text_features,
                                audio_features_mlp,
                                text_features_mlp,
                            ) = gather_features(
                                audio_features=audio_features,
                                text_features=text_features,
                                audio_features_mlp=audio_features_mlp,
                                text_features_mlp=text_features_mlp,
                                local_loss=False,
                                gather_with_grad=False,
                                rank=args.rank,
                                world_size=args.world_size,
                                use_horovod=args.horovod,
                                mlp_loss=args.clap_mlploss,
                            )
                        else:
                            (audio_features, text_features,) = gather_features(
                                audio_features=audio_features,
                                text_features=text_features,
                                local_loss=False,
                                gather_with_grad=False,
                                rank=args.rank,
                                world_size=args.world_size,
                                use_horovod=args.horovod,
                                mlp_loss=args.clap_mlploss,
                            )

                    if is_master(args):
                        num_samples += audio_features.shape[0]
                        # Accumulate features on CPU, both per-shard and in "all".
                        for n in [*all_names, "all"]:
                            if n == "all":
                                eval_info[n]["all_audio_features"].append(
                                    audio_features.cpu()
                                )
                                eval_info[n]["all_text_features"].append(
                                    text_features.cpu()
                                )
                                if args.clap_mlploss:
                                    eval_info[n]["all_audio_features_mlp"].append(
                                        audio_features_mlp.cpu()
                                    )
                                    eval_info[n]["all_text_features_mlp"].append(
                                        text_features_mlp.cpu()
                                    )
                            else:
                                # Select only the rows of this batch that came
                                # from shard `n`.
                                idx = np.where(
                                    np.array(
                                        [
                                            "-".join(b.split("/")[-3:-1])
                                            for b in batch["__url__"]
                                        ]
                                    )
                                    == n
                                )[0]
                                eval_info[n]["all_audio_features"].append(
                                    audio_features.cpu().index_select(
                                        0, torch.tensor(idx).long()
                                    )
                                )
                                eval_info[n]["all_text_features"].append(
                                    text_features.cpu().index_select(
                                        0, torch.tensor(idx).long()
                                    )
                                )
                                if args.clap_mlploss:
                                    eval_info[n]["all_audio_features_mlp"].append(
                                        audio_features_mlp.cpu().index_select(
                                            0, torch.tensor(idx).long()
                                        )
                                    )
                                    eval_info[n]["all_text_features_mlp"].append(
                                        text_features_mlp.cpu().index_select(
                                            0, torch.tensor(idx).long()
                                        )
                                    )
                    #  print(f'eval step {i}') # (yusong): for debug

                # cumulative_loss += total_loss * batch_size
                # num_samples += batch_size
                if is_master(args) and (i % 100) == 0:  # and i != 0:
                    logging.info(
                        f"Eval Epoch: {epoch} [{num_samples} / {samples_per_val}]"
                    )
            if is_master(args):
                val_metrics_per_dataset = {}
                # Score each shard (and "all") from the concatenated features.
                for n in eval_info.keys():
                    if args.clap_mlploss:
                        metrics_single_dataset = get_metrics(
                            audio_features=torch.cat(
                                eval_info[n]["all_audio_features"]
                            ),
                            text_features=torch.cat(eval_info[n]["all_text_features"]),
                            logit_scale_a=logit_scale_a.cpu(),
                            audio_features_mlp=torch.cat(
                                eval_info[n]["all_audio_features_mlp"]
                            ),
                            text_features_mlp=torch.cat(
                                eval_info[n]["all_text_features_mlp"]
                            ),
                            logit_scale_t=logit_scale_t.cpu(),
                            mlp_loss=args.clap_mlploss,
                        )
                    else:
                        metrics_single_dataset = get_metrics(
                            audio_features=torch.cat(
                                eval_info[n]["all_audio_features"]
                            ),
                            text_features=torch.cat(eval_info[n]["all_text_features"]),
                            logit_scale_a=logit_scale_a.cpu(),
                            mlp_loss=args.clap_mlploss,
                        )
                    val_metrics_per_dataset[n] = {
                        n + "/" + k: v for k, v in metrics_single_dataset.items()
                    }
                    metrics.update(val_metrics_per_dataset[n])
                    if "epoch" not in metrics.keys():
                        metrics.update({"epoch": epoch})
    if is_master(args):
        if not metrics:
            return metrics
        logging.info(
            f"Eval Epoch: {epoch} "
            + "\n".join(
                [
                    "\t".join([f"{k}: {round(v, 4):.4f}" for k, v in m.items()])
                    for m in val_metrics_per_dataset.values()
                ]
            )
        )
        if args.save_logs:
            for name, val in metrics.items():
                if tb_writer is not None:
                    tb_writer.add_scalar(f"val/{name}", val, epoch)

            with open(os.path.join(args.checkpoint_path, "results.jsonl"), "a+") as f:
                f.write(json.dumps(metrics))
                f.write("\n")

        if args.wandb:
            assert wandb is not None, "Please install wandb."
            for name, val in metrics.items():
                wandb.log({f"val/{name}": val, "epoch": epoch})

        return metrics
    else:
        return metrics
def get_metrics(
    audio_features,
    text_features,
    logit_scale_a,
    audio_features_mlp=None,
    text_features_mlp=None,
    logit_scale_t=None,
    mlp_loss=False,
):
    """Compute cross-modal retrieval metrics for paired audio/text features.

    Sample i of `audio_features` is assumed to match sample i of
    `text_features`. Returns a dict containing:
      * "cumulative_loss": symmetric cross-entropy over the similarity matrix
        (averaged over 4 terms in the mlp_loss case, 2 otherwise);
      * "num_samples";
      * per direction ("audio_to_text", "text_to_audio"): mean rank, median
        rank, R@{1,5,10}, and mAP@10.

    Args:
        audio_features: (N, D) tensor of audio embeddings.
        text_features: (N, D) tensor of text embeddings.
        logit_scale_a: scalar temperature for the audio->text-MLP similarity.
        audio_features_mlp / text_features_mlp: MLP-branch embeddings,
            required when mlp_loss is True.
        logit_scale_t: scalar temperature for the audio-MLP->text similarity,
            required when mlp_loss is True.
        mlp_loss: use the 2x2 CLAP loss over raw and MLP-projected features.
    """
    metrics = {}
    # Ground-truth pairing is the diagonal: sample i matches caption i.
    labels = torch.arange(audio_features.shape[0]).long()
    if mlp_loss:
        # Set up audio to text & text to audio similarity matrices, crossing
        # the raw features of one modality with the MLP branch of the other.
        a_logits_per_audio = (
            (logit_scale_a * audio_features @ text_features_mlp.t()).detach().cpu()
        )
        a_logits_per_text = a_logits_per_audio.t()
        t_logits_per_audio = (
            (logit_scale_t * audio_features_mlp @ text_features.t()).detach().cpu()
        )
        t_logits_per_text = t_logits_per_audio.t()

        # Change the loss from two terms into four terms with 2x2 combined CE loss
        total_loss = (
            F.cross_entropy(a_logits_per_audio, labels)
            + F.cross_entropy(a_logits_per_text, labels)
            + F.cross_entropy(t_logits_per_audio, labels)
            + F.cross_entropy(t_logits_per_text, labels)
        ) / 4

        # Rank on the average of the two similarity matrices.
        logits = {
            "audio_to_text": (a_logits_per_audio + t_logits_per_audio) / 2,
            "text_to_audio": (a_logits_per_text + t_logits_per_text) / 2,
        }
    else:
        logits_per_audio = (
            (logit_scale_a * audio_features @ text_features.t()).detach().cpu()
        )
        logits_per_text = logits_per_audio.t()

        total_loss = (
            F.cross_entropy(logits_per_audio, labels)
            + F.cross_entropy(logits_per_text, labels)
        ) / 2

        logits = {"audio_to_text": logits_per_audio, "text_to_audio": logits_per_text}

    metrics["cumulative_loss"] = total_loss.item()
    metrics["num_samples"] = audio_features.shape[0]
    ground_truth = torch.arange(len(text_features)).view(-1, 1)

    for name, logit in logits.items():
        ranking = torch.argsort(logit, descending=True)
        # 0-based rank of the true match in each row's sorted candidate list.
        preds = torch.where(ranking == ground_truth)[
            1
        ]  # (yusong) this line is slow because it uses single thread
        preds = preds.detach().cpu().numpy()
        metrics[f"{name}_mean_rank"] = preds.mean() + 1
        metrics[f"{name}_median_rank"] = np.floor(np.median(preds)) + 1
        for k in [1, 5, 10]:
            metrics[f"{name}_R@{k}"] = np.mean(preds < k)
        # map@10: reciprocal 1-based rank, zeroed when the match ranks >= 10.
        metrics[f"{name}_mAP@10"] = np.mean(np.where(preds < 10, 1 / (preds + 1), 0.0))

    return metrics
def evaluate_clotho_audiocaps(
    model, data, epoch, args, autocast, device, tb_writer=None
):
    """
    Adapted from https://github.com/XinhaoMei/audio-text_retrieval/blob/main/tools/utils.py.
    1. for text-to-audio retrieval, do 5 times and average the results
    2. for R@1, R@5, R@10 in audio-to-text retrieval, take the best rank among 5 text
    3. for map@10 in audio-to-text retrieval:
        3.1: sort the rank of 5 text
        3.2: exclude the rank >=10 (0-index)
        3.3: compute the map regarding the remaining ranks: np.mean(np.arange(1, len(ranks)+1) / ranks).
        (3.3) That is, take the top ranks of 5 text that is < 10, and assign the descending number as ground truth.
        (3.3) E.g.: the ground truth of first rank of the 5 text should be 1, the second rank should be 2, etc.
    """
    # TODO: (yusong) only support single GPU evaluation and only support non-mlp case for now.
    dataloader = data["val"].dataloader
    with torch.no_grad():
        # eval_info maps dataset/shard name -> accumulated CPU feature lists.
        eval_info = {}
        for i, batch in enumerate(dataloader):
            audios = batch  # contains mel_spec, wavform, and longer list

            # each item in the list has 5 texts
            if args.tmodel == "transformer":
                from open_clip import tokenize

                texts = [tokenize(t) for t in batch["full_text"]]
                texts = torch.cat(texts)
            else:
                from .data import tokenizer

                texts = [
                    tokenizer(t) for t in batch["full_text"]
                ]  # 5 texts for each audio
                texts = {
                    k: torch.cat([t[k] for t in texts]) for k in texts[0].keys()
                }  # 5 x batch

            # audios = audios.to(device=device, non_blocking=True)

            # Dataset names come from the webdataset shard URL path components.
            all_names = list(
                set(["-".join(b.split("/")[-3:-1]) for b in batch["__url__"]])
            )
            for name in all_names:
                if name not in eval_info.keys():
                    # we will not use mlp outputs even if args.clap_mlploss=True
                    eval_info[name] = {
                        "cumulative_loss": 0.0,
                        "num_samples": 0,
                        "all_audio_features": [],
                        "all_text_features": [],
                    }
            with autocast():
                # Encode each modality separately; the model presumably routes
                # a None argument to skip the other tower.
                audio_features = model(audios, None, device)
                text_features = model(None, texts, device)
                audio_features = F.normalize(audio_features, dim=-1)
                text_features = F.normalize(text_features, dim=-1)

                all_names = list(
                    set(["-".join(b.split("/")[-3:-1]) for b in batch["__url__"]])
                )
                for n in all_names:
                    idx = np.where(
                        np.array(
                            ["-".join(b.split("/")[-3:-1]) for b in batch["__url__"]]
                        )
                        == n
                    )[0]
                    eval_info[n]["all_audio_features"].append(
                        audio_features.cpu().index_select(0, torch.tensor(idx).long())
                    )
                    # (yusong) please double-check. This is for selecting 5 text features at once.
                    # because idx is a list of indices in size of num_samples,
                    # and text_features is a tensor of size (5*num_samples, dim)
                    # so we need to select 5 consecutive indices at once for a single index in idx.
                    eval_info[n]["all_text_features"].append(
                        text_features.cpu()
                        .reshape([-1, 5, text_features.shape[1]])
                        .index_select(0, torch.tensor(idx).long())
                        .reshape([-1, text_features.shape[1]])
                    )

        val_metrics_all = {}

        for n in eval_info.keys():
            # Calling the model with (None, None) presumably returns the two
            # logit scales — TODO confirm against the model's forward().
            logit_scale_a, logit_scale_t = model(None, None, device)
            logit_scale_a = logit_scale_a.cpu()

            audio_features = torch.cat(eval_info[n]["all_audio_features"], dim=0)
            text_features = torch.cat(eval_info[n]["all_text_features"], dim=0)

            logits_per_audio = (
                (logit_scale_a * audio_features @ text_features.t()).detach().cpu()
            )
            logits_per_text = logits_per_audio.t().detach().cpu()

            # logits_per_audio shape: [num_samples, num_samples*5]
            # logits_per_text shape: [num_samples*5, num_samples]
            logging.info(
                f"dataset {n}, logits_per_audio shape: {logits_per_audio.shape}, "
                f"logits_per_text shape: {logits_per_text.shape}"
            )

            metrics = {}
            num_samples = audio_features.shape[0]
            metrics[f"num_samples"] = num_samples

            # (yusong) the following code is very important, please double-check:
            # logits_per_audio.reshape(num_samples, num_samples, 5)[:, :, d]
            # logits_per_text.reshape(num_samples, 5, num_samples)[:, d, :]
            # Those two are retrieving one of the 5 text for each audio.
            labels = torch.arange(audio_features.shape[0]).long()
            audio_to_text_loss = [
                F.cross_entropy(
                    logits_per_audio.reshape(num_samples, num_samples, 5)[:, :, d],
                    labels,
                )
                for d in range(5)
            ]
            text_to_audio_loss = [
                F.cross_entropy(
                    logits_per_text.reshape(num_samples, 5, num_samples)[:, d, :],
                    labels,
                )
                for d in range(5)
            ]
            total_loss = (np.mean(audio_to_text_loss) + np.mean(text_to_audio_loss)) / 2

            metrics[f"cumulative_loss"] = total_loss.item()

            # text to audio: do 5 times
            pred_text = []
            for d in range(5):
                logit = logits_per_text.reshape(num_samples, 5, num_samples)[:, d, :]
                ground_truth = torch.arange(len(logit)).view(-1, 1)
                ranking = torch.argsort(
                    logit, descending=True
                )  # [num_samples, num_samples]
                preds = torch.where(ranking == ground_truth)[1]
                pred_text.append(preds.detach().cpu().numpy())
            pred_text_concat = np.concatenate(pred_text, axis=0)  # [5*num_samples]
            metrics[f"text_to_audio_mean_rank"] = pred_text_concat.mean() + 1
            metrics[f"text_to_audio_median_rank"] = (
                np.floor(np.median(pred_text_concat)) + 1
            )
            for k in [1, 5, 10]:
                metrics[f"text_to_audio_R@{k}"] = np.mean(pred_text_concat < k)
            # map@10
            metrics[f"text_to_audio_mAP@10"] = np.mean(
                np.where(pred_text_concat < 10, 1 / (pred_text_concat + 1), 0.0)
            )

            # audio to text: take the best result
            # for audio to text map 10, sort and assign descending ground truth.
            # see https://github.com/XinhaoMei/audio-text_retrieval/blob/main/tools/utils.py#L103
            # map@10
            map_all = []
            pred_audio_all = []
            for d in range(num_samples):
                # logits_per_audio: [num_samples, num_samples*5]
                logit_single = logits_per_audio[d, :]  # [5*num_samples]
                # Ground-truth index: [d*5, d*5+1, d*5+2, d*5+3, d*5+4]
                ranking = torch.argsort(
                    logit_single, descending=True
                )  # [5*num_samples]
                # ranking: the index of first match, second match, ...
                ground_truth = torch.arange(d * 5, d * 5 + 5)[None]
                all_pred = torch.where(
                    torch.stack([ranking] * 5) == ground_truth.view(-1, 1)
                )[1]
                min_pred = torch.min(all_pred)
                pred_audio_all.append(min_pred.detach().cpu().numpy())
                all_pred_filter = all_pred[all_pred < 10].detach().cpu().numpy()
                # /5 because we have 5 text, so it means for the text rank >=10 we count as 0.
                map_single = (
                    np.sum(
                        (np.arange(1, len(all_pred_filter) + 1) / (all_pred_filter + 1))
                    )
                    / 5
                )
                map_all.append(map_single)
            metrics[f"audio_to_text_mAP@10"] = np.mean(map_all)
            for k in [1, 5, 10]:
                metrics[f"audio_to_text_R@{k}"] = np.mean(np.array(pred_audio_all) < k)

            val_metrics_all[n] = {n + "/" + k: v for k, v in metrics.items()}
    return val_metrics_all
def calculate_selection_performance_clotho_audiocaps(val_metrics_per_dataset):
    """
    Calculate performance for Clotho+AudioCaps for model selection.

    The score is the mean over datasets of the average of the two
    directional mAP@10 metrics (audio->text and text->audio).
    """
    per_dataset_scores = [
        (
            dataset_metrics[f"{name}/audio_to_text_mAP@10"]
            + dataset_metrics[f"{name}/text_to_audio_mAP@10"]
        )
        / 2
        for name, dataset_metrics in val_metrics_per_dataset.items()
    ]
    return np.mean(per_dataset_scores)
def select_top_metric_clotho_audiocaps(metrics, val_metrics_per_dataset, args):
    """Track the best Clotho/AudioCaps validation epoch across calls.

    The selection score is the mAP@10 average computed by
    calculate_selection_performance_clotho_audiocaps. On the first call, or
    whenever the new score beats the stored best, the current per-dataset
    metrics are snapshotted under "<dataset>-top/<metric>" keys (plus
    "top_selection_performance" and "top-selection-epoch") and cached on
    `args`; otherwise the previously cached snapshot is re-reported.

    Args:
        metrics: flat metrics dict for the current epoch; must contain "epoch".
            Updated in place with the "top" snapshot and returned.
        val_metrics_per_dataset: dict dataset name -> {"<name>/<metric>": value}.
        args: experiment arguments; used as mutable cross-call storage
            (attributes `top_metric` and `top_selection_performance`).
    """
    selection_performance = calculate_selection_performance_clotho_audiocaps(
        val_metrics_per_dataset
    )
    # First call ever, or a new best score: snapshot the current metrics.
    # (Resolves the original TODO: both branches shared this code verbatim.)
    if (
        not hasattr(args, "top_selection_performance")
        or selection_performance > args.top_selection_performance
    ):
        metric_update = {}
        for n in val_metrics_per_dataset.keys():
            for k in val_metrics_per_dataset[n].keys():
                metric_update[
                    k.split("/")[0] + "-top" + "/" + k.split("/")[1]
                ] = val_metrics_per_dataset[n][k]
        metric_update["top_selection_performance"] = selection_performance
        metric_update["top-selection-epoch"] = metrics["epoch"]
        metrics.update(metric_update)
        args.top_metric = metric_update
        args.top_selection_performance = selection_performance
    else:
        # No improvement: re-report the previously saved best snapshot.
        metrics.update(args.top_metric)
    return metrics