# CLAP training / evaluation loop.
# Provenance: duplicated from AIFILMS/audioldm-text-to-audio-generation (commit ddc593e).
import json
import logging
import math
import os
import time
from contextlib import suppress
import numpy as np
import torch
import torch.nn.functional as F
try:
import wandb
except ImportError:
wandb = None
from open_clip import ClipLoss, gather_features
from .distributed import is_master
from .zero_shot import zero_shot_eval
class AverageMeter(object):
    """Track the most recent value and a running (weighted) average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all accumulated statistics back to zero."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record `val` observed `n` times and refresh the running average."""
        self.val = val
        self.count = self.count + n
        self.sum = self.sum + val * n
        self.avg = self.sum / self.count
def unwrap_model(model):
    """Return the wrapped module when `model` is a DDP-style wrapper, else `model` itself."""
    return getattr(model, "module", model)
def train_one_epoch(
    model, data, epoch, optimizer, scaler, scheduler, args, tb_writer=None
):
    """Train `model` for one epoch over data["train"].

    Supports AMP (when args.precision == "amp"), Horovod, and either a single
    optimizer/scheduler or a dict of them (one entry per parameter group).
    Progress, loss, and logit-scale values are logged from the master process
    every 100 batches (and on the final batch) to logging/TensorBoard/wandb.

    Args:
        model: CLAP model; called as model(audios, texts, device) and expected
            to return audio/text features, their MLP-branch variants, and the
            two learnable logit scales.
        data: dict of dataset wrappers; only data["train"] is used here.
        epoch: 0-based epoch index (seeds the distributed sampler and offsets
            the global step).
        optimizer: torch optimizer, or dict name -> optimizer.
        scaler: torch.cuda.amp.GradScaler, or None for the non-AMP path.
        scheduler: callable taking the global step, or dict of such callables.
        args: experiment arguments (device, precision, distributed flags, ...).
        tb_writer: optional TensorBoard SummaryWriter (master only).
    """
    device = torch.device(args.device)
    autocast = torch.cuda.amp.autocast if args.precision == "amp" else suppress
    model.train()
    loss = ClipLoss(
        local_loss=args.local_loss,
        gather_with_grad=args.gather_with_grad,
        cache_labels=True,
        rank=args.rank,
        world_size=args.world_size,
        use_horovod=args.horovod,
        mlp_loss=args.clap_mlploss,
        weight_loss_kappa=args.kappa,
    )

    dataloader, sampler = data["train"].dataloader, data["train"].sampler
    if args.distributed and sampler is not None:
        # Reshuffle shards differently each epoch under DDP.
        sampler.set_epoch(epoch)
    num_batches_per_epoch = dataloader.num_batches
    # Digit width used to right-align sample counts in the progress log line.
    sample_digits = math.ceil(math.log(dataloader.num_samples + 1, 10))

    # for toy dataset
    if args.dataset_type == "toy":
        dataloader.dataset.generate_queue()

    loss_m = AverageMeter()
    batch_time_m = AverageMeter()
    data_time_m = AverageMeter()
    end = time.time()

    for i, batch in enumerate(dataloader):
        # logging.info(f"batch {i} of {num_batches_per_epoch}")
        # Global step drives the LR schedule(s).
        step = num_batches_per_epoch * epoch + i
        if isinstance(scheduler, dict):
            for s in scheduler.values():
                s(step)
        else:
            scheduler(step)
        audios = batch  # contains mel_spec, wavform, and longer list
        texts = batch["text"]
        # audios = audios.to(device=device, non_blocking=True)
        # texts = texts.to(device=device, non_blocking=True)

        data_time_m.update(time.time() - end)
        if isinstance(optimizer, dict):
            for o_ in optimizer.values():
                o_.zero_grad()
        else:
            optimizer.zero_grad()

        with autocast():
            (
                audio_features,
                text_features,
                audio_features_mlp,
                text_features_mlp,
                logit_scale_a,
                logit_scale_t,
            ) = model(audios, texts, device)

            if args.clap_mlploss:
                # Four-term 2x2 loss over raw and MLP-projected features.
                total_loss = loss(
                    audio_features=audio_features,
                    text_features=text_features,
                    logit_scale_a=logit_scale_a,
                    logit_scale_t=logit_scale_t,
                    audio_features_mlp=audio_features_mlp,
                    text_features_mlp=text_features_mlp,
                )
            else:
                total_loss = loss(
                    audio_features=audio_features,
                    text_features=text_features,
                    logit_scale_a=logit_scale_a,
                )
        if isinstance(optimizer, dict):
            if scaler is not None:
                scaler.scale(total_loss).backward()
                for o_ in optimizer.values():
                    if args.horovod:
                        # Horovod requires synchronize() before unscaling, and
                        # skip_synchronize() around step() to avoid doing it twice.
                        o_.synchronize()
                        scaler.unscale_(o_)
                        with o_.skip_synchronize():
                            scaler.step(o_)
                    else:
                        scaler.step(o_)
                scaler.update()
            else:
                total_loss.backward()
                for o_ in optimizer.values():
                    o_.step()
        else:
            if scaler is not None:
                scaler.scale(total_loss).backward()
                if args.horovod:
                    optimizer.synchronize()
                    scaler.unscale_(optimizer)
                    with optimizer.skip_synchronize():
                        scaler.step(optimizer)
                else:
                    scaler.step(optimizer)
                scaler.update()
            else:
                total_loss.backward()
                optimizer.step()

        # Note: we clamp to 4.6052 = ln(100), as in the original paper.
        with torch.no_grad():
            unwrap_model(model).logit_scale_a.clamp_(0, math.log(100))
            if args.clap_mlploss:
                unwrap_model(model).logit_scale_t.clamp_(0, math.log(100))

        batch_time_m.update(time.time() - end)
        end = time.time()
        batch_count = i + 1
        if is_master(args) and (i % 100 == 0 or batch_count == num_batches_per_epoch):
            if isinstance(audios, dict):
                batch_size = len(audios["waveform"])
            else:
                batch_size = len(audios)
            num_samples = batch_count * batch_size * args.world_size
            samples_per_epoch = dataloader.num_samples
            percent_complete = 100.0 * batch_count / num_batches_per_epoch

            # NOTE loss is coarsely sampled, just master node and per log update
            loss_m.update(total_loss.item(), batch_size)
            logit_scale_scalar_a = logit_scale_a.item()
            logit_scale_scalar_t = logit_scale_t.item()
            # The four branches below differ only in whether LR comes from one
            # optimizer or several, and whether the text logit scale is logged.
            if isinstance(optimizer, dict):
                if args.clap_mlploss:
                    logging.info(
                        f"Train Epoch: {epoch} [{num_samples:>{sample_digits}}/{samples_per_epoch} ({percent_complete:.0f}%)] "
                        f"Loss: {loss_m.val:#.5g} ({loss_m.avg:#.4g}) "
                        f"Data (t): {data_time_m.avg:.3f} "
                        f"Batch (t): {batch_time_m.avg:.3f} "
                        f"LR: {[o_.param_groups[0]['lr'] for o_ in optimizer.values()]} "
                        f"Logit Scale Audio: {logit_scale_scalar_a:.3f}"
                        f"Logit Scale Text: {logit_scale_scalar_t:.3f}"
                    )
                    log_data = {
                        "loss": loss_m.val,
                        "data_time": data_time_m.val,
                        "batch_time": batch_time_m.val,
                        "scale_audio": logit_scale_scalar_a,
                        "scale_text": logit_scale_scalar_t,
                        "lr": [o_.param_groups[0]["lr"] for o_ in optimizer.values()],
                    }
                else:
                    logging.info(
                        f"Train Epoch: {epoch} [{num_samples:>{sample_digits}}/{samples_per_epoch} ({percent_complete:.0f}%)] "
                        f"Loss: {loss_m.val:#.5g} ({loss_m.avg:#.4g}) "
                        f"Data (t): {data_time_m.avg:.3f} "
                        f"Batch (t): {batch_time_m.avg:.3f} "
                        f"LR: {[o_.param_groups[0]['lr'] for o_ in optimizer.values()]} "
                        f"Logit Scale Audio: {logit_scale_scalar_a:.3f}"
                    )
                    log_data = {
                        "loss": loss_m.val,
                        "data_time": data_time_m.val,
                        "batch_time": batch_time_m.val,
                        "scale_audio": logit_scale_scalar_a,
                        "lr": [o_.param_groups[0]["lr"] for o_ in optimizer.values()],
                    }
            else:
                if args.clap_mlploss:
                    logging.info(
                        f"Train Epoch: {epoch} [{num_samples:>{sample_digits}}/{samples_per_epoch} ({percent_complete:.0f}%)] "
                        f"Loss: {loss_m.val:#.5g} ({loss_m.avg:#.4g}) "
                        f"Data (t): {data_time_m.avg:.3f} "
                        f"Batch (t): {batch_time_m.avg:.3f} "
                        f"LR: {optimizer.param_groups[0]['lr']:5f} "
                        f"Logit Scale Audio: {logit_scale_scalar_a:.3f}"
                        f"Logit Scale Text: {logit_scale_scalar_t:.3f}"
                    )
                    # Save train loss / etc. Using non avg meter values as loggers have their own smoothing
                    log_data = {
                        "loss": loss_m.val,
                        "data_time": data_time_m.val,
                        "batch_time": batch_time_m.val,
                        "scale_audio": logit_scale_scalar_a,
                        "scale_text": logit_scale_scalar_t,
                        "lr": optimizer.param_groups[0]["lr"],
                    }
                else:
                    logging.info(
                        f"Train Epoch: {epoch} [{num_samples:>{sample_digits}}/{samples_per_epoch} ({percent_complete:.0f}%)] "
                        f"Loss: {loss_m.val:#.5g} ({loss_m.avg:#.4g}) "
                        f"Data (t): {data_time_m.avg:.3f} "
                        f"Batch (t): {batch_time_m.avg:.3f} "
                        f"LR: {optimizer.param_groups[0]['lr']:5f} "
                        f"Logit Scale Audio: {logit_scale_scalar_a:.3f}"
                    )
                    # Save train loss / etc. Using non avg meter values as loggers have their own smoothing
                    log_data = {
                        "loss": loss_m.val,
                        "data_time": data_time_m.val,
                        "batch_time": batch_time_m.val,
                        "scale_audio": logit_scale_scalar_a,
                        "lr": optimizer.param_groups[0]["lr"],
                    }

            for name, val in log_data.items():
                name = "train/" + name
                if tb_writer is not None:
                    tb_writer.add_scalar(name, val, step)
                if args.wandb:
                    assert wandb is not None, "Please install wandb."
                    wandb.log({name: val, "step": step})

            # resetting batch / data time meters per log window
            batch_time_m.reset()
            data_time_m.reset()
    # end for
def evaluate(model, data, epoch, args, tb_writer=None):
    """Run validation for `epoch` and return a flat dict of metrics.

    Two paths:
      * If args.val_dataset_names == ["Clotho", "audiocaps"], delegate to
        evaluate_clotho_audiocaps (5 captions per audio) and track the best
        epoch via select_top_metric_clotho_audiocaps.
      * Otherwise, accumulate audio/text features over data["val"] (per shard
        name and in aggregate under the key "all") and score them with
        get_metrics.

    Non-master processes return an empty dict unless args.parallel_eval is
    set, in which case every rank runs the forward pass but only the master
    aggregates features and computes metrics.
    """
    metrics = {}
    if not args.parallel_eval:
        # Single-process eval: every other rank bails out immediately.
        if not is_master(args):
            return metrics
    device = torch.device(args.device)
    model.eval()

    # CHANGE
    # zero_shot_metrics = zero_shot_eval(model, data, epoch, args)
    # metrics.update(zero_shot_metrics)
    if is_master(args):
        print("Evaluating...")
    autocast = torch.cuda.amp.autocast if args.precision == "amp" else suppress
    if args.val_dataset_names == ["Clotho", "audiocaps"]:
        # if only clotho and audiocaps are used, then we will use a different evaluation function.
        # This is because in the Clotho and audiocaps valid and test set, there are 5 text for 1 audio.
        if args.parallel_eval:
            # (yusong): just a hack here. Don't use parallel eval when evaluating only clotho and audiocaps.
            raise NotImplementedError(
                "Parallel evaluation not supported for eval only Clotho and audiocaps."
            )
        val_metrics_per_dataset = evaluate_clotho_audiocaps(
            model, data, epoch, args, autocast, device, tb_writer
        )
        for m in val_metrics_per_dataset.values():
            metrics.update(m)
        if "epoch" not in metrics.keys():
            metrics.update({"epoch": epoch})
        metrics = select_top_metric_clotho_audiocaps(
            metrics, val_metrics_per_dataset, args
        )
    elif "val" in data and (
        args.val_frequency
        and ((epoch % args.val_frequency) == 0 or epoch == args.epochs)
    ):
        dataloader = data["val"].dataloader
        num_samples = 0
        samples_per_val = dataloader.num_samples

        # FIXME this does not scale past small eval datasets
        # all_audio_features @ all_text_features will blow up memory and compute very quickly
        # eval_info maps dataset/shard name -> accumulated CPU feature lists;
        # the "all" entry aggregates every sample regardless of shard.
        eval_info = {}
        if args.clap_mlploss:
            eval_info["all"] = {
                "cumulative_loss": 0.0,
                "num_samples": 0,
                "all_audio_features": [],
                "all_text_features": [],
                "all_audio_features_mlp": [],
                "all_text_features_mlp": [],
            }  # cumulative_loss = 0.0
        else:
            eval_info["all"] = {
                "cumulative_loss": 0.0,
                "num_samples": 0,
                "all_audio_features": [],
                "all_text_features": [],
            }  # cumu
        # all_audio_features, all_text_features, all_audio_features_mlp, all_text_features_mlp = [], [], [], []
        with torch.no_grad():
            for i, batch in enumerate(dataloader):
                audios = batch  # contains mel_spec, wavform, and longer list
                texts = batch["text"]
                # audios = audios.to(device=device, non_blocking=True)

                # Dataset names are derived from the webdataset shard URLs
                # (second- and third-to-last path components joined by "-").
                all_names = list(
                    set(["-".join(b.split("/")[-3:-1]) for b in batch["__url__"]])
                )
                for name in all_names:
                    if name not in eval_info.keys():
                        if args.clap_mlploss:
                            eval_info[name] = {
                                "cumulative_loss": 0.0,
                                "num_samples": 0,
                                "all_audio_features": [],
                                "all_text_features": [],
                                "all_audio_features_mlp": [],
                                "all_text_features_mlp": [],
                            }
                        else:
                            eval_info[name] = {
                                "cumulative_loss": 0.0,
                                "num_samples": 0,
                                "all_audio_features": [],
                                "all_text_features": [],
                            }
                with autocast():
                    (
                        audio_features,
                        text_features,
                        audio_features_mlp,
                        text_features_mlp,
                        logit_scale_a,
                        logit_scale_t,
                    ) = model(audios, texts, device)

                    if args.parallel_eval:
                        # multi-GPU eval
                        if args.clap_mlploss:
                            (
                                audio_features,
                                text_features,
                                audio_features_mlp,
                                text_features_mlp,
                            ) = gather_features(
                                audio_features=audio_features,
                                text_features=text_features,
                                audio_features_mlp=audio_features_mlp,
                                text_features_mlp=text_features_mlp,
                                local_loss=False,
                                gather_with_grad=False,
                                rank=args.rank,
                                world_size=args.world_size,
                                use_horovod=args.horovod,
                                mlp_loss=args.clap_mlploss,
                            )
                        else:
                            (audio_features, text_features,) = gather_features(
                                audio_features=audio_features,
                                text_features=text_features,
                                local_loss=False,
                                gather_with_grad=False,
                                rank=args.rank,
                                world_size=args.world_size,
                                use_horovod=args.horovod,
                                mlp_loss=args.clap_mlploss,
                            )

                    if is_master(args):
                        num_samples += audio_features.shape[0]
                        # Accumulate features on CPU, both per-shard and in "all".
                        for n in [*all_names, "all"]:
                            if n == "all":
                                eval_info[n]["all_audio_features"].append(
                                    audio_features.cpu()
                                )
                                eval_info[n]["all_text_features"].append(
                                    text_features.cpu()
                                )
                                if args.clap_mlploss:
                                    eval_info[n]["all_audio_features_mlp"].append(
                                        audio_features_mlp.cpu()
                                    )
                                    eval_info[n]["all_text_features_mlp"].append(
                                        text_features_mlp.cpu()
                                    )
                            else:
                                # Select only the rows of this batch that came
                                # from shard `n`.
                                idx = np.where(
                                    np.array(
                                        [
                                            "-".join(b.split("/")[-3:-1])
                                            for b in batch["__url__"]
                                        ]
                                    )
                                    == n
                                )[0]
                                eval_info[n]["all_audio_features"].append(
                                    audio_features.cpu().index_select(
                                        0, torch.tensor(idx).long()
                                    )
                                )
                                eval_info[n]["all_text_features"].append(
                                    text_features.cpu().index_select(
                                        0, torch.tensor(idx).long()
                                    )
                                )
                                if args.clap_mlploss:
                                    eval_info[n]["all_audio_features_mlp"].append(
                                        audio_features_mlp.cpu().index_select(
                                            0, torch.tensor(idx).long()
                                        )
                                    )
                                    eval_info[n]["all_text_features_mlp"].append(
                                        text_features_mlp.cpu().index_select(
                                            0, torch.tensor(idx).long()
                                        )
                                    )
                    #  print(f'eval step {i}') # (yusong): for debug

                # cumulative_loss += total_loss * batch_size
                # num_samples += batch_size
                if is_master(args) and (i % 100) == 0:  # and i != 0:
                    logging.info(
                        f"Eval Epoch: {epoch} [{num_samples} / {samples_per_val}]"
                    )
            if is_master(args):
                val_metrics_per_dataset = {}
                # Score each shard (and "all") from the concatenated features.
                for n in eval_info.keys():
                    if args.clap_mlploss:
                        metrics_single_dataset = get_metrics(
                            audio_features=torch.cat(
                                eval_info[n]["all_audio_features"]
                            ),
                            text_features=torch.cat(eval_info[n]["all_text_features"]),
                            logit_scale_a=logit_scale_a.cpu(),
                            audio_features_mlp=torch.cat(
                                eval_info[n]["all_audio_features_mlp"]
                            ),
                            text_features_mlp=torch.cat(
                                eval_info[n]["all_text_features_mlp"]
                            ),
                            logit_scale_t=logit_scale_t.cpu(),
                            mlp_loss=args.clap_mlploss,
                        )
                    else:
                        metrics_single_dataset = get_metrics(
                            audio_features=torch.cat(
                                eval_info[n]["all_audio_features"]
                            ),
                            text_features=torch.cat(eval_info[n]["all_text_features"]),
                            logit_scale_a=logit_scale_a.cpu(),
                            mlp_loss=args.clap_mlploss,
                        )
                    val_metrics_per_dataset[n] = {
                        n + "/" + k: v for k, v in metrics_single_dataset.items()
                    }
                    metrics.update(val_metrics_per_dataset[n])
                    if "epoch" not in metrics.keys():
                        metrics.update({"epoch": epoch})
    if is_master(args):
        if not metrics:
            return metrics
        logging.info(
            f"Eval Epoch: {epoch} "
            + "\n".join(
                [
                    "\t".join([f"{k}: {round(v, 4):.4f}" for k, v in m.items()])
                    for m in val_metrics_per_dataset.values()
                ]
            )
        )
        if args.save_logs:
            for name, val in metrics.items():
                if tb_writer is not None:
                    tb_writer.add_scalar(f"val/{name}", val, epoch)

            with open(os.path.join(args.checkpoint_path, "results.jsonl"), "a+") as f:
                f.write(json.dumps(metrics))
                f.write("\n")

        if args.wandb:
            assert wandb is not None, "Please install wandb."
            for name, val in metrics.items():
                wandb.log({f"val/{name}": val, "epoch": epoch})

        return metrics
    else:
        return metrics
def get_metrics(
    audio_features,
    text_features,
    logit_scale_a,
    audio_features_mlp=None,
    text_features_mlp=None,
    logit_scale_t=None,
    mlp_loss=False,
):
    """Compute cross-modal retrieval metrics for paired audio/text features.

    Sample i of `audio_features` is assumed to match sample i of
    `text_features`. Returns a dict containing:
      * "cumulative_loss": symmetric cross-entropy over the similarity matrix
        (averaged over 4 terms in the mlp_loss case, 2 otherwise);
      * "num_samples";
      * per direction ("audio_to_text", "text_to_audio"): mean rank, median
        rank, R@{1,5,10}, and mAP@10.

    Args:
        audio_features: (N, D) tensor of audio embeddings.
        text_features: (N, D) tensor of text embeddings.
        logit_scale_a: scalar temperature for the audio->text-MLP similarity.
        audio_features_mlp / text_features_mlp: MLP-branch embeddings,
            required when mlp_loss is True.
        logit_scale_t: scalar temperature for the audio-MLP->text similarity,
            required when mlp_loss is True.
        mlp_loss: use the 2x2 CLAP loss over raw and MLP-projected features.
    """
    metrics = {}
    # Ground-truth pairing is the diagonal: sample i matches caption i.
    labels = torch.arange(audio_features.shape[0]).long()
    if mlp_loss:
        # Set up audio to text & text to audio similarity matrices, crossing
        # the raw features of one modality with the MLP branch of the other.
        a_logits_per_audio = (
            (logit_scale_a * audio_features @ text_features_mlp.t()).detach().cpu()
        )
        a_logits_per_text = a_logits_per_audio.t()
        t_logits_per_audio = (
            (logit_scale_t * audio_features_mlp @ text_features.t()).detach().cpu()
        )
        t_logits_per_text = t_logits_per_audio.t()

        # Change the loss from two terms into four terms with 2x2 combined CE loss
        total_loss = (
            F.cross_entropy(a_logits_per_audio, labels)
            + F.cross_entropy(a_logits_per_text, labels)
            + F.cross_entropy(t_logits_per_audio, labels)
            + F.cross_entropy(t_logits_per_text, labels)
        ) / 4

        # Rank on the average of the two similarity matrices.
        logits = {
            "audio_to_text": (a_logits_per_audio + t_logits_per_audio) / 2,
            "text_to_audio": (a_logits_per_text + t_logits_per_text) / 2,
        }
    else:
        logits_per_audio = (
            (logit_scale_a * audio_features @ text_features.t()).detach().cpu()
        )
        logits_per_text = logits_per_audio.t()

        total_loss = (
            F.cross_entropy(logits_per_audio, labels)
            + F.cross_entropy(logits_per_text, labels)
        ) / 2

        logits = {"audio_to_text": logits_per_audio, "text_to_audio": logits_per_text}

    metrics["cumulative_loss"] = total_loss.item()
    metrics["num_samples"] = audio_features.shape[0]
    ground_truth = torch.arange(len(text_features)).view(-1, 1)

    for name, logit in logits.items():
        ranking = torch.argsort(logit, descending=True)
        # 0-based rank of the true match in each row's sorted candidate list.
        preds = torch.where(ranking == ground_truth)[
            1
        ]  # (yusong) this line is slow because it uses single thread
        preds = preds.detach().cpu().numpy()
        metrics[f"{name}_mean_rank"] = preds.mean() + 1
        metrics[f"{name}_median_rank"] = np.floor(np.median(preds)) + 1
        for k in [1, 5, 10]:
            metrics[f"{name}_R@{k}"] = np.mean(preds < k)
        # map@10: reciprocal 1-based rank, zeroed when the match ranks >= 10.
        metrics[f"{name}_mAP@10"] = np.mean(np.where(preds < 10, 1 / (preds + 1), 0.0))

    return metrics
def evaluate_clotho_audiocaps(
    model, data, epoch, args, autocast, device, tb_writer=None
):
    """
    Adapted from https://github.com/XinhaoMei/audio-text_retrieval/blob/main/tools/utils.py.
    1. for text-to-audio retrieval, do 5 times and average the results
    2. for R@1, R@5, R@10 in audio-to-text retrieval, take the best rank among 5 text
    3. for map@10 in audio-to-text retrieval:
        3.1: sort the rank of 5 text
        3.2: exclude the rank >=10 (0-index)
        3.3: compute the map regarding the remaining ranks: np.mean(np.arange(1, len(ranks)+1) / ranks).
        (3.3) That is, take the top ranks of 5 text that is < 10, and assign the descending number as ground truth.
        (3.3) E.g.: the ground truth of first rank of the 5 text should be 1, the second rank should be 2, etc.
    """
    # TODO: (yusong) only support single GPU evaluation and only support non-mlp case for now.
    dataloader = data["val"].dataloader
    with torch.no_grad():
        # eval_info maps dataset/shard name -> accumulated CPU feature lists.
        eval_info = {}
        for i, batch in enumerate(dataloader):
            audios = batch  # contains mel_spec, wavform, and longer list

            # each item in the list has 5 texts
            if args.tmodel == "transformer":
                from open_clip import tokenize

                texts = [tokenize(t) for t in batch["full_text"]]
                texts = torch.cat(texts)
            else:
                from .data import tokenizer

                texts = [
                    tokenizer(t) for t in batch["full_text"]
                ]  # 5 texts for each audio
                texts = {
                    k: torch.cat([t[k] for t in texts]) for k in texts[0].keys()
                }  # 5 x batch

            # audios = audios.to(device=device, non_blocking=True)

            # Dataset names come from the webdataset shard URL path components.
            all_names = list(
                set(["-".join(b.split("/")[-3:-1]) for b in batch["__url__"]])
            )
            for name in all_names:
                if name not in eval_info.keys():
                    # we will not use mlp outputs even if args.clap_mlploss=True
                    eval_info[name] = {
                        "cumulative_loss": 0.0,
                        "num_samples": 0,
                        "all_audio_features": [],
                        "all_text_features": [],
                    }
            with autocast():
                # Encode each modality separately; the model presumably routes
                # a None argument to skip the other tower.
                audio_features = model(audios, None, device)
                text_features = model(None, texts, device)
                audio_features = F.normalize(audio_features, dim=-1)
                text_features = F.normalize(text_features, dim=-1)

                all_names = list(
                    set(["-".join(b.split("/")[-3:-1]) for b in batch["__url__"]])
                )
                for n in all_names:
                    idx = np.where(
                        np.array(
                            ["-".join(b.split("/")[-3:-1]) for b in batch["__url__"]]
                        )
                        == n
                    )[0]
                    eval_info[n]["all_audio_features"].append(
                        audio_features.cpu().index_select(0, torch.tensor(idx).long())
                    )
                    # (yusong) please double-check. This is for selecting 5 text features at once.
                    # because idx is a list of indices in size of num_samples,
                    # and text_features is a tensor of size (5*num_samples, dim)
                    # so we need to select 5 consecutive indices at once for a single index in idx.
                    eval_info[n]["all_text_features"].append(
                        text_features.cpu()
                        .reshape([-1, 5, text_features.shape[1]])
                        .index_select(0, torch.tensor(idx).long())
                        .reshape([-1, text_features.shape[1]])
                    )

        val_metrics_all = {}

        for n in eval_info.keys():
            # Calling the model with (None, None) presumably returns the two
            # logit scales — TODO confirm against the model's forward().
            logit_scale_a, logit_scale_t = model(None, None, device)
            logit_scale_a = logit_scale_a.cpu()

            audio_features = torch.cat(eval_info[n]["all_audio_features"], dim=0)
            text_features = torch.cat(eval_info[n]["all_text_features"], dim=0)

            logits_per_audio = (
                (logit_scale_a * audio_features @ text_features.t()).detach().cpu()
            )
            logits_per_text = logits_per_audio.t().detach().cpu()

            # logits_per_audio shape: [num_samples, num_samples*5]
            # logits_per_text shape: [num_samples*5, num_samples]
            logging.info(
                f"dataset {n}, logits_per_audio shape: {logits_per_audio.shape}, "
                f"logits_per_text shape: {logits_per_text.shape}"
            )

            metrics = {}
            num_samples = audio_features.shape[0]
            metrics[f"num_samples"] = num_samples

            # (yusong) the following code is very important, please double-check:
            # logits_per_audio.reshape(num_samples, num_samples, 5)[:, :, d]
            # logits_per_text.reshape(num_samples, 5, num_samples)[:, d, :]
            # Those two are retrieving one of the 5 text for each audio.
            labels = torch.arange(audio_features.shape[0]).long()
            audio_to_text_loss = [
                F.cross_entropy(
                    logits_per_audio.reshape(num_samples, num_samples, 5)[:, :, d],
                    labels,
                )
                for d in range(5)
            ]
            text_to_audio_loss = [
                F.cross_entropy(
                    logits_per_text.reshape(num_samples, 5, num_samples)[:, d, :],
                    labels,
                )
                for d in range(5)
            ]
            total_loss = (np.mean(audio_to_text_loss) + np.mean(text_to_audio_loss)) / 2

            metrics[f"cumulative_loss"] = total_loss.item()

            # text to audio: do 5 times
            pred_text = []
            for d in range(5):
                logit = logits_per_text.reshape(num_samples, 5, num_samples)[:, d, :]
                ground_truth = torch.arange(len(logit)).view(-1, 1)
                ranking = torch.argsort(
                    logit, descending=True
                )  # [num_samples, num_samples]
                preds = torch.where(ranking == ground_truth)[1]
                pred_text.append(preds.detach().cpu().numpy())
            pred_text_concat = np.concatenate(pred_text, axis=0)  # [5*num_samples]
            metrics[f"text_to_audio_mean_rank"] = pred_text_concat.mean() + 1
            metrics[f"text_to_audio_median_rank"] = (
                np.floor(np.median(pred_text_concat)) + 1
            )
            for k in [1, 5, 10]:
                metrics[f"text_to_audio_R@{k}"] = np.mean(pred_text_concat < k)
            # map@10
            metrics[f"text_to_audio_mAP@10"] = np.mean(
                np.where(pred_text_concat < 10, 1 / (pred_text_concat + 1), 0.0)
            )

            # audio to text: take the best result
            # for audio to text map 10, sort and assign descending ground truth.
            # see https://github.com/XinhaoMei/audio-text_retrieval/blob/main/tools/utils.py#L103
            # map@10
            map_all = []
            pred_audio_all = []
            for d in range(num_samples):
                # logits_per_audio: [num_samples, num_samples*5]
                logit_single = logits_per_audio[d, :]  # [5*num_samples]
                # Ground-truth index: [d*5, d*5+1, d*5+2, d*5+3, d*5+4]
                ranking = torch.argsort(
                    logit_single, descending=True
                )  # [5*num_samples]
                # ranking: the index of first match, second match, ...
                ground_truth = torch.arange(d * 5, d * 5 + 5)[None]
                all_pred = torch.where(
                    torch.stack([ranking] * 5) == ground_truth.view(-1, 1)
                )[1]
                min_pred = torch.min(all_pred)
                pred_audio_all.append(min_pred.detach().cpu().numpy())
                all_pred_filter = all_pred[all_pred < 10].detach().cpu().numpy()
                # /5 because we have 5 text, so it means for the text rank >=10 we count as 0.
                map_single = (
                    np.sum(
                        (np.arange(1, len(all_pred_filter) + 1) / (all_pred_filter + 1))
                    )
                    / 5
                )
                map_all.append(map_single)
            metrics[f"audio_to_text_mAP@10"] = np.mean(map_all)
            for k in [1, 5, 10]:
                metrics[f"audio_to_text_R@{k}"] = np.mean(np.array(pred_audio_all) < k)

            val_metrics_all[n] = {n + "/" + k: v for k, v in metrics.items()}
    return val_metrics_all
def calculate_selection_performance_clotho_audiocaps(val_metrics_per_dataset):
    """
    Calculate performance for Clotho+AudioCaps for model selection.

    The score is the mean over datasets of the average of the two
    directional mAP@10 metrics (audio->text and text->audio).
    """
    per_dataset_scores = [
        (
            dataset_metrics[f"{name}/audio_to_text_mAP@10"]
            + dataset_metrics[f"{name}/text_to_audio_mAP@10"]
        )
        / 2
        for name, dataset_metrics in val_metrics_per_dataset.items()
    ]
    return np.mean(per_dataset_scores)
def select_top_metric_clotho_audiocaps(metrics, val_metrics_per_dataset, args):
    """Track the best Clotho/AudioCaps validation epoch across calls.

    The selection score is the mAP@10 average computed by
    calculate_selection_performance_clotho_audiocaps. On the first call, or
    whenever the new score beats the stored best, the current per-dataset
    metrics are snapshotted under "<dataset>-top/<metric>" keys (plus
    "top_selection_performance" and "top-selection-epoch") and cached on
    `args`; otherwise the previously cached snapshot is re-reported.

    Args:
        metrics: flat metrics dict for the current epoch; must contain "epoch".
            Updated in place with the "top" snapshot and returned.
        val_metrics_per_dataset: dict dataset name -> {"<name>/<metric>": value}.
        args: experiment arguments; used as mutable cross-call storage
            (attributes `top_metric` and `top_selection_performance`).
    """
    selection_performance = calculate_selection_performance_clotho_audiocaps(
        val_metrics_per_dataset
    )
    # First call ever, or a new best score: snapshot the current metrics.
    # (Resolves the original TODO: both branches shared this code verbatim.)
    if (
        not hasattr(args, "top_selection_performance")
        or selection_performance > args.top_selection_performance
    ):
        metric_update = {}
        for n in val_metrics_per_dataset.keys():
            for k in val_metrics_per_dataset[n].keys():
                metric_update[
                    k.split("/")[0] + "-top" + "/" + k.split("/")[1]
                ] = val_metrics_per_dataset[n][k]
        metric_update["top_selection_performance"] = selection_performance
        metric_update["top-selection-epoch"] = metrics["epoch"]
        metrics.update(metric_update)
        args.top_metric = metric_update
        args.top_selection_performance = selection_performance
    else:
        # No improvement: re-report the previously saved best snapshot.
        metrics.update(args.top_metric)
    return metrics