audioldm-text-to-audio-generation

Runtime error

audioldm-text-to-audio-generation / audioldm /clap /open_clip /loss.py

haoheliu

first commit and add large model

bdab1da over 1 year ago

16.1 kB

	from multiprocessing.sharedctypes import Value
	import torch
	import torch.distributed.nn
	from torch import distributed as dist, nn as nn
	from torch.nn import functional as F
	import numpy as np
	from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score

	try:
	import horovod.torch as hvd
	except ImportError:
	hvd = None


	def gather_features(
	    audio_features,
	    text_features,
	    audio_features_mlp=None,
	    text_features_mlp=None,
	    local_loss=False,
	    gather_with_grad=False,
	    rank=0,
	    world_size=1,
	    use_horovod=False,
	    mlp_loss=False,
	):
	    """Collect audio/text features from every worker and concatenate on dim 0.

	    Args:
	        audio_features: Local audio feature tensor.
	        text_features: Local text feature tensor.
	        audio_features_mlp: Local MLP audio features; only read if ``mlp_loss``.
	        text_features_mlp: Local MLP text features; only read if ``mlp_loss``.
	        local_loss: When False and gradients are not gathered, the slot
	            belonging to ``rank`` is replaced by the local tensor so the
	            local features keep their gradient.
	        gather_with_grad: Use the gradient-carrying gather
	            (``hvd.allgather`` outside ``no_grad`` or
	            ``torch.distributed.nn.all_gather``).
	        rank: Index of this worker inside the gathered result.
	        world_size: Number of workers taking part in the gather.
	        use_horovod: Gather through horovod instead of ``torch.distributed``.
	        mlp_loss: Also gather the two ``*_mlp`` tensors.

	    Returns:
	        ``(all_audio, all_text)`` or, when ``mlp_loss`` is set,
	        ``(all_audio, all_text, all_audio_mlp, all_text_mlp)``.
	    """
	    # The gather order is fixed (audio, text, audio_mlp, text_mlp) so that
	    # every worker issues the same sequence of collective calls.
	    local_tensors = [audio_features, text_features]
	    if mlp_loss:
	        local_tensors += [audio_features_mlp, text_features_mlp]

	    if use_horovod:
	        assert hvd is not None, "Please install horovod"
	        if gather_with_grad:
	            merged = [hvd.allgather(local) for local in local_tensors]
	        else:
	            with torch.no_grad():
	                merged = [hvd.allgather(local) for local in local_tensors]
	            if not local_loss:
	                # ensure grads for local rank when the gathered features
	                # don't have a gradient
	                for idx, local in enumerate(local_tensors):
	                    shards = list(merged[idx].chunk(world_size, dim=0))
	                    shards[rank] = local
	                    merged[idx] = torch.cat(shards, dim=0)
	    elif gather_with_grad:
	        # We gather tensors from all gpus, keeping the autograd graph.
	        merged = [
	            torch.cat(torch.distributed.nn.all_gather(local), dim=0)
	            for local in local_tensors
	        ]
	    else:
	        merged = []
	        for local in local_tensors:
	            shards = [torch.zeros_like(local) for _ in range(world_size)]
	            dist.all_gather(shards, local)
	            if not local_loss:
	                # ensure grads for local rank when the gathered features
	                # don't have a gradient
	                shards[rank] = local
	            merged.append(torch.cat(shards, dim=0))

	    return tuple(merged)


	class ClipLoss(nn.Module):
	    """Symmetric audio/text contrastive loss built on cross-entropy.

	    Two modes are supported:

	    * plain (``mlp_loss=False``): audio vs. text logits, averaged over
	      the two directions;
	    * MLP (``mlp_loss=True``): four logit matrices pairing raw features
	      with the ``*_mlp`` features of the other modality, averaged over
	      the four terms.

	    When ``weight_loss_kappa`` is non-zero each cross-entropy term gets
	    per-class weights derived from the feature self-similarity matrix.
	    """

	    def __init__(
	        self,
	        local_loss=False,
	        gather_with_grad=False,
	        cache_labels=False,
	        rank=0,
	        world_size=1,
	        use_horovod=False,
	        mlp_loss=False,
	        weight_loss_kappa=0,
	    ):
	        """Store the configuration.

	        Args:
	            local_loss: Compare local features against gathered ones
	                instead of building the full gathered-vs-gathered matrix.
	            gather_with_grad: Forwarded to ``gather_features``.
	            cache_labels: Reuse the target index tensor between calls.
	            rank: Index of this worker.
	            world_size: Number of workers; features are gathered only
	                when this is greater than 1.
	            use_horovod: Forwarded to ``gather_features``.
	            mlp_loss: Select the four-term MLP variant.
	            weight_loss_kappa: Non-zero enables the weighted loss and is
	                used as the divisor scale inside the weight exponent.
	        """
	        super().__init__()
	        self.local_loss = local_loss
	        self.gather_with_grad = gather_with_grad
	        self.cache_labels = cache_labels
	        self.rank = rank
	        self.world_size = world_size
	        self.use_horovod = use_horovod
	        self.mlp_loss = mlp_loss
	        self.weighted_loss = bool(weight_loss_kappa != 0)
	        self.weight_loss_kappa = weight_loss_kappa
	        # cache state
	        self.prev_num_logits = 0
	        self.labels = {}

	    def _ground_truth(self, num_logits, device):
	        """Return the target indices for ``num_logits`` rows, cached if enabled."""
	        if self.prev_num_logits != num_logits or device not in self.labels:
	            labels = torch.arange(num_logits, device=device, dtype=torch.long)
	            if self.world_size > 1 and self.local_loss:
	                # Local rows are matched against gathered columns, so the
	                # positives are shifted by this worker's offset.
	                labels = labels + num_logits * self.rank
	            if self.cache_labels:
	                self.labels[device] = labels
	                self.prev_num_logits = num_logits
	            return labels
	        return self.labels[device]

	    def _sample_weights(self, features):
	        """Return detached weights ``exp(row_sum(f @ f.T) / (kappa * len(f)))``."""
	        similarity = (features @ features.T).detach()
	        return torch.exp(
	            torch.sum(similarity, dim=1) / (self.weight_loss_kappa * len(features))
	        ).detach()

	    def forward(
	        self,
	        audio_features,
	        text_features,
	        logit_scale_a,
	        logit_scale_t=None,
	        audio_features_mlp=None,
	        text_features_mlp=None,
	    ):
	        """Compute the contrastive loss.

	        Args:
	            audio_features: Local audio features.
	            text_features: Local text features.
	            logit_scale_a: Scale applied to the audio-side logits (and to
	                all logits in plain mode).
	            logit_scale_t: Scale applied to the text-side logits; only
	                used when ``mlp_loss`` is set.
	            audio_features_mlp: MLP audio features; used when ``mlp_loss``.
	            text_features_mlp: MLP text features; used when ``mlp_loss``.

	        Returns:
	            The averaged cross-entropy loss tensor.
	        """
	        device = audio_features.device
	        distributed = self.world_size > 1

	        if self.mlp_loss:
	            if distributed:
	                (
	                    all_audio_features,
	                    all_text_features,
	                    all_audio_features_mlp,
	                    all_text_features_mlp,
	                ) = gather_features(
	                    audio_features=audio_features,
	                    text_features=text_features,
	                    audio_features_mlp=audio_features_mlp,
	                    text_features_mlp=text_features_mlp,
	                    local_loss=self.local_loss,
	                    gather_with_grad=self.gather_with_grad,
	                    rank=self.rank,
	                    world_size=self.world_size,
	                    use_horovod=self.use_horovod,
	                    mlp_loss=self.mlp_loss,
	                )
	                if self.local_loss:
	                    a_logits_per_audio = (
	                        logit_scale_a * audio_features @ all_text_features_mlp.T
	                    )
	                    a_logits_per_text = (
	                        logit_scale_a * text_features_mlp @ all_audio_features.T
	                    )
	                    t_logits_per_audio = (
	                        logit_scale_t * audio_features_mlp @ all_text_features.T
	                    )
	                    t_logits_per_text = (
	                        logit_scale_t * text_features @ all_audio_features_mlp.T
	                    )
	                else:
	                    a_logits_per_audio = (
	                        logit_scale_a * all_audio_features @ all_text_features_mlp.T
	                    )
	                    a_logits_per_text = a_logits_per_audio.T
	                    t_logits_per_audio = (
	                        logit_scale_t * all_audio_features_mlp @ all_text_features.T
	                    )
	                    t_logits_per_text = t_logits_per_audio.T
	            else:
	                # Single process: the "gathered" batch is the local batch.
	                all_audio_features = audio_features
	                all_text_features = text_features
	                a_logits_per_audio = (
	                    logit_scale_a * audio_features @ text_features_mlp.T
	                )
	                a_logits_per_text = (
	                    logit_scale_a * text_features_mlp @ audio_features.T
	                )
	                t_logits_per_audio = (
	                    logit_scale_t * audio_features_mlp @ text_features.T
	                )
	                t_logits_per_text = (
	                    logit_scale_t * text_features @ audio_features_mlp.T
	                )

	            # calculated ground-truth and cache if enabled
	            labels = self._ground_truth(a_logits_per_audio.shape[0], device)

	            if not self.weighted_loss:
	                return (
	                    F.cross_entropy(a_logits_per_audio, labels)
	                    + F.cross_entropy(a_logits_per_text, labels)
	                    + F.cross_entropy(t_logits_per_audio, labels)
	                    + F.cross_entropy(t_logits_per_text, labels)
	                ) / 4

	            # Weights must have one entry per class (logit column), so they
	            # are built from the gathered features; the original used the
	            # local features, which mismatched the logits when world_size > 1.
	            audio_weight = self._sample_weights(all_audio_features)
	            text_weight = self._sample_weights(all_text_features)
	            return (
	                F.cross_entropy(a_logits_per_audio, labels, weight=audio_weight)
	                + F.cross_entropy(a_logits_per_text, labels, weight=audio_weight)
	                + F.cross_entropy(t_logits_per_audio, labels, weight=text_weight)
	                + F.cross_entropy(t_logits_per_text, labels, weight=text_weight)
	            ) / 4

	        if distributed:
	            all_audio_features, all_text_features = gather_features(
	                audio_features=audio_features,
	                text_features=text_features,
	                local_loss=self.local_loss,
	                gather_with_grad=self.gather_with_grad,
	                rank=self.rank,
	                world_size=self.world_size,
	                use_horovod=self.use_horovod,
	                mlp_loss=self.mlp_loss,
	            )
	            if self.local_loss:
	                logits_per_audio = (
	                    logit_scale_a * audio_features @ all_text_features.T
	                )
	                logits_per_text = (
	                    logit_scale_a * text_features @ all_audio_features.T
	                )
	            else:
	                logits_per_audio = (
	                    logit_scale_a * all_audio_features @ all_text_features.T
	                )
	                logits_per_text = logits_per_audio.T
	        else:
	            # Single process: define the gathered names so the weighted
	            # branch below no longer hits an unbound local variable.
	            all_audio_features = audio_features
	            all_text_features = text_features
	            logits_per_audio = logit_scale_a * audio_features @ text_features.T
	            logits_per_text = logit_scale_a * text_features @ audio_features.T

	        # calculated ground-truth and cache if enabled
	        labels = self._ground_truth(logits_per_audio.shape[0], device)

	        if not self.weighted_loss:
	            return (
	                F.cross_entropy(logits_per_audio, labels)
	                + F.cross_entropy(logits_per_text, labels)
	            ) / 2

	        audio_weight = self._sample_weights(all_audio_features)
	        text_weight = self._sample_weights(all_text_features)
	        return (
	            F.cross_entropy(logits_per_audio, labels, weight=text_weight)
	            + F.cross_entropy(logits_per_text, labels, weight=audio_weight)
	        ) / 2


	def lp_gather_features(pred, target, world_size=1, use_horovod=False):
	    """Gather predictions and targets from all workers, concatenated on dim 0.

	    Args:
	        pred: Local prediction tensor.
	        target: Local target tensor.
	        world_size: Number of workers (used to size the receive buffers on
	            the ``torch.distributed`` path).
	        use_horovod: Gather through horovod instead of ``torch.distributed``.

	    Returns:
	        Tuple ``(all_preds, all_targets)``.
	    """
	    if use_horovod:
	        assert hvd is not None, "Please install horovod"
	        with torch.no_grad():
	            all_preds = hvd.allgather(pred)
	            # Fixed typo: the original called the non-existent ``hvd.allgath``.
	            all_targets = hvd.allgather(target)
	    else:
	        gathered_preds = [torch.zeros_like(pred) for _ in range(world_size)]
	        gathered_targets = [torch.zeros_like(target) for _ in range(world_size)]

	        dist.all_gather(gathered_preds, pred)
	        dist.all_gather(gathered_targets, target)
	        all_preds = torch.cat(gathered_preds, dim=0)
	        all_targets = torch.cat(gathered_targets, dim=0)

	    return all_preds, all_targets


	def get_map(pred, target):
	    """Return the mean average precision of sigmoid-activated predictions.

	    Args:
	        pred: Tensor of raw logits; ``torch.sigmoid`` is applied before scoring.
	        target: Tensor of ground-truth labels passed to
	            ``average_precision_score``.

	    Returns:
	        ``np.mean`` over the scores returned by
	        ``average_precision_score(..., average=None)``.
	    """
	    # ``Tensor.numpy()`` needs a CPU tensor that does not require grad, so
	    # detach and move to host first; this is a no-op for plain CPU tensors.
	    scores = torch.sigmoid(pred).detach().cpu().numpy()
	    truth = target.detach().cpu().numpy()
	    return np.mean(average_precision_score(truth, scores, average=None))


	def get_acc(pred, target):
	    """Return the accuracy of the arg-max prediction against the arg-max target.

	    Args:
	        pred: Tensor of scores; the predicted class is the arg-max over dim 1.
	        target: Tensor whose arg-max over dim 1 is the true class
	            (presumably one-hot; verify against caller).

	    Returns:
	        The value of ``accuracy_score(true_classes, predicted_classes)``.
	    """
	    # ``Tensor.numpy()`` needs a CPU tensor that does not require grad, so
	    # detach and move to host first; this is a no-op for plain CPU tensors.
	    predicted = torch.argmax(pred, 1).detach().cpu().numpy()
	    truth = torch.argmax(target, 1).detach().cpu().numpy()
	    return accuracy_score(truth, predicted)


	def get_mauc(pred, target):
	    """Return the mean ROC-AUC of sigmoid-activated predictions.

	    Args:
	        pred: Tensor of raw logits; ``torch.sigmoid`` is applied before scoring.
	        target: Tensor of ground-truth labels passed to ``roc_auc_score``.

	    Returns:
	        ``np.mean`` over the scores returned by
	        ``roc_auc_score(..., average=None)``.
	    """
	    # ``Tensor.numpy()`` needs a CPU tensor that does not require grad, so
	    # detach and move to host first; this is a no-op for plain CPU tensors.
	    scores = torch.sigmoid(pred).detach().cpu().numpy()
	    truth = target.detach().cpu().numpy()
	    return np.mean(roc_auc_score(truth, scores, average=None))


	class LPMetrics(object):
	    """Evaluate a configurable set of named metrics on (pred, target) pairs."""

	    def __init__(self, metric_names=("map", "acc", "mauc")):
	        """Resolve each metric name to its scoring function.

	        Args:
	            metric_names: Iterable of names, each one of ``"map"``,
	                ``"acc"`` or ``"mauc"``.

	        Raises:
	            ValueError: If a name is not one of the supported metrics.
	        """
	        # Copy into a fresh list: the default is immutable and a
	        # caller-owned list is never aliased by the instance.
	        self.metric_names = list(metric_names)
	        self.metrics = [self.get_metric(name) for name in self.metric_names]

	    def get_metric(self, name):
	        """Return the scoring function registered under ``name``.

	        Raises:
	            ValueError: If ``name`` is not ``"map"``, ``"acc"`` or ``"mauc"``.
	        """
	        if name == "map":
	            return get_map
	        elif name == "acc":
	            return get_acc
	        elif name == "mauc":
	            return get_mauc
	        else:
	            raise ValueError("the metric should be at least one of [map, acc, mauc]")

	    def evaluate_mertics(self, pred, target):
	        """Return ``{metric_name: metric(pred, target)}`` for every metric.

	        The misspelled name is kept because callers may depend on it.
	        """
	        return {
	            name: metric(pred, target)
	            for name, metric in zip(self.metric_names, self.metrics)
	        }

	    def evaluate_metrics(self, pred, target):
	        """Correctly spelled alias of ``evaluate_mertics``."""
	        return self.evaluate_mertics(pred, target)


	def calc_celoss(pred, target):
	    """Cross-entropy between ``pred`` and the arg-max class of ``target``.

	    Args:
	        pred: Tensor of class scores.
	        target: Tensor whose arg-max over dim 1 gives the class index
	            (presumably one-hot; verify against caller).

	    Returns:
	        The mean-reduced cross-entropy loss tensor.
	    """
	    class_index = torch.argmax(target, 1).long()
	    criterion = nn.CrossEntropyLoss()
	    return criterion(pred, class_index)


	class LPLoss(nn.Module):
	    """Loss wrapper that selects its criterion by name.

	    Supported names: ``"bce"`` (``nn.BCEWithLogitsLoss``), ``"ce"``
	    (``calc_celoss``) and ``"mse"`` (``nn.MSELoss``).
	    """

	    def __init__(self, loss_name):
	        """Pick the criterion for ``loss_name``.

	        Raises:
	            ValueError: If ``loss_name`` is not one of the supported names.
	        """
	        super().__init__()
	        if loss_name == "mse":
	            criterion = nn.MSELoss()
	        elif loss_name == "ce":
	            criterion = calc_celoss
	        elif loss_name == "bce":
	            criterion = nn.BCEWithLogitsLoss()
	        else:
	            raise ValueError(f"the loss func should be at least one of [bce, ce, mse]")
	        self.loss_func = criterion

	    def forward(self, pred, target):
	        """Apply the selected criterion to ``(pred, target)``."""
	        return self.loss_func(pred, target)