audioldm-text-to-audio-generation-copy

Paused

File size: 16,143 Bytes

ac1cc14

from multiprocessing.sharedctypes import Value
import torch
import torch.distributed.nn
from torch import distributed as dist, nn as nn
from torch.nn import functional as F
import numpy as np
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score

try:
    import horovod.torch as hvd
except ImportError:
    hvd = None


def gather_features(
    audio_features,
    text_features,
    audio_features_mlp=None,
    text_features_mlp=None,
    local_loss=False,
    gather_with_grad=False,
    rank=0,
    world_size=1,
    use_horovod=False,
    mlp_loss=False,
):
    if use_horovod:
        assert hvd is not None, "Please install horovod"
        if gather_with_grad:
            all_audio_features = hvd.allgather(audio_features)
            all_text_features = hvd.allgather(text_features)
            if mlp_loss:
                all_audio_features_mlp = hvd.allgather(audio_features_mlp)
                all_text_features_mlp = hvd.allgather(text_features_mlp)
        else:
            with torch.no_grad():
                all_audio_features = hvd.allgather(audio_features)
                all_text_features = hvd.allgather(text_features)
                if mlp_loss:
                    all_audio_features_mlp = hvd.allgather(audio_features_mlp)
                    all_text_features_mlp = hvd.allgather(text_features_mlp)
            if not local_loss:
                # ensure grads for local rank when all_* features don't have a gradient
                gathered_audio_features = list(
                    all_audio_features.chunk(world_size, dim=0)
                )
                gathered_text_features = list(
                    all_text_features.chunk(world_size, dim=0)
                )
                gathered_audio_features[rank] = audio_features
                gathered_text_features[rank] = text_features
                all_audio_features = torch.cat(gathered_audio_features, dim=0)
                all_text_features = torch.cat(gathered_text_features, dim=0)
                if mlp_loss:
                    gathered_audio_features_mlp = list(
                        all_audio_features_mlp.chunk(world_size, dim=0)
                    )
                    gathered_text_features_mlp = list(
                        all_text_features_mlp.chunk(world_size, dim=0)
                    )
                    gathered_audio_features_mlp[rank] = audio_features_mlp
                    gathered_text_features_mlp[rank] = text_features_mlp
                    all_audio_features_mlp = torch.cat(
                        gathered_audio_features_mlp, dim=0
                    )
                    all_text_features_mlp = torch.cat(gathered_text_features_mlp, dim=0)
    else:
        # We gather tensors from all gpus
        if gather_with_grad:
            all_audio_features = torch.cat(
                torch.distributed.nn.all_gather(audio_features), dim=0
            )
            all_text_features = torch.cat(
                torch.distributed.nn.all_gather(text_features), dim=0
            )
            if mlp_loss:
                all_audio_features_mlp = torch.cat(
                    torch.distributed.nn.all_gather(audio_features_mlp), dim=0
                )
                all_text_features_mlp = torch.cat(
                    torch.distributed.nn.all_gather(text_features_mlp), dim=0
                )
        else:
            gathered_audio_features = [
                torch.zeros_like(audio_features) for _ in range(world_size)
            ]
            gathered_text_features = [
                torch.zeros_like(text_features) for _ in range(world_size)
            ]
            dist.all_gather(gathered_audio_features, audio_features)
            dist.all_gather(gathered_text_features, text_features)
            if mlp_loss:
                gathered_audio_features_mlp = [
                    torch.zeros_like(audio_features_mlp) for _ in range(world_size)
                ]
                gathered_text_features_mlp = [
                    torch.zeros_like(text_features_mlp) for _ in range(world_size)
                ]
                dist.all_gather(gathered_audio_features_mlp, audio_features_mlp)
                dist.all_gather(gathered_text_features_mlp, text_features_mlp)
            if not local_loss:
                # ensure grads for local rank when all_* features don't have a gradient
                gathered_audio_features[rank] = audio_features
                gathered_text_features[rank] = text_features
                if mlp_loss:
                    gathered_audio_features_mlp[rank] = audio_features_mlp
                    gathered_text_features_mlp[rank] = text_features_mlp

            all_audio_features = torch.cat(gathered_audio_features, dim=0)
            all_text_features = torch.cat(gathered_text_features, dim=0)
            if mlp_loss:
                all_audio_features_mlp = torch.cat(gathered_audio_features_mlp, dim=0)
                all_text_features_mlp = torch.cat(gathered_text_features_mlp, dim=0)
    if mlp_loss:
        return (
            all_audio_features,
            all_text_features,
            all_audio_features_mlp,
            all_text_features_mlp,
        )
    else:
        return all_audio_features, all_text_features


class ClipLoss(nn.Module):
    def __init__(
        self,
        local_loss=False,
        gather_with_grad=False,
        cache_labels=False,
        rank=0,
        world_size=1,
        use_horovod=False,
        mlp_loss=False,
        weight_loss_kappa=0,
    ):
        super().__init__()
        self.local_loss = local_loss
        self.gather_with_grad = gather_with_grad
        self.cache_labels = cache_labels
        self.rank = rank
        self.world_size = world_size
        self.use_horovod = use_horovod
        self.mlp_loss = mlp_loss
        self.weighted_loss = bool(weight_loss_kappa != 0)
        self.weight_loss_kappa = weight_loss_kappa
        # cache state
        self.prev_num_logits = 0
        self.labels = {}

    def forward(
        self,
        audio_features,
        text_features,
        logit_scale_a,
        logit_scale_t=None,
        audio_features_mlp=None,
        text_features_mlp=None,
    ):
        device = audio_features.device
        if self.mlp_loss:
            if self.world_size > 1:
                (
                    all_audio_features,
                    all_text_features,
                    all_audio_features_mlp,
                    all_text_features_mlp,
                ) = gather_features(
                    audio_features=audio_features,
                    text_features=text_features,
                    audio_features_mlp=audio_features_mlp,
                    text_features_mlp=text_features_mlp,
                    local_loss=self.local_loss,
                    gather_with_grad=self.gather_with_grad,
                    rank=self.rank,
                    world_size=self.world_size,
                    use_horovod=self.use_horovod,
                    mlp_loss=self.mlp_loss,
                )
                if self.local_loss:
                    a_logits_per_audio = (
                        logit_scale_a * audio_features @ all_text_features_mlp.T
                    )
                    a_logits_per_text = (
                        logit_scale_a * text_features_mlp @ all_audio_features.T
                    )
                    t_logits_per_audio = (
                        logit_scale_t * audio_features_mlp @ all_text_features.T
                    )
                    t_logits_per_text = (
                        logit_scale_t * text_features @ all_audio_features_mlp.T
                    )
                else:
                    a_logits_per_audio = (
                        logit_scale_a * all_audio_features @ all_text_features_mlp.T
                    )
                    a_logits_per_text = a_logits_per_audio.T
                    t_logits_per_audio = (
                        logit_scale_t * all_audio_features_mlp @ all_text_features.T
                    )
                    t_logits_per_text = t_logits_per_audio.T
            else:
                a_logits_per_audio = (
                    logit_scale_a * audio_features @ text_features_mlp.T
                )
                a_logits_per_text = logit_scale_a * text_features_mlp @ audio_features.T
                t_logits_per_audio = (
                    logit_scale_t * audio_features_mlp @ text_features.T
                )
                t_logits_per_text = logit_scale_t * text_features @ audio_features_mlp.T

            # calculated ground-truth and cache if enabled
            num_logits = a_logits_per_audio.shape[0]
            if self.prev_num_logits != num_logits or device not in self.labels:
                labels = torch.arange(num_logits, device=device, dtype=torch.long)
                if self.world_size > 1 and self.local_loss:
                    labels = labels + num_logits * self.rank
                if self.cache_labels:
                    self.labels[device] = labels
                    self.prev_num_logits = num_logits
            else:
                labels = self.labels[device]

            if not self.weighted_loss:
                total_loss = (
                    F.cross_entropy(a_logits_per_audio, labels)
                    + F.cross_entropy(a_logits_per_text, labels)
                    + F.cross_entropy(t_logits_per_audio, labels)
                    + F.cross_entropy(t_logits_per_text, labels)
                ) / 4
            else:
                audio_weight = (audio_features @ audio_features.T).detach()
                audio_weight = (
                    torch.exp(
                        torch.sum(audio_weight, axis=1)
                        / (self.weight_loss_kappa * len(audio_weight))
                    )
                ).detach()
                text_weight = (text_features @ text_features.T).detach()
                text_weight = (
                    torch.exp(
                        torch.sum(text_weight, axis=1)
                        / (self.weight_loss_kappa * len(text_features))
                    )
                ).detach()
                total_loss = (
                    F.cross_entropy(a_logits_per_audio, labels, weight=audio_weight)
                    + F.cross_entropy(a_logits_per_text, labels, weight=audio_weight)
                    + F.cross_entropy(t_logits_per_audio, labels, weight=text_weight)
                    + F.cross_entropy(t_logits_per_text, labels, weight=text_weight)
                ) / 4
        else:
            if self.world_size > 1:
                all_audio_features, all_text_features = gather_features(
                    audio_features=audio_features,
                    text_features=text_features,
                    local_loss=self.local_loss,
                    gather_with_grad=self.gather_with_grad,
                    rank=self.rank,
                    world_size=self.world_size,
                    use_horovod=self.use_horovod,
                    mlp_loss=self.mlp_loss,
                )

                if self.local_loss:
                    logits_per_audio = (
                        logit_scale_a * audio_features @ all_text_features.T
                    )
                    logits_per_text = (
                        logit_scale_a * text_features @ all_audio_features.T
                    )
                else:
                    logits_per_audio = (
                        logit_scale_a * all_audio_features @ all_text_features.T
                    )
                    logits_per_text = logits_per_audio.T
            else:
                logits_per_audio = logit_scale_a * audio_features @ text_features.T
                logits_per_text = logit_scale_a * text_features @ audio_features.T

            # calculated ground-truth and cache if enabled
            num_logits = logits_per_audio.shape[0]
            if self.prev_num_logits != num_logits or device not in self.labels:
                labels = torch.arange(num_logits, device=device, dtype=torch.long)
                if self.world_size > 1 and self.local_loss:
                    labels = labels + num_logits * self.rank
                if self.cache_labels:
                    self.labels[device] = labels
                    self.prev_num_logits = num_logits
            else:
                labels = self.labels[device]
            if not self.weighted_loss:
                total_loss = (
                    F.cross_entropy(logits_per_audio, labels)
                    + F.cross_entropy(logits_per_text, labels)
                ) / 2
            else:
                audio_weight = (all_audio_features @ all_audio_features.T).detach()
                audio_weight = (
                    torch.exp(
                        torch.sum(audio_weight, axis=1)
                        / (self.weight_loss_kappa * len(all_audio_features))
                    )
                ).detach()
                text_weight = (all_text_features @ all_text_features.T).detach()
                text_weight = (
                    torch.exp(
                        torch.sum(text_weight, axis=1)
                        / (self.weight_loss_kappa * len(all_text_features))
                    )
                ).detach()
                total_loss = (
                    F.cross_entropy(logits_per_audio, labels, weight=text_weight)
                    + F.cross_entropy(logits_per_text, labels, weight=audio_weight)
                ) / 2
        return total_loss


def lp_gather_features(pred, target, world_size=1, use_horovod=False):
    if use_horovod:
        assert hvd is not None, "Please install horovod"
        with torch.no_grad():
            all_preds = hvd.allgather(pred)
            all_targets = hvd.allgath(target)
    else:
        gathered_preds = [torch.zeros_like(pred) for _ in range(world_size)]
        gathered_targets = [torch.zeros_like(target) for _ in range(world_size)]

        dist.all_gather(gathered_preds, pred)
        dist.all_gather(gathered_targets, target)
        all_preds = torch.cat(gathered_preds, dim=0)
        all_targets = torch.cat(gathered_targets, dim=0)

    return all_preds, all_targets


def get_map(pred, target):
    pred = torch.sigmoid(pred).numpy()
    target = target.numpy()
    return np.mean(average_precision_score(target, pred, average=None))


def get_acc(pred, target):
    pred = torch.argmax(pred, 1).numpy()
    target = torch.argmax(target, 1).numpy()
    return accuracy_score(target, pred)


def get_mauc(pred, target):
    pred = torch.sigmoid(pred).numpy()
    target = target.numpy()
    return np.mean(roc_auc_score(target, pred, average=None))


class LPMetrics(object):
    def __init__(self, metric_names=["map", "acc", "mauc"]):
        self.metrics = []
        for name in metric_names:
            self.metrics.append(self.get_metric(name))
        self.metric_names = metric_names

    def get_metric(self, name):
        if name == "map":
            return get_map
        elif name == "acc":
            return get_acc
        elif name == "mauc":
            return get_mauc
        else:
            raise ValueError(f"the metric should be at least one of [map, acc, mauc]")

    def evaluate_mertics(self, pred, target):
        metric_dict = {}
        for i in range(len(self.metric_names)):
            metric_dict[self.metric_names[i]] = self.metrics[i](pred, target)
        return metric_dict


def calc_celoss(pred, target):
    target = torch.argmax(target, 1).long()
    return nn.CrossEntropyLoss()(pred, target)


class LPLoss(nn.Module):
    def __init__(self, loss_name):
        super().__init__()
        if loss_name == "bce":
            self.loss_func = nn.BCEWithLogitsLoss()
        elif loss_name == "ce":
            self.loss_func = calc_celoss
        elif loss_name == "mse":
            self.loss_func = nn.MSELoss()
        else:
            raise ValueError(f"the loss func should be at least one of [bce, ce, mse]")

    def forward(self, pred, target):
        loss = self.loss_func(pred, target)
        return loss