Spaces:
Paused
Paused
from multiprocessing.sharedctypes import Value | |
import torch | |
import torch.distributed.nn | |
from torch import distributed as dist, nn as nn | |
from torch.nn import functional as F | |
import numpy as np | |
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score | |
try: | |
import horovod.torch as hvd | |
except ImportError: | |
hvd = None | |
def gather_features( | |
audio_features, | |
text_features, | |
audio_features_mlp=None, | |
text_features_mlp=None, | |
local_loss=False, | |
gather_with_grad=False, | |
rank=0, | |
world_size=1, | |
use_horovod=False, | |
mlp_loss=False, | |
): | |
if use_horovod: | |
assert hvd is not None, "Please install horovod" | |
if gather_with_grad: | |
all_audio_features = hvd.allgather(audio_features) | |
all_text_features = hvd.allgather(text_features) | |
if mlp_loss: | |
all_audio_features_mlp = hvd.allgather(audio_features_mlp) | |
all_text_features_mlp = hvd.allgather(text_features_mlp) | |
else: | |
with torch.no_grad(): | |
all_audio_features = hvd.allgather(audio_features) | |
all_text_features = hvd.allgather(text_features) | |
if mlp_loss: | |
all_audio_features_mlp = hvd.allgather(audio_features_mlp) | |
all_text_features_mlp = hvd.allgather(text_features_mlp) | |
if not local_loss: | |
# ensure grads for local rank when all_* features don't have a gradient | |
gathered_audio_features = list( | |
all_audio_features.chunk(world_size, dim=0) | |
) | |
gathered_text_features = list( | |
all_text_features.chunk(world_size, dim=0) | |
) | |
gathered_audio_features[rank] = audio_features | |
gathered_text_features[rank] = text_features | |
all_audio_features = torch.cat(gathered_audio_features, dim=0) | |
all_text_features = torch.cat(gathered_text_features, dim=0) | |
if mlp_loss: | |
gathered_audio_features_mlp = list( | |
all_audio_features_mlp.chunk(world_size, dim=0) | |
) | |
gathered_text_features_mlp = list( | |
all_text_features_mlp.chunk(world_size, dim=0) | |
) | |
gathered_audio_features_mlp[rank] = audio_features_mlp | |
gathered_text_features_mlp[rank] = text_features_mlp | |
all_audio_features_mlp = torch.cat( | |
gathered_audio_features_mlp, dim=0 | |
) | |
all_text_features_mlp = torch.cat(gathered_text_features_mlp, dim=0) | |
else: | |
# We gather tensors from all gpus | |
if gather_with_grad: | |
all_audio_features = torch.cat( | |
torch.distributed.nn.all_gather(audio_features), dim=0 | |
) | |
all_text_features = torch.cat( | |
torch.distributed.nn.all_gather(text_features), dim=0 | |
) | |
if mlp_loss: | |
all_audio_features_mlp = torch.cat( | |
torch.distributed.nn.all_gather(audio_features_mlp), dim=0 | |
) | |
all_text_features_mlp = torch.cat( | |
torch.distributed.nn.all_gather(text_features_mlp), dim=0 | |
) | |
else: | |
gathered_audio_features = [ | |
torch.zeros_like(audio_features) for _ in range(world_size) | |
] | |
gathered_text_features = [ | |
torch.zeros_like(text_features) for _ in range(world_size) | |
] | |
dist.all_gather(gathered_audio_features, audio_features) | |
dist.all_gather(gathered_text_features, text_features) | |
if mlp_loss: | |
gathered_audio_features_mlp = [ | |
torch.zeros_like(audio_features_mlp) for _ in range(world_size) | |
] | |
gathered_text_features_mlp = [ | |
torch.zeros_like(text_features_mlp) for _ in range(world_size) | |
] | |
dist.all_gather(gathered_audio_features_mlp, audio_features_mlp) | |
dist.all_gather(gathered_text_features_mlp, text_features_mlp) | |
if not local_loss: | |
# ensure grads for local rank when all_* features don't have a gradient | |
gathered_audio_features[rank] = audio_features | |
gathered_text_features[rank] = text_features | |
if mlp_loss: | |
gathered_audio_features_mlp[rank] = audio_features_mlp | |
gathered_text_features_mlp[rank] = text_features_mlp | |
all_audio_features = torch.cat(gathered_audio_features, dim=0) | |
all_text_features = torch.cat(gathered_text_features, dim=0) | |
if mlp_loss: | |
all_audio_features_mlp = torch.cat(gathered_audio_features_mlp, dim=0) | |
all_text_features_mlp = torch.cat(gathered_text_features_mlp, dim=0) | |
if mlp_loss: | |
return ( | |
all_audio_features, | |
all_text_features, | |
all_audio_features_mlp, | |
all_text_features_mlp, | |
) | |
else: | |
return all_audio_features, all_text_features | |
class ClipLoss(nn.Module): | |
def __init__( | |
self, | |
local_loss=False, | |
gather_with_grad=False, | |
cache_labels=False, | |
rank=0, | |
world_size=1, | |
use_horovod=False, | |
mlp_loss=False, | |
weight_loss_kappa=0, | |
): | |
super().__init__() | |
self.local_loss = local_loss | |
self.gather_with_grad = gather_with_grad | |
self.cache_labels = cache_labels | |
self.rank = rank | |
self.world_size = world_size | |
self.use_horovod = use_horovod | |
self.mlp_loss = mlp_loss | |
self.weighted_loss = bool(weight_loss_kappa != 0) | |
self.weight_loss_kappa = weight_loss_kappa | |
# cache state | |
self.prev_num_logits = 0 | |
self.labels = {} | |
def forward( | |
self, | |
audio_features, | |
text_features, | |
logit_scale_a, | |
logit_scale_t=None, | |
audio_features_mlp=None, | |
text_features_mlp=None, | |
): | |
device = audio_features.device | |
if self.mlp_loss: | |
if self.world_size > 1: | |
( | |
all_audio_features, | |
all_text_features, | |
all_audio_features_mlp, | |
all_text_features_mlp, | |
) = gather_features( | |
audio_features=audio_features, | |
text_features=text_features, | |
audio_features_mlp=audio_features_mlp, | |
text_features_mlp=text_features_mlp, | |
local_loss=self.local_loss, | |
gather_with_grad=self.gather_with_grad, | |
rank=self.rank, | |
world_size=self.world_size, | |
use_horovod=self.use_horovod, | |
mlp_loss=self.mlp_loss, | |
) | |
if self.local_loss: | |
a_logits_per_audio = ( | |
logit_scale_a * audio_features @ all_text_features_mlp.T | |
) | |
a_logits_per_text = ( | |
logit_scale_a * text_features_mlp @ all_audio_features.T | |
) | |
t_logits_per_audio = ( | |
logit_scale_t * audio_features_mlp @ all_text_features.T | |
) | |
t_logits_per_text = ( | |
logit_scale_t * text_features @ all_audio_features_mlp.T | |
) | |
else: | |
a_logits_per_audio = ( | |
logit_scale_a * all_audio_features @ all_text_features_mlp.T | |
) | |
a_logits_per_text = a_logits_per_audio.T | |
t_logits_per_audio = ( | |
logit_scale_t * all_audio_features_mlp @ all_text_features.T | |
) | |
t_logits_per_text = t_logits_per_audio.T | |
else: | |
a_logits_per_audio = ( | |
logit_scale_a * audio_features @ text_features_mlp.T | |
) | |
a_logits_per_text = logit_scale_a * text_features_mlp @ audio_features.T | |
t_logits_per_audio = ( | |
logit_scale_t * audio_features_mlp @ text_features.T | |
) | |
t_logits_per_text = logit_scale_t * text_features @ audio_features_mlp.T | |
# calculated ground-truth and cache if enabled | |
num_logits = a_logits_per_audio.shape[0] | |
if self.prev_num_logits != num_logits or device not in self.labels: | |
labels = torch.arange(num_logits, device=device, dtype=torch.long) | |
if self.world_size > 1 and self.local_loss: | |
labels = labels + num_logits * self.rank | |
if self.cache_labels: | |
self.labels[device] = labels | |
self.prev_num_logits = num_logits | |
else: | |
labels = self.labels[device] | |
if not self.weighted_loss: | |
total_loss = ( | |
F.cross_entropy(a_logits_per_audio, labels) | |
+ F.cross_entropy(a_logits_per_text, labels) | |
+ F.cross_entropy(t_logits_per_audio, labels) | |
+ F.cross_entropy(t_logits_per_text, labels) | |
) / 4 | |
else: | |
audio_weight = (audio_features @ audio_features.T).detach() | |
audio_weight = ( | |
torch.exp( | |
torch.sum(audio_weight, axis=1) | |
/ (self.weight_loss_kappa * len(audio_weight)) | |
) | |
).detach() | |
text_weight = (text_features @ text_features.T).detach() | |
text_weight = ( | |
torch.exp( | |
torch.sum(text_weight, axis=1) | |
/ (self.weight_loss_kappa * len(text_features)) | |
) | |
).detach() | |
total_loss = ( | |
F.cross_entropy(a_logits_per_audio, labels, weight=audio_weight) | |
+ F.cross_entropy(a_logits_per_text, labels, weight=audio_weight) | |
+ F.cross_entropy(t_logits_per_audio, labels, weight=text_weight) | |
+ F.cross_entropy(t_logits_per_text, labels, weight=text_weight) | |
) / 4 | |
else: | |
if self.world_size > 1: | |
all_audio_features, all_text_features = gather_features( | |
audio_features=audio_features, | |
text_features=text_features, | |
local_loss=self.local_loss, | |
gather_with_grad=self.gather_with_grad, | |
rank=self.rank, | |
world_size=self.world_size, | |
use_horovod=self.use_horovod, | |
mlp_loss=self.mlp_loss, | |
) | |
if self.local_loss: | |
logits_per_audio = ( | |
logit_scale_a * audio_features @ all_text_features.T | |
) | |
logits_per_text = ( | |
logit_scale_a * text_features @ all_audio_features.T | |
) | |
else: | |
logits_per_audio = ( | |
logit_scale_a * all_audio_features @ all_text_features.T | |
) | |
logits_per_text = logits_per_audio.T | |
else: | |
logits_per_audio = logit_scale_a * audio_features @ text_features.T | |
logits_per_text = logit_scale_a * text_features @ audio_features.T | |
# calculated ground-truth and cache if enabled | |
num_logits = logits_per_audio.shape[0] | |
if self.prev_num_logits != num_logits or device not in self.labels: | |
labels = torch.arange(num_logits, device=device, dtype=torch.long) | |
if self.world_size > 1 and self.local_loss: | |
labels = labels + num_logits * self.rank | |
if self.cache_labels: | |
self.labels[device] = labels | |
self.prev_num_logits = num_logits | |
else: | |
labels = self.labels[device] | |
if not self.weighted_loss: | |
total_loss = ( | |
F.cross_entropy(logits_per_audio, labels) | |
+ F.cross_entropy(logits_per_text, labels) | |
) / 2 | |
else: | |
audio_weight = (all_audio_features @ all_audio_features.T).detach() | |
audio_weight = ( | |
torch.exp( | |
torch.sum(audio_weight, axis=1) | |
/ (self.weight_loss_kappa * len(all_audio_features)) | |
) | |
).detach() | |
text_weight = (all_text_features @ all_text_features.T).detach() | |
text_weight = ( | |
torch.exp( | |
torch.sum(text_weight, axis=1) | |
/ (self.weight_loss_kappa * len(all_text_features)) | |
) | |
).detach() | |
total_loss = ( | |
F.cross_entropy(logits_per_audio, labels, weight=text_weight) | |
+ F.cross_entropy(logits_per_text, labels, weight=audio_weight) | |
) / 2 | |
return total_loss | |
def lp_gather_features(pred, target, world_size=1, use_horovod=False): | |
if use_horovod: | |
assert hvd is not None, "Please install horovod" | |
with torch.no_grad(): | |
all_preds = hvd.allgather(pred) | |
all_targets = hvd.allgath(target) | |
else: | |
gathered_preds = [torch.zeros_like(pred) for _ in range(world_size)] | |
gathered_targets = [torch.zeros_like(target) for _ in range(world_size)] | |
dist.all_gather(gathered_preds, pred) | |
dist.all_gather(gathered_targets, target) | |
all_preds = torch.cat(gathered_preds, dim=0) | |
all_targets = torch.cat(gathered_targets, dim=0) | |
return all_preds, all_targets | |
def get_map(pred, target): | |
pred = torch.sigmoid(pred).numpy() | |
target = target.numpy() | |
return np.mean(average_precision_score(target, pred, average=None)) | |
def get_acc(pred, target): | |
pred = torch.argmax(pred, 1).numpy() | |
target = torch.argmax(target, 1).numpy() | |
return accuracy_score(target, pred) | |
def get_mauc(pred, target): | |
pred = torch.sigmoid(pred).numpy() | |
target = target.numpy() | |
return np.mean(roc_auc_score(target, pred, average=None)) | |
class LPMetrics(object): | |
def __init__(self, metric_names=["map", "acc", "mauc"]): | |
self.metrics = [] | |
for name in metric_names: | |
self.metrics.append(self.get_metric(name)) | |
self.metric_names = metric_names | |
def get_metric(self, name): | |
if name == "map": | |
return get_map | |
elif name == "acc": | |
return get_acc | |
elif name == "mauc": | |
return get_mauc | |
else: | |
raise ValueError(f"the metric should be at least one of [map, acc, mauc]") | |
def evaluate_mertics(self, pred, target): | |
metric_dict = {} | |
for i in range(len(self.metric_names)): | |
metric_dict[self.metric_names[i]] = self.metrics[i](pred, target) | |
return metric_dict | |
def calc_celoss(pred, target): | |
target = torch.argmax(target, 1).long() | |
return nn.CrossEntropyLoss()(pred, target) | |
class LPLoss(nn.Module): | |
def __init__(self, loss_name): | |
super().__init__() | |
if loss_name == "bce": | |
self.loss_func = nn.BCEWithLogitsLoss() | |
elif loss_name == "ce": | |
self.loss_func = calc_celoss | |
elif loss_name == "mse": | |
self.loss_func = nn.MSELoss() | |
else: | |
raise ValueError(f"the loss func should be at least one of [bce, ce, mse]") | |
def forward(self, pred, target): | |
loss = self.loss_func(pred, target) | |
return loss | |