Spaces:

z-uo
/

HTS-Audio-Transformer

Build error

App Files Files Community

artelabsuper commited on Jul 19, 2022

Commit

0d4ce65

•

1 Parent(s): 4e2283a

add utils

Browse files

Files changed (1) hide show

utils.py +226 -0

utils.py ADDED Viewed

	@@ -0,0 +1,226 @@

+# Ke Chen
+# knutchen@ucsd.edu
+# HTS-AT: A HIERARCHICAL TOKEN-SEMANTIC AUDIO TRANSFORMER FOR SOUND CLASSIFICATION AND DETECTION
+# Some Useful Common Methods
+import numpy as np
+import torch
+import torch.nn as nn
+from torch import Tensor
+from typing import Optional
+import logging
+import os
+import sys
+import h5py
+import csv
+import time
+import json
+import museval
+import librosa
+from datetime import datetime
+from tqdm import tqdm
+from scipy import stats
+import torch.nn as nn
+import torch.nn.functional as F
+# import from https://github.com/Alibaba-MIIL/ASL/blob/main/src/loss_functions/losses.py
+class AsymmetricLoss(nn.Module):
+    def __init__(self, gamma_neg=4, gamma_pos=1, clip=0.05, eps=1e-8, disable_torch_grad_focal_loss=True):
+        super(AsymmetricLoss, self).__init__()
+        self.gamma_neg = gamma_neg
+        self.gamma_pos = gamma_pos
+        self.clip = clip
+        self.disable_torch_grad_focal_loss = disable_torch_grad_focal_loss
+        self.eps = eps
+    def forward(self, x, y):
+        """"
+        Parameters
+        ----------
+        x: input logits
+        y: targets (multi-label binarized vector)
+        """
+        # Calculating Probabilities
+        # x_sigmoid = torch.sigmoid(x)
+        x_sigmoid = x # without sigmoid since it has been computed
+        xs_pos = x_sigmoid
+        xs_neg = 1 - x_sigmoid
+        # Asymmetric Clipping
+        if self.clip is not None and self.clip > 0:
+            xs_neg = (xs_neg + self.clip).clamp(max=1)
+        # Basic CE calculation
+        los_pos = y * torch.log(xs_pos.clamp(min=self.eps))
+        los_neg = (1 - y) * torch.log(xs_neg.clamp(min=self.eps))
+        loss = los_pos + los_neg
+        # Asymmetric Focusing
+        if self.gamma_neg > 0 or self.gamma_pos > 0:
+            if self.disable_torch_grad_focal_loss:
+                torch.set_grad_enabled(False)
+            pt0 = xs_pos * y
+            pt1 = xs_neg * (1 - y)  # pt = p if t > 0 else 1-p
+            pt = pt0 + pt1
+            one_sided_gamma = self.gamma_pos * y + self.gamma_neg * (1 - y)
+            one_sided_w = torch.pow(1 - pt, one_sided_gamma)
+            if self.disable_torch_grad_focal_loss:
+                torch.set_grad_enabled(True)
+            loss *= one_sided_w
+        return -loss.mean()
+def get_mix_lambda(mixup_alpha, batch_size):
+    mixup_lambdas = [np.random.beta(mixup_alpha, mixup_alpha, 1)[0] for _ in range(batch_size)]
+    return np.array(mixup_lambdas).astype(np.float32)
+def create_folder(fd):
+    if not os.path.exists(fd):
+        os.makedirs(fd)
+def dump_config(config, filename, include_time = False):
+    save_time = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
+    config_json = {}
+    for key in dir(config):
+        if not key.startswith("_"):
+            config_json[key] = eval("config." + key)
+    if include_time:
+        filename = filename + "_" + save_time
+    with open(filename + ".json", "w") as f:
+        json.dump(config_json, f ,indent=4)
+def int16_to_float32(x):
+    return (x / 32767.).astype(np.float32)
+def float32_to_int16(x):
+    x = np.clip(x, a_min = -1., a_max = 1.)
+    return (x * 32767.).astype(np.int16)
+# index for each class
+def process_idc(index_path, classes_num, filename):
+    # load data
+    logging.info("Load Data...............")
+    idc = [[] for _ in range(classes_num)]
+    with h5py.File(index_path, "r") as f:
+        for i in tqdm(range(len(f["target"]))):
+            t_class = np.where(f["target"][i])[0]
+            for t in t_class:
+                idc[t].append(i)
+    print(idc)
+    np.save(filename, idc)
+    logging.info("Load Data Succeed...............")
+def clip_bce(pred, target):
+    """Binary crossentropy loss.
+    """
+    return F.cross_entropy(pred, target)
+    # return F.binary_cross_entropy(pred, target)
+def clip_ce(pred, target):
+    return F.cross_entropy(pred, target)
+def d_prime(auc):
+    d_prime = stats.norm().ppf(auc) * np.sqrt(2.0)
+    return d_prime
+def get_loss_func(loss_type):
+    if loss_type == 'clip_bce':
+        return clip_bce
+    if loss_type == 'clip_ce':
+        return clip_ce
+    if loss_type == 'asl_loss':
+        loss_func = AsymmetricLoss(gamma_neg=4, gamma_pos=0,clip=0.05)
+        return loss_func
+def do_mixup_label(x):
+    out = torch.logical_or(x, torch.flip(x, dims = [0])).float()
+    return out
+def do_mixup(x, mixup_lambda):
+    """
+    Args:
+      x: (batch_size , ...)
+      mixup_lambda: (batch_size,)
+    Returns:
+      out: (batch_size, ...)
+    """
+    out = (x.transpose(0,-1) * mixup_lambda + torch.flip(x, dims = [0]).transpose(0,-1) * (1 - mixup_lambda)).transpose(0,-1)
+    return out
+def interpolate(x, ratio):
+    """Interpolate data in time domain. This is used to compensate the
+    resolution reduction in downsampling of a CNN.
+    Args:
+      x: (batch_size, time_steps, classes_num)
+      ratio: int, ratio to interpolate
+    Returns:
+      upsampled: (batch_size, time_steps * ratio, classes_num)
+    """
+    (batch_size, time_steps, classes_num) = x.shape
+    upsampled = x[:, :, None, :].repeat(1, 1, ratio, 1)
+    upsampled = upsampled.reshape(batch_size, time_steps * ratio, classes_num)
+    return upsampled
+def pad_framewise_output(framewise_output, frames_num):
+    """Pad framewise_output to the same length as input frames. The pad value
+    is the same as the value of the last frame.
+    Args:
+      framewise_output: (batch_size, frames_num, classes_num)
+      frames_num: int, number of frames to pad
+    Outputs:
+      output: (batch_size, frames_num, classes_num)
+    """
+    pad = framewise_output[:, -1 :, :].repeat(1, frames_num - framewise_output.shape[1], 1)
+    """tensor for padding"""
+    output = torch.cat((framewise_output, pad), dim=1)
+    """(batch_size, frames_num, classes_num)"""
+    return output
+# set the audio into the format that can be fed into the model
+# resample -> convert to mono -> output the audio
+# track [n_sample, n_channel]
+def prepprocess_audio(track, ofs, rfs, mono_type = "mix"):
+    if track.shape[-1] > 1:
+        # stereo
+        if mono_type == "mix":
+            track = np.transpose(track, (1,0))
+            track = librosa.to_mono(track)
+        elif mono_type == "left":
+            track = track[:, 0]
+        elif mono_type == "right":
+            track = track[:, 1]
+    else:
+        track = track[:, 0]
+    # track [n_sample]
+    if ofs != rfs:
+        track = librosa.resample(track, ofs, rfs)
+    return track
+def init_hier_head(class_map, num_class):
+    class_map = np.load(class_map, allow_pickle = True)
+    head_weight = torch.zeros(num_class,num_class).float()
+    head_bias = torch.zeros(num_class).float()
+    for i in range(len(class_map)):
+        for d in class_map[i][1]:
+            head_weight[d][i] = 1.0
+        for d in class_map[i][2]:
+            head_weight[d][i] = 1.0 / len(class_map[i][2])
+        head_weight[i][i] = 1.0
+    return head_weight, head_bias