audio-styling

Build error

File size: 7,769 Bytes

51da11a

# Adapted from:
# https://github.com/csteinmetz1/micro-tcn/blob/main/microtcn/utils.py
import os
import csv
import torch
import fnmatch
import numpy as np
import random
from enum import Enum
import pyloudnorm as pyln


class DSPMode(Enum):
    """String-valued enumeration with three modes.

    Each member's value is the lowercase string form of the mode. What each
    mode means is decided by the code that consumes this enum.
    """

    NONE = "none"
    TRAIN_INFER = "train_infer"
    INFER = "infer"

    def __str__(self):
        """Return the member's raw string value (e.g. ``"infer"``)."""
        raw_value = self.value
        return raw_value


def loudness_normalize(x, sample_rate, target_loudness=-24.0):
    """Normalize a signal to a target integrated loudness with pyloudnorm.

    The input is flattened to a single row, duplicated into two identical
    channels for measurement and normalization, and the first channel of the
    normalized result is returned.

    Args:
        x (torch.Tensor): Audio tensor; every element is treated as one
            sample of a single channel. It is converted with ``.numpy()``,
            so presumably it must live on the CPU and not require grad.
        sample_rate: Sample rate handed to ``pyln.Meter``.
        target_loudness (float): Target loudness handed to
            ``pyln.normalize.loudness``.

    Returns:
        torch.Tensor: Normalized audio with shape ``(1, num_samples)``.
    """
    mono = x.view(1, -1)
    # Duplicate the channel and lay the data out as (samples, channels).
    two_channel = mono.repeat(2, 1).permute(1, 0).numpy()

    measured = pyln.Meter(sample_rate).integrated_loudness(two_channel)
    normalized = pyln.normalize.loudness(two_channel, measured, target_loudness)

    # Back to (channels, samples); both channels are identical, keep the first.
    channels_first = torch.tensor(normalized).permute(1, 0)
    return channels_first[0, :].view(1, -1)


def get_random_file_id(keys):
    """Pick one entry of ``keys`` uniformly at random.

    Args:
        keys: Sized iterable of file ids (e.g. a list or ``dict.keys()``).
            Must contain at least one element; an empty collection makes
            ``torch.randint`` fail.

    Returns:
        The randomly selected element of ``keys``.
    """
    file_ids = list(keys)
    # torch.randint's upper bound is exclusive, so passing len() itself keeps
    # the last key selectable and makes a single-key collection valid.
    rand_input_idx = int(torch.randint(0, len(file_ids), [1])[0])

    return file_ids[rand_input_idx]


def get_random_patch(audio_file, length, check_silence=True):
    """Draw start/stop sample indices for a random patch of ``length`` samples.

    NOTE(review): a second ``get_random_patch`` is defined further down in
    this module; being defined later, it replaces this one at import time.

    Args:
        audio_file: Object exposing ``num_frames`` and a tensor ``audio``
            that is sliced as ``audio[:, start:stop]``.
        length (int): Number of samples in the patch.
        check_silence (bool): When True, keep drawing until the mean squared
            value of the patch exceeds 1e-4. This never terminates if no
            such patch exists.

    Returns:
        tuple: ``(start_idx, stop_idx)`` with ``stop_idx - start_idx == length``.
    """
    while True:
        begin = int(torch.rand(1) * (audio_file.num_frames - length))
        end = begin + length
        candidate = audio_file.audio[:, begin:end].clone().detach()
        energy = (candidate ** 2).mean()
        if not check_silence or energy > 1e-4:
            return begin, end


def seed_worker(worker_id):
    """Seed NumPy's and Python's global RNGs from torch's initial seed.

    Presumably meant to be passed as a DataLoader ``worker_init_fn``.

    Args:
        worker_id: Unused; accepted so the function matches the expected
            callback signature.
    """
    # Reduce to 32 bits: np.random.seed only accepts values below 2**32.
    seed = torch.initial_seed() % (1 << 32)
    random.seed(seed)
    np.random.seed(seed)


def getFilesPath(directory, extension):
    """Recursively collect files under ``directory`` whose name matches a pattern.

    Args:
        directory (str): Root directory walked with ``os.walk``.
        extension (str): ``fnmatch``-style pattern tested against each file
            name (not the full path), e.g. ``"*.wav"``.

    Returns:
        list: Sorted list of matching file paths (directory joined with name).
    """
    return sorted(
        os.path.join(root, filename)
        for root, _subdirs, filenames in os.walk(directory)
        for filename in filenames
        if fnmatch.fnmatch(filename, extension)
    )


def count_parameters(model, trainable_only=True):
    """Count the scalar parameters of ``model``.

    Args:
        model: Object exposing ``parameters()`` that yields tensors
            (e.g. a ``torch.nn.Module``).
        trainable_only (bool): When True, only parameters with
            ``requires_grad`` set are counted.

    Returns:
        int: Total number of elements; 0 when the model has no parameters.
    """
    total = 0
    for param in model.parameters():
        if trainable_only and not param.requires_grad:
            continue
        total += param.numel()

    return total


def system_summary(system):
    """Print the trainable parameter counts of a system, in millions.

    Args:
        system: Object with ``encoder`` and ``processor`` attributes. If it
            also has ``adv_loss_fn``, each entry of
            ``adv_loss_fn.discriminators`` is reported as well.
    """
    print(f"Encoder: {count_parameters(system.encoder)/1e6:0.2f} M")
    print(f"Processor: {count_parameters(system.processor)/1e6:0.2f} M")

    # Nothing more to report for systems without an adversarial loss.
    if not hasattr(system, "adv_loss_fn"):
        return

    for idx, disc in enumerate(system.adv_loss_fn.discriminators):
        print(f"Discriminator {idx+1}: {count_parameters(disc)/1e6:0.2f} M")


def center_crop(x, length: int):
    """Crop the last dimension of ``x`` to ``length`` samples around its center.

    Args:
        x: Tensor (or other array supporting ``...`` slicing) to crop.
        length (int): Number of samples to keep on the last dimension;
            expected to be no larger than ``x.shape[-1]``.

    Returns:
        ``x`` itself when it already has ``length`` samples, otherwise the
        centered slice. For an odd surplus the extra sample is dropped from
        the end.
    """
    total = x.shape[-1]
    if total == length:
        return x

    offset = (total - length) // 2
    return x[..., offset : offset + length]


def causal_crop(x, length: int):
    """Crop the last dimension of ``x`` to its final ``length`` samples.

    Args:
        x: Tensor (or other array supporting ``...`` slicing) to crop.
        length (int): Number of samples to keep on the last dimension;
            expected to be no larger than ``x.shape[-1]``.

    Returns:
        ``x`` itself when it already has ``length`` samples, otherwise the
        trailing ``length`` samples, including the very last one.
    """
    num_samples = x.shape[-1]
    if num_samples != length:
        # Slice through the end of the signal so the most recent sample is
        # kept; stopping at index -1 would silently drop it and shift the
        # whole window back by one sample.
        x = x[..., num_samples - length :]
    return x


def denormalize(norm_val, max_val, min_val):
    """Map a normalized value back onto the range ``[min_val, max_val]``.

    Note the argument order: ``max_val`` comes before ``min_val``.

    Args:
        norm_val: Normalized value (0 maps to ``min_val``, 1 to ``max_val``).
        max_val: Upper end of the target range.
        min_val: Lower end of the target range.

    Returns:
        The value rescaled into the target range.
    """
    value_range = max_val - min_val
    return (norm_val * value_range) + min_val


def normalize(denorm_val, max_val, min_val):
    """Rescale a value from the range ``[min_val, max_val]`` onto ``[0, 1]``.

    Note the argument order: ``max_val`` comes before ``min_val``. The two
    bounds must differ, since the result is divided by their difference.

    Args:
        denorm_val: Value expressed in the original range.
        max_val: Upper end of the original range (maps to 1).
        min_val: Lower end of the original range (maps to 0).

    Returns:
        The normalized value.
    """
    value_range = max_val - min_val
    offset = denorm_val - min_val
    return offset / value_range


def get_random_patch(audio_file, length, energy_treshold=1e-4, check_silence=True):
    """Produce sample indices for a random patch of size `length`.

    By default this function checks the energy of the selected patch to
    ensure that it is not complete silence. If silence is found, it keeps
    searching for a non-silent patch; note that this never terminates when
    the file contains no patch whose energy exceeds the threshold.

    This definition replaces the earlier ``get_random_patch`` in this module,
    so it also accepts that version's ``check_silence`` flag to stay
    call-compatible with it.

    Args:
        audio_file (AudioFile): Audio file object exposing ``num_frames`` and
            a tensor ``audio`` that is sliced as ``audio[:, start:stop]``.
        length (int): Number of samples in random patch.
        energy_treshold (float): A patch counts as non-silent when its mean
            squared value is strictly greater than this.
        check_silence (bool): When False, the first randomly drawn patch is
            returned without inspecting its energy.

    Returns:
        start_idx (int): Starting sample index
        stop_idx (int): Stop sample index
    """
    max_start = audio_file.num_frames - length

    while True:
        start_idx = int(torch.rand(1) * max_start)
        stop_idx = start_idx + length
        if not check_silence:
            # Caller opted out of the energy test: accept the first draw.
            break
        patch = audio_file.audio[:, start_idx:stop_idx]
        if (patch ** 2).mean() > energy_treshold:
            break

    return start_idx, stop_idx


def split_dataset(file_list, subset, train_frac):
    """Given a list of files, split into train/val/test sets.

    The list is split in its given order (no shuffling): the first
    ``int(len(file_list) * train_frac)`` entries form the training set, the
    next ``int(len(file_list) * (1 - train_frac) / 2)`` entries the
    validation set, and everything that remains the test set.

    Args:
        file_list (list): List of audio files.
        subset (str): One of "train", "val", or "test".
        train_frac (float): Fraction of the dataset to use for training.
            Must satisfy ``0.1 < train_frac < 1.0``.

    Returns:
        file_list (list): List of audio files corresponding to subset.

    Raises:
        AssertionError: If ``train_frac`` is outside the accepted range.
        ValueError: If a subset size is negative or ``subset`` is not one of
            the three accepted names.
    """
    assert train_frac > 0.1 and train_frac < 1.0

    total_num_examples = len(file_list)

    train_num_examples = int(total_num_examples * train_frac)
    val_num_examples = int(total_num_examples * (1 - train_frac) / 2)
    test_num_examples = total_num_examples - (train_num_examples + val_num_examples)

    # NOTE(review): with train_frac inside the asserted range these counts are
    # never negative, so the checks below do not catch *empty* subsets (a
    # count of 0). Left as-is so small datasets keep returning empty lists.
    if train_num_examples < 0:
        raise ValueError(
            f"No examples in training set. Try increasing train_frac: {train_frac}."
        )
    elif val_num_examples < 0:
        raise ValueError(
            f"No examples in validation set. Try decreasing train_frac: {train_frac}."
        )
    elif test_num_examples < 0:
        raise ValueError(
            f"No examples in test set. Try decreasing train_frac: {train_frac}."
        )

    if subset == "train":
        start_idx = 0
        stop_idx = train_num_examples
    elif subset == "val":
        start_idx = train_num_examples
        stop_idx = start_idx + val_num_examples
    elif subset == "test":
        # The test set is simply everything after train and val.
        start_idx = train_num_examples + val_num_examples
        stop_idx = total_num_examples
    else:
        raise ValueError(f"Invalid subset: {subset}.")

    return file_list[start_idx:stop_idx]


def rademacher(size):
    """Generates random samples from a Rademacher distribution +-1

    Args:
        size (int or sequence of int): Number of samples, or the full shape
            of the tensor to draw (e.g. ``5`` or ``(2, 3)``).

    Returns:
        torch.Tensor: Float tensor of the requested shape whose entries are
        -1 or +1 with equal probability.
    """
    # Distribution.sample expects a shape, not a bare int, so wrap an integer
    # count into a one-element shape.
    if isinstance(size, int):
        size = (size,)

    m = torch.distributions.binomial.Binomial(1, 0.5)
    x = m.sample(torch.Size(size))
    # Samples are 0 or 1; remap the zeros to -1.
    x[x == 0] = -1
    return x


def get_subset(csv_file):
    """Read the unique values of the ``filepath`` column of a CSV file.

    Args:
        csv_file (str): Path to a CSV file with a header row containing a
            ``filepath`` column.

    Returns:
        list: Distinct file paths in order of first appearance, so the
        result is identical from run to run.

    Raises:
        KeyError: If a row has no ``filepath`` column.
    """
    # newline="" is what the csv module asks for when it is given a file object.
    with open(csv_file, newline="") as fp:
        filepaths = [row["filepath"] for row in csv.DictReader(fp)]

    # dict.fromkeys drops duplicates while keeping insertion order; a plain
    # set() would return the paths in an order that can vary between runs.
    return list(dict.fromkeys(filepaths))


def conform_length(x: torch.Tensor, length: int):
    """Crop or zero-pad the last dim of ``x`` so it has exactly ``length`` samples.

    Args:
        x (torch.Tensor): Input tensor.
        length (int): Desired size of the last dimension.

    Returns:
        torch.Tensor: ``x`` itself when it already matches; otherwise the
        leading ``length`` samples, or ``x`` padded at the end.
    """
    current = x.shape[-1]

    if current > length:
        return x[..., :length]
    if current < length:
        # Pad only on the right-hand side of the last dimension.
        return torch.nn.functional.pad(x, (0, length - current))
    return x


def linear_fade(
    x: torch.Tensor,
    fade_ms: float = 50.0,
    sample_rate: float = 22050,
):
    """Apply fade in and fade out to last dim.

    The tensor is modified in place and also returned.

    Args:
        x (torch.Tensor): Audio tensor; the fades run along its last
            dimension, which should hold at least ``fade_ms`` worth of
            samples.
        fade_ms (float): Duration of each fade in milliseconds.
        sample_rate (float): Sample rate used to convert ``fade_ms`` into a
            number of samples.

    Returns:
        torch.Tensor: ``x`` with a linear ramp 0 -> 1 applied to its first
        samples and 1 -> 0 to its last samples.
    """
    fade_samples = int(fade_ms * 1e-3 * sample_rate)

    # With no samples to fade there is nothing to do. This also avoids
    # ``x[..., -0:]``, which would select the whole signal instead of nothing.
    if fade_samples <= 0:
        return x

    # Build the ramps on the same device as the signal.
    fade_in = torch.linspace(0.0, 1.0, steps=fade_samples, device=x.device)
    fade_out = torch.linspace(1.0, 0.0, steps=fade_samples, device=x.device)

    # fade in
    x[..., :fade_samples] *= fade_in

    # fade out
    x[..., -fade_samples:] *= fade_out

    return x


# def get_random_patch(x, sample_rate, length_samples):
#     length = length_samples
#     silent = True
#     while silent:
#         start_idx = np.random.randint(0, x.shape[-1] - length - 1)
#         stop_idx = start_idx + length
#         x_crop = x[0:1, start_idx:stop_idx]

#         # check for silence
#         frames = length // sample_rate
#         silent_frames = []
#         for n in range(frames):
#             start_idx = n * sample_rate
#             stop_idx = start_idx + sample_rate
#             x_frame = x_crop[0:1, start_idx:stop_idx]
#             if (x_frame ** 2).mean() > 3e-4:
#                 silent_frames.append(False)
#             else:
#                 silent_frames.append(True)
#         silent = True if any(silent_frames) else False

#     x_crop /= x_crop.abs().max()

#     return x_crop