Spaces:

fffiloni
/

MusiConGen

Sleeping

App Files Files Community

fffiloni committed on Jul 24

Commit

f48c226

•

1 Parent(s): 2319c67

Delete audiocraft

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

audiocraft/audiocraft/__init__.py +0 -26
audiocraft/audiocraft/__pycache__/__init__.cpython-311.pyc +0 -0
audiocraft/audiocraft/__pycache__/environment.cpython-311.pyc +0 -0
audiocraft/audiocraft/__pycache__/train.cpython-311.pyc +0 -0
audiocraft/audiocraft/adversarial/__init__.py +0 -22
audiocraft/audiocraft/adversarial/__pycache__/__init__.cpython-311.pyc +0 -0
audiocraft/audiocraft/adversarial/__pycache__/losses.cpython-311.pyc +0 -0
audiocraft/audiocraft/adversarial/discriminators/__init__.py +0 -10
audiocraft/audiocraft/adversarial/discriminators/__pycache__/__init__.cpython-311.pyc +0 -0
audiocraft/audiocraft/adversarial/discriminators/__pycache__/base.cpython-311.pyc +0 -0
audiocraft/audiocraft/adversarial/discriminators/__pycache__/mpd.cpython-311.pyc +0 -0
audiocraft/audiocraft/adversarial/discriminators/__pycache__/msd.cpython-311.pyc +0 -0
audiocraft/audiocraft/adversarial/discriminators/__pycache__/msstftd.cpython-311.pyc +0 -0
audiocraft/audiocraft/adversarial/discriminators/base.py +0 -34
audiocraft/audiocraft/adversarial/discriminators/mpd.py +0 -106
audiocraft/audiocraft/adversarial/discriminators/msd.py +0 -126
audiocraft/audiocraft/adversarial/discriminators/msstftd.py +0 -134
audiocraft/audiocraft/adversarial/losses.py +0 -228
audiocraft/audiocraft/data/__init__.py +0 -10
audiocraft/audiocraft/data/__pycache__/__init__.cpython-311.pyc +0 -0
audiocraft/audiocraft/data/__pycache__/audio.cpython-311.pyc +0 -0
audiocraft/audiocraft/data/__pycache__/audio_dataset.cpython-311.pyc +0 -0
audiocraft/audiocraft/data/__pycache__/audio_utils.cpython-311.pyc +0 -0
audiocraft/audiocraft/data/__pycache__/btc_chords.cpython-311.pyc +0 -0
audiocraft/audiocraft/data/__pycache__/chords.cpython-311.pyc +0 -0
audiocraft/audiocraft/data/__pycache__/info_audio_dataset.cpython-311.pyc +0 -0
audiocraft/audiocraft/data/__pycache__/music_dataset.cpython-311.pyc +0 -0
audiocraft/audiocraft/data/__pycache__/sound_dataset.cpython-311.pyc +0 -0
audiocraft/audiocraft/data/__pycache__/zip.cpython-311.pyc +0 -0
audiocraft/audiocraft/data/audio.py +0 -257
audiocraft/audiocraft/data/audio_dataset.py +0 -614
audiocraft/audiocraft/data/audio_utils.py +0 -385
audiocraft/audiocraft/data/btc_chords.py +0 -524
audiocraft/audiocraft/data/chords.py +0 -524
audiocraft/audiocraft/data/info_audio_dataset.py +0 -110
audiocraft/audiocraft/data/music_dataset.py +0 -349
audiocraft/audiocraft/data/sound_dataset.py +0 -330
audiocraft/audiocraft/data/zip.py +0 -76
audiocraft/audiocraft/environment.py +0 -176
audiocraft/audiocraft/grids/__init__.py +0 -6
audiocraft/audiocraft/grids/_base_explorers.py +0 -80
audiocraft/audiocraft/grids/audiogen/__init__.py +0 -6
audiocraft/audiocraft/grids/audiogen/audiogen_base_16khz.py +0 -23
audiocraft/audiocraft/grids/audiogen/audiogen_pretrained_16khz_eval.py +0 -68
audiocraft/audiocraft/grids/compression/__init__.py +0 -6
audiocraft/audiocraft/grids/compression/_explorers.py +0 -55
audiocraft/audiocraft/grids/compression/debug.py +0 -31
audiocraft/audiocraft/grids/compression/encodec_audiogen_16khz.py +0 -29
audiocraft/audiocraft/grids/compression/encodec_base_24khz.py +0 -28
audiocraft/audiocraft/grids/compression/encodec_musicgen_32khz.py +0 -34

audiocraft/audiocraft/__init__.py DELETED Viewed

@@ -1,26 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-AudioCraft is a general framework for training audio generative models.
-At the moment we provide the training code for:
-- [MusicGen](https://arxiv.org/abs/2306.05284), a state-of-the-art
-    text-to-music and melody+text autoregressive generative model.
-    For the solver, see `audiocraft.solvers.musicgen.MusicGenSolver`, and for the model,
-    `audiocraft.models.musicgen.MusicGen`.
-- [AudioGen](https://arxiv.org/abs/2209.15352), a state-of-the-art
-    text-to-general-audio generative model.
-- [EnCodec](https://arxiv.org/abs/2210.13438), efficient and high fidelity
-    neural audio codec which provides an excellent tokenizer for autoregressive language models.
-    See `audiocraft.solvers.compression.CompressionSolver`, and `audiocraft.models.encodec.EncodecModel`.
-- [MultiBandDiffusion](TODO), alternative diffusion-based decoder compatible with EnCodec that
-    improves the perceived quality and reduces the artifacts coming from adversarial decoders.
-"""
-# flake8: noqa
-from . import data, modules, models
-__version__ = '1.0.0'

audiocraft/audiocraft/__pycache__/__init__.cpython-311.pyc DELETED Viewed

Binary file (1.29 kB)

audiocraft/audiocraft/__pycache__/environment.cpython-311.pyc DELETED Viewed

Binary file (10.5 kB)

audiocraft/audiocraft/__pycache__/train.cpython-311.pyc DELETED Viewed

Binary file (9.52 kB)

audiocraft/audiocraft/adversarial/__init__.py DELETED Viewed

@@ -1,22 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""Adversarial losses and discriminator architectures."""
-# flake8: noqa
-from .discriminators import (
-    MultiPeriodDiscriminator,
-    MultiScaleDiscriminator,
-    MultiScaleSTFTDiscriminator
-)
-from .losses import (
-    AdversarialLoss,
-    AdvLossType,
-    get_adv_criterion,
-    get_fake_criterion,
-    get_real_criterion,
-    FeatLossType,
-    FeatureMatchingLoss
-)

audiocraft/audiocraft/adversarial/__pycache__/__init__.cpython-311.pyc DELETED Viewed

Binary file (740 Bytes)

audiocraft/audiocraft/adversarial/__pycache__/losses.cpython-311.pyc DELETED Viewed

Binary file (15.9 kB)

audiocraft/audiocraft/adversarial/discriminators/__init__.py DELETED Viewed

@@ -1,10 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-# flake8: noqa
-from .mpd import MultiPeriodDiscriminator
-from .msd import MultiScaleDiscriminator
-from .msstftd import MultiScaleSTFTDiscriminator

audiocraft/audiocraft/adversarial/discriminators/__pycache__/__init__.cpython-311.pyc DELETED Viewed

Binary file (411 Bytes)

audiocraft/audiocraft/adversarial/discriminators/__pycache__/base.cpython-311.pyc DELETED Viewed

Binary file (1.87 kB)

audiocraft/audiocraft/adversarial/discriminators/__pycache__/mpd.cpython-311.pyc DELETED Viewed

Binary file (7.01 kB)

audiocraft/audiocraft/adversarial/discriminators/__pycache__/msd.cpython-311.pyc DELETED Viewed

Binary file (8.88 kB)

audiocraft/audiocraft/adversarial/discriminators/__pycache__/msstftd.cpython-311.pyc DELETED Viewed

Binary file (9.98 kB)

audiocraft/audiocraft/adversarial/discriminators/base.py DELETED Viewed

@@ -1,34 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-from abc import ABC, abstractmethod
-import typing as tp
-import torch
-import torch.nn as nn
-# List of feature-map tensors collected from a single sub-discriminator.
-FeatureMapType = tp.List[torch.Tensor]
-# Logits tensor produced by a single sub-discriminator.
-LogitsType = torch.Tensor
-# Pair (list of logits, list of feature-map lists); this is the annotated return type of
-# `MultiDiscriminator.forward`.
-MultiDiscriminatorOutputType = tp.Tuple[tp.List[LogitsType], tp.List[FeatureMapType]]
-class MultiDiscriminator(ABC, nn.Module):
-    """Abstract base class for discriminators made of sub-discriminators acting at different scales.
-    Concrete subclasses must provide `forward` (annotated to return a
-    `MultiDiscriminatorOutputType`) and the `num_discriminators` property.
-    """
-    def __init__(self):
-        super().__init__()
-    @property
-    @abstractmethod
-    def num_discriminators(self) -> int:
-        """Return how many sub-discriminators this module holds."""
-        ...
-    @abstractmethod
-    def forward(self, x: torch.Tensor) -> MultiDiscriminatorOutputType:
-        """Evaluate the discriminator on `x`; implemented by subclasses."""
-        ...

audiocraft/audiocraft/adversarial/discriminators/mpd.py DELETED Viewed

@@ -1,106 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-import typing as tp
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from ...modules import NormConv2d
-from .base import MultiDiscriminator, MultiDiscriminatorOutputType
-def get_padding(kernel_size: int, dilation: int = 1) -> int:
-    """Return half of the dilated kernel extent, truncated to an int."""
-    dilated_extent = kernel_size * dilation - dilation
-    return int(dilated_extent / 2)
-class PeriodDiscriminator(nn.Module):
-    """Period sub-discriminator: reshapes the waveform to `[B, C, T // period, period]` and applies 2D convs.
-    Args:
-        period (int): Period between samples of audio.
-        in_channels (int): Number of input channels.
-        out_channels (int): Number of output channels.
-        n_layers (int): Number of convolutional layers.
-        kernel_sizes (list of int): Kernel sizes for convolutions; index 0 is used by the stacked
-            convolutions and index 1 by the final projection.
-        stride (int): Stride for convolutions (the last stacked convolution uses stride 1).
-        filters (int): Initial number of filters in convolutions.
-        filters_scale (int): Multiplier of number of filters as we increase depth.
-        max_filters (int): Maximum number of filters.
-        norm (str): Normalization method.
-        activation (str): Name of the activation class looked up in `torch.nn`.
-        activation_params (dict): Parameters to provide to the activation function.
-    """
-    def __init__(self, period: int, in_channels: int = 1, out_channels: int = 1,
-                 n_layers: int = 5, kernel_sizes: tp.List[int] = [5, 3], stride: int = 3,
-                 filters: int = 8, filters_scale: int = 4, max_filters: int = 1024,
-                 norm: str = 'weight_norm', activation: str = 'LeakyReLU',
-                 activation_params: dict = {'negative_slope': 0.2}):
-        super().__init__()
-        self.period = period
-        self.n_layers = n_layers
-        activation_cls = getattr(torch.nn, activation)
-        self.activation = activation_cls(**activation_params)
-        self.convs = nn.ModuleList()
-        prev_chs = in_channels
-        for depth in range(1, self.n_layers + 1):
-            # Channel count grows geometrically with depth, capped at `max_filters`.
-            next_chs = min(filters * (filters_scale ** depth), max_filters)
-            # Only the last stacked convolution is unstrided.
-            layer_stride = 1 if depth == self.n_layers else stride
-            stacked_conv = NormConv2d(prev_chs, next_chs, kernel_size=(kernel_sizes[0], 1),
-                                      stride=(layer_stride, 1),
-                                      padding=((kernel_sizes[0] - 1) // 2, 0), norm=norm)
-            self.convs.append(stacked_conv)
-            prev_chs = next_chs
-        self.conv_post = NormConv2d(prev_chs, out_channels, kernel_size=(kernel_sizes[1], 1), stride=1,
-                                    padding=((kernel_sizes[1] - 1) // 2, 0), norm=norm)
-    def forward(self, x: torch.Tensor):
-        """Return `(logits, fmap)` where `fmap` holds every activated conv output plus the final logits."""
-        b, c, t = x.shape
-        remainder = t % self.period
-        if remainder != 0:
-            # Reflect-pad on the right so the time axis becomes a multiple of the period.
-            n_pad = self.period - remainder
-            x = F.pad(x, (0, n_pad), 'reflect')
-            t = t + n_pad
-        # 1d to 2d
-        x = x.view(b, c, t // self.period, self.period)
-        fmap = []
-        for conv in self.convs:
-            x = self.activation(conv(x))
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        return x, fmap
-class MultiPeriodDiscriminator(MultiDiscriminator):
-    """Multi-Period (MPD) Discriminator: one `PeriodDiscriminator` per requested period.
-    Args:
-        in_channels (int): Number of input channels.
-        out_channels (int): Number of output channels.
-        periods (Sequence[int]): Periods between samples of audio for the sub-discriminators.
-        **kwargs: Additional args for `PeriodDiscriminator`
-    """
-    def __init__(self, in_channels: int = 1, out_channels: int = 1,
-                 periods: tp.Sequence[int] = [2, 3, 5, 7, 11], **kwargs):
-        super().__init__()
-        sub_discriminators = nn.ModuleList()
-        for period in periods:
-            sub_discriminators.append(PeriodDiscriminator(period, in_channels, out_channels, **kwargs))
-        self.discriminators = sub_discriminators
-    @property
-    def num_discriminators(self):
-        """Number of period sub-discriminators."""
-        return len(self.discriminators)
-    def forward(self, x: torch.Tensor) -> MultiDiscriminatorOutputType:
-        """Apply every sub-discriminator to `x` and gather their logits and feature maps."""
-        outputs = [disc(x) for disc in self.discriminators]
-        logits = [logit for logit, _ in outputs]
-        fmaps = [fmap for _, fmap in outputs]
-        return logits, fmaps

audiocraft/audiocraft/adversarial/discriminators/msd.py DELETED Viewed

@@ -1,126 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-import typing as tp
-import numpy as np
-import torch
-import torch.nn as nn
-from ...modules import NormConv1d
-from .base import MultiDiscriminator, MultiDiscriminatorOutputType
-class ScaleDiscriminator(nn.Module):
-    """Waveform sub-discriminator.
-    Stacks an initial padded convolution, one strided convolution per entry of
-    `downsample_scales`, one more stride-1 convolution and a final projection (`conv_post`).
-    Args:
-        in_channels (int): Number of input channels.
-        out_channels (int): Number of output channels.
-        kernel_sizes (Sequence[int]): Kernel sizes for first and last convolutions.
-        filters (int): Number of initial filters for convolutions.
-        max_filters (int): Maximum number of filters.
-        downsample_scales (Sequence[int]): Scale for downsampling implemented as strided convolutions.
-        inner_kernel_sizes (Sequence[int] or None): Kernel sizes for inner convolutions.
-        groups (Sequence[int] or None): Groups for inner convolutions.
-        strides (Sequence[int] or None): Strides for inner convolutions.
-        paddings (Sequence[int] or None): Paddings for inner convolutions.
-        norm (str): Normalization method.
-        activation (str): Activation function.
-        activation_params (dict): Parameters to provide to the activation function.
-        pad (str): Padding for initial convolution.
-        pad_params (dict): Parameters to provide to the padding module.
-    """
-    def __init__(self, in_channels=1, out_channels=1, kernel_sizes: tp.Sequence[int] = [5, 3],
-                 filters: int = 16, max_filters: int = 1024, downsample_scales: tp.Sequence[int] = [4, 4, 4, 4],
-                 inner_kernel_sizes: tp.Optional[tp.Sequence[int]] = None, groups: tp.Optional[tp.Sequence[int]] = None,
-                 strides: tp.Optional[tp.Sequence[int]] = None, paddings: tp.Optional[tp.Sequence[int]] = None,
-                 norm: str = 'weight_norm', activation: str = 'LeakyReLU',
-                 activation_params: dict = {'negative_slope': 0.2}, pad: str = 'ReflectionPad1d',
-                 pad_params: dict = {}):
-        super().__init__()
-        # Exactly two kernel sizes are expected and both must be odd.
-        assert len(kernel_sizes) == 2
-        assert kernel_sizes[0] % 2 == 1
-        assert kernel_sizes[1] % 2 == 1
-        # Optional per-layer overrides, when given, need one entry per downsampling layer.
-        assert (inner_kernel_sizes is None or len(inner_kernel_sizes) == len(downsample_scales))
-        assert (groups is None or len(groups) == len(downsample_scales))
-        assert (strides is None or len(strides) == len(downsample_scales))
-        assert (paddings is None or len(paddings) == len(downsample_scales))
-        # The activation class is looked up by name in `torch.nn`.
-        self.activation = getattr(torch.nn, activation)(**activation_params)
-        self.convs = nn.ModuleList()
-        # Initial layer: a padding module followed by a stride-1 convolution whose kernel size
-        # is the product of the two entries of `kernel_sizes`.
-        self.convs.append(
-            nn.Sequential(
-                getattr(torch.nn, pad)((np.prod(kernel_sizes) - 1) // 2, **pad_params),
-                NormConv1d(in_channels, filters, kernel_size=np.prod(kernel_sizes), stride=1, norm=norm)
-            )
-        )
-        in_chs = filters
-        for i, downsample_scale in enumerate(downsample_scales):
-            out_chs = min(in_chs * downsample_scale, max_filters)
-            # Defaults below apply whenever the matching override sequence is not provided.
-            default_kernel_size = downsample_scale * 10 + 1
-            default_stride = downsample_scale
-            default_padding = (default_kernel_size - 1) // 2
-            default_groups = in_chs // 4
-            self.convs.append(
-                NormConv1d(in_chs, out_chs,
-                           kernel_size=inner_kernel_sizes[i] if inner_kernel_sizes else default_kernel_size,
-                           stride=strides[i] if strides else default_stride,
-                           groups=groups[i] if groups else default_groups,
-                           padding=paddings[i] if paddings else default_padding,
-                           norm=norm))
-            in_chs = out_chs
-        # One more stride-1 convolution (at most doubling the channels) before the output projection.
-        out_chs = min(in_chs * 2, max_filters)
-        self.convs.append(NormConv1d(in_chs, out_chs, kernel_size=kernel_sizes[0], stride=1,
-                                     padding=(kernel_sizes[0] - 1) // 2, norm=norm))
-        self.conv_post = NormConv1d(out_chs, out_channels, kernel_size=kernel_sizes[1], stride=1,
-                                    padding=(kernel_sizes[1] - 1) // 2, norm=norm)
-    def forward(self, x: torch.Tensor):
-        """Return `(logits, fmap)`.
-        `fmap` holds the activated output of every layer in `self.convs`, followed by the
-        (non-activated) `conv_post` output, which is also returned as the logits.
-        """
-        fmap = []
-        for layer in self.convs:
-            x = layer(x)
-            x = self.activation(x)
-            fmap.append(x)
-        x = self.conv_post(x)
-        fmap.append(x)
-        # x = torch.flatten(x, 1, -1)
-        return x, fmap
-class MultiScaleDiscriminator(MultiDiscriminator):
-    """Multi-Scale (MSD) Discriminator.
-    Holds one `ScaleDiscriminator` per entry of `scale_norms`. The first one sees the input as
-    is; each following one sees the signal average-pooled once more than its predecessor.
-    Args:
-        in_channels (int): Number of input channels.
-        out_channels (int): Number of output channels.
-        downsample_factor (int): Downsampling factor between the different scales.
-        scale_norms (Sequence[str]): Normalization for each sub-discriminator.
-        **kwargs: Additional args for ScaleDiscriminator.
-    """
-    def __init__(self, in_channels: int = 1, out_channels: int = 1, downsample_factor: int = 2,
-                 scale_norms: tp.Sequence[str] = ['weight_norm', 'weight_norm', 'weight_norm'], **kwargs):
-        super().__init__()
-        self.discriminators = nn.ModuleList([
-            ScaleDiscriminator(in_channels, out_channels, norm=norm, **kwargs) for norm in scale_norms
-        ])
-        self.downsample = nn.AvgPool1d(downsample_factor * 2, downsample_factor, padding=downsample_factor)
-    @property
-    def num_discriminators(self):
-        """Number of scale sub-discriminators."""
-        return len(self.discriminators)
-    def forward(self, x: torch.Tensor) -> MultiDiscriminatorOutputType:
-        """Run every sub-discriminator on a progressively downsampled copy of `x`.
-        Returns:
-            The list of logits and the list of feature-map lists, one entry per sub-discriminator.
-        """
-        logits = []
-        fmaps = []
-        for i, disc in enumerate(self.discriminators):
-            if i != 0:
-                # AvgPool1d returns a new tensor; the result must be re-bound, otherwise every
-                # sub-discriminator would receive the same full-resolution input.
-                x = self.downsample(x)
-            logit, fmap = disc(x)
-            logits.append(logit)
-            fmaps.append(fmap)
-        return logits, fmaps

audiocraft/audiocraft/adversarial/discriminators/msstftd.py DELETED Viewed

@@ -1,134 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-import typing as tp
-import torchaudio
-import torch
-from torch import nn
-from einops import rearrange
-from ...modules import NormConv2d
-from .base import MultiDiscriminator, MultiDiscriminatorOutputType
-def get_2d_padding(kernel_size: tp.Tuple[int, int], dilation: tp.Tuple[int, int] = (1, 1)):
-    """Return the per-axis padding `((k - 1) * d) // 2` for a 2D kernel size and dilation."""
-    pad_first = ((kernel_size[0] - 1) * dilation[0]) // 2
-    pad_second = ((kernel_size[1] - 1) * dilation[1]) // 2
-    return (pad_first, pad_second)
-class DiscriminatorSTFT(nn.Module):
-    """STFT sub-discriminator.
-    Computes a complex spectrogram of the input, stacks its real and imaginary parts along the
-    channel axis and applies a stack of 2D convolutions.
-    Args:
-        filters (int): Number of filters in convolutions.
-        in_channels (int): Number of input channels.
-        out_channels (int): Number of output channels.
-        n_fft (int): Size of FFT for each scale.
-        hop_length (int): Length of hop between STFT windows for each scale.
-        win_length (int): Window size for each scale.
-        max_filters (int): Upper bound on the number of filters of any convolution.
-        filters_scale (int): Growth factor for the number of filters across layers.
-        kernel_size (tuple of int): Inner Conv2d kernel sizes.
-        dilations (list of int): Inner Conv2d dilation on the time dimension.
-        stride (tuple of int): Inner Conv2d strides.
-        normalized (bool): Whether to normalize by magnitude after stft.
-        norm (str): Normalization method.
-        activation (str): Activation function.
-        activation_params (dict): Parameters to provide to the activation function.
-    """
-    def __init__(self, filters: int, in_channels: int = 1, out_channels: int = 1,
-                 n_fft: int = 1024, hop_length: int = 256, win_length: int = 1024, max_filters: int = 1024,
-                 filters_scale: int = 1, kernel_size: tp.Tuple[int, int] = (3, 9), dilations: tp.List = [1, 2, 4],
-                 stride: tp.Tuple[int, int] = (1, 2), normalized: bool = True, norm: str = 'weight_norm',
-                 activation: str = 'LeakyReLU', activation_params: dict = {'negative_slope': 0.2}):
-        super().__init__()
-        assert len(kernel_size) == 2
-        assert len(stride) == 2
-        self.filters = filters
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.n_fft = n_fft
-        self.hop_length = hop_length
-        self.win_length = win_length
-        self.normalized = normalized
-        self.activation = getattr(torch.nn, activation)(**activation_params)
-        # power=None is requested so that `forward` can read `.real` / `.imag` from the result.
-        self.spec_transform = torchaudio.transforms.Spectrogram(
-            n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length, window_fn=torch.hann_window,
-            normalized=self.normalized, center=False, pad_mode=None, power=None)
-        # Real and imaginary parts are concatenated on the channel axis, hence twice the channels.
-        spec_channels = 2 * self.in_channels
-        self.convs = nn.ModuleList()
-        # NOTE(review): `norm` is not forwarded to this first convolution, unlike the ones
-        # below - confirm this is intended.
-        self.convs.append(
-            NormConv2d(spec_channels, self.filters, kernel_size=kernel_size, padding=get_2d_padding(kernel_size))
-        )
-        in_chs = min(filters_scale * self.filters, max_filters)
-        for i, dilation in enumerate(dilations):
-            out_chs = min((filters_scale ** (i + 1)) * self.filters, max_filters)
-            # Dilation is applied on the first kernel axis only; padding is derived accordingly.
-            self.convs.append(NormConv2d(in_chs, out_chs, kernel_size=kernel_size, stride=stride,
-                                         dilation=(dilation, 1), padding=get_2d_padding(kernel_size, (dilation, 1)),
-                                         norm=norm))
-            in_chs = out_chs
-        out_chs = min((filters_scale ** (len(dilations) + 1)) * self.filters, max_filters)
-        # The last two convolutions use a square kernel built from `kernel_size[0]`.
-        self.convs.append(NormConv2d(in_chs, out_chs, kernel_size=(kernel_size[0], kernel_size[0]),
-                                     padding=get_2d_padding((kernel_size[0], kernel_size[0])),
-                                     norm=norm))
-        self.conv_post = NormConv2d(out_chs, self.out_channels,
-                                    kernel_size=(kernel_size[0], kernel_size[0]),
-                                    padding=get_2d_padding((kernel_size[0], kernel_size[0])),
-                                    norm=norm)
-    def forward(self, x: torch.Tensor):
-        """Return `(logits, fmap)`.
-        `fmap` contains the activated output of every layer in `self.convs`; the `conv_post`
-        output is returned as the logits and is not appended to `fmap`.
-        """
-        fmap = []
-        z = self.spec_transform(x)  # complex spectrogram; presumably [B, C, Freq, Frames] - TODO confirm
-        z = torch.cat([z.real, z.imag], dim=1)
-        # Swap the last two axes before the convolutions.
-        z = rearrange(z, 'b c w t -> b c t w')
-        for i, layer in enumerate(self.convs):
-            z = layer(z)
-            z = self.activation(z)
-            fmap.append(z)
-        z = self.conv_post(z)
-        return z, fmap
-class MultiScaleSTFTDiscriminator(MultiDiscriminator):
-    """Multi-Scale STFT (MS-STFT) discriminator: one `DiscriminatorSTFT` per STFT configuration.
-    NOTE(review): `sep_channels` is stored on the instance, but `forward` never calls
-    `_separate_channels`, so the flag has no effect in this code - confirm whether that is intended.
-    Args:
-        filters (int): Number of filters in convolutions.
-        in_channels (int): Number of input channels.
-        out_channels (int): Number of output channels.
-        sep_channels (bool): Separate channels to distinct samples for stereo support.
-        n_ffts (Sequence[int]): Size of FFT for each scale.
-        hop_lengths (Sequence[int]): Length of hop between STFT windows for each scale.
-        win_lengths (Sequence[int]): Window size for each scale.
-        **kwargs: Additional args for `DiscriminatorSTFT`.
-    """
-    def __init__(self, filters: int, in_channels: int = 1, out_channels: int = 1, sep_channels: bool = False,
-                 n_ffts: tp.List[int] = [1024, 2048, 512], hop_lengths: tp.List[int] = [256, 512, 128],
-                 win_lengths: tp.List[int] = [1024, 2048, 512], **kwargs):
-        super().__init__()
-        # All three per-scale lists must describe the same number of scales.
-        assert len(n_ffts) == len(hop_lengths) == len(win_lengths)
-        self.sep_channels = sep_channels
-        self.discriminators = nn.ModuleList([
-            DiscriminatorSTFT(filters, in_channels=in_channels, out_channels=out_channels,
-                              n_fft=n_fft, win_length=win_lengths[scale], hop_length=hop_lengths[scale], **kwargs)
-            for scale, n_fft in enumerate(n_ffts)
-        ])
-    @property
-    def num_discriminators(self):
-        """Number of STFT sub-discriminators."""
-        return len(self.discriminators)
-    def _separate_channels(self, x: torch.Tensor) -> torch.Tensor:
-        """View a `[B, C, T]` tensor as `[B * C, 1, T]`."""
-        _, _, n_samples = x.shape
-        return x.view(-1, 1, n_samples)
-    def forward(self, x: torch.Tensor) -> MultiDiscriminatorOutputType:
-        """Apply every sub-discriminator to `x` and gather their logits and feature maps."""
-        outputs = [disc(x) for disc in self.discriminators]
-        logits = [logit for logit, _ in outputs]
-        fmaps = [fmap for _, fmap in outputs]
-        return logits, fmaps

audiocraft/audiocraft/adversarial/losses.py DELETED Viewed

@@ -1,228 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Utility module to handle adversarial losses without requiring to mess up the main training loop.
-"""
-import typing as tp
-import flashy
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-# Loss names accepted by `get_adv_criterion`, `get_fake_criterion` and `get_real_criterion`.
-ADVERSARIAL_LOSSES = ['mse', 'hinge', 'hinge2']
-# A loss computed from a single logits tensor.
-AdvLossType = tp.Union[nn.Module, tp.Callable[[torch.Tensor], torch.Tensor]]
-# A loss computed from two arguments (used for feature matching between fake and real).
-FeatLossType = tp.Union[nn.Module, tp.Callable[[torch.Tensor, torch.Tensor], torch.Tensor]]
-class AdversarialLoss(nn.Module):
-    """Adversary training wrapper.
-    Bundles an adversary with its optimizer so that the adversary update (`train_adv`) and the
-    generator-side losses (`forward`) can be driven from the main training loop.
-    Args:
-        adversary (nn.Module): The adversary module will be used to estimate the logits given the fake and real samples.
-            We assume here the adversary output is ``Tuple[List[torch.Tensor], List[List[torch.Tensor]]]``
-            where the first item is a list of logits and the second item is a list of feature maps.
-        optimizer (torch.optim.Optimizer): Optimizer used for training the given module.
-        loss (AdvLossType): Loss function for generator training.
-        loss_real (AdvLossType): Loss function for adversarial training on logits from real samples.
-        loss_fake (AdvLossType): Loss function for adversarial training on logits from fake samples.
-        loss_feat (FeatLossType): Feature matching loss function for generator training.
-        normalize (bool): Whether to normalize by number of sub-discriminators.
-    Example of usage:
-        adv_loss = AdversarialLoss(adversaries, optimizer, loss, loss_real, loss_fake)
-        for real in loader:
-            noise = torch.randn(...)
-            fake = model(noise)
-            adv_loss.train_adv(fake, real)
-            loss, _ = adv_loss(fake, real)
-            loss.backward()
-    """
-    def __init__(self,
-                 adversary: nn.Module,
-                 optimizer: torch.optim.Optimizer,
-                 loss: AdvLossType,
-                 loss_real: AdvLossType,
-                 loss_fake: AdvLossType,
-                 loss_feat: tp.Optional[FeatLossType] = None,
-                 normalize: bool = True):
-        super().__init__()
-        self.adversary: nn.Module = adversary
-        flashy.distrib.broadcast_model(self.adversary)
-        self.optimizer = optimizer
-        self.loss = loss
-        self.loss_real = loss_real
-        self.loss_fake = loss_fake
-        self.loss_feat = loss_feat
-        self.normalize = normalize
-    def _save_to_state_dict(self, destination, prefix, keep_vars):
-        # Add the optimizer state dict inside our own.
-        super()._save_to_state_dict(destination, prefix, keep_vars)
-        destination[prefix + 'optimizer'] = self.optimizer.state_dict()
-        return destination
-    def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
-        # Load optimizer state; the entry is popped before the regular loading runs.
-        self.optimizer.load_state_dict(state_dict.pop(prefix + 'optimizer'))
-        super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
-    def get_adversary_pred(self, x):
-        """Run adversary model, validating expected output format.
-        Returns:
-            The `(logits, fmaps)` pair produced by the adversary.
-        Raises:
-            AssertionError: If the logits are not a list of tensors or the feature maps are not
-                a list of lists of tensors.
-        """
-        logits, fmaps = self.adversary(x)
-        assert isinstance(logits, list) and all(isinstance(t, torch.Tensor) for t in logits), \
-            f'Expecting a list of tensors as logits but {type(logits)} found.'
-        assert isinstance(fmaps, list), f'Expecting a list of features maps but {type(fmaps)} found.'
-        for fmap in fmaps:
-            assert isinstance(fmap, list) and all(isinstance(f, torch.Tensor) for f in fmap), \
-                f'Expecting a list of tensors as feature maps but {type(fmap)} found.'
-        return logits, fmaps
-    def train_adv(self, fake: torch.Tensor, real: torch.Tensor) -> torch.Tensor:
-        """Train the adversary with the given fake and real example.
-        We assume the adversary output is the following format: Tuple[List[torch.Tensor], List[List[torch.Tensor]]].
-        The first item being the logits and second item being a list of feature maps for each sub-discriminator.
-        This will automatically synchronize gradients (with `flashy.distrib.eager_sync_model`)
-        and call the optimizer.
-        Returns:
-            The adversary loss that was back-propagated.
-        """
-        loss = torch.tensor(0., device=fake.device)
-        # Both inputs are detached: only the adversary receives gradients here.
-        all_logits_fake_is_fake, _ = self.get_adversary_pred(fake.detach())
-        all_logits_real_is_fake, _ = self.get_adversary_pred(real.detach())
-        n_sub_adversaries = len(all_logits_fake_is_fake)
-        for logit_fake_is_fake, logit_real_is_fake in zip(all_logits_fake_is_fake, all_logits_real_is_fake):
-            loss += self.loss_fake(logit_fake_is_fake) + self.loss_real(logit_real_is_fake)
-        if self.normalize:
-            loss /= n_sub_adversaries
-        self.optimizer.zero_grad()
-        with flashy.distrib.eager_sync_model(self.adversary):
-            loss.backward()
-        self.optimizer.step()
-        return loss
-    def forward(self, fake: torch.Tensor, real: torch.Tensor) -> tp.Tuple[torch.Tensor, torch.Tensor]:
-        """Return the loss for the generator, i.e. trying to fool the adversary,
-        and feature matching loss if provided.
-        Returns:
-            `(adv, feat)`; `feat` stays at zero when no feature matching loss was configured.
-        """
-        adv = torch.tensor(0., device=fake.device)
-        feat = torch.tensor(0., device=fake.device)
-        with flashy.utils.readonly(self.adversary):
-            all_logits_fake_is_fake, all_fmap_fake = self.get_adversary_pred(fake)
-            # Only the feature maps of the real prediction are needed on the generator side.
-            _, all_fmap_real = self.get_adversary_pred(real)
-            n_sub_adversaries = len(all_logits_fake_is_fake)
-            for logit_fake_is_fake in all_logits_fake_is_fake:
-                adv += self.loss(logit_fake_is_fake)
-            # Explicit None check: `loss_feat` is optional and its truthiness is not meaningful.
-            if self.loss_feat is not None:
-                for fmap_fake, fmap_real in zip(all_fmap_fake, all_fmap_real):
-                    feat += self.loss_feat(fmap_fake, fmap_real)
-        if self.normalize:
-            adv /= n_sub_adversaries
-            feat /= n_sub_adversaries
-        return adv, feat
-def get_adv_criterion(loss_type: str) -> tp.Callable:
-    """Return the generator-side adversarial loss function registered under `loss_type`."""
-    assert loss_type in ADVERSARIAL_LOSSES
-    criteria = (('mse', mse_loss), ('hinge', hinge_loss), ('hinge2', hinge2_loss))
-    for name, criterion in criteria:
-        if loss_type == name:
-            return criterion
-    raise ValueError('Unsupported loss')
-def get_fake_criterion(loss_type: str) -> tp.Callable:
-    """Return the adversary-side loss applied to logits of fake samples for `loss_type`."""
-    assert loss_type in ADVERSARIAL_LOSSES
-    if loss_type == 'mse':
-        return mse_fake_loss
-    # Both hinge variants share the same fake-sample criterion.
-    if loss_type in ['hinge', 'hinge2']:
-        return hinge_fake_loss
-    raise ValueError('Unsupported loss')
-def get_real_criterion(loss_type: str) -> tp.Callable:
-    """Return the adversary-side loss applied to logits of real samples for `loss_type`."""
-    assert loss_type in ADVERSARIAL_LOSSES
-    if loss_type == 'mse':
-        return mse_real_loss
-    # Both hinge variants share the same real-sample criterion.
-    if loss_type in ['hinge', 'hinge2']:
-        return hinge_real_loss
-    raise ValueError('Unsupported loss')
-def mse_real_loss(x: torch.Tensor) -> torch.Tensor:
-    """Mean squared error between `x` and an all-ones target."""
-    ones_target = torch.tensor(1., device=x.device).expand_as(x)
-    return F.mse_loss(x, ones_target)
-def mse_fake_loss(x: torch.Tensor) -> torch.Tensor:
-    """Mean squared error between `x` and an all-zeros target."""
-    zeros_target = torch.tensor(0., device=x.device).expand_as(x)
-    return F.mse_loss(x, zeros_target)
-def hinge_real_loss(x: torch.Tensor) -> torch.Tensor:
-    """Hinge loss on real logits: negated mean of `min(x - 1, 0)`."""
-    zero = torch.tensor(0., device=x.device).expand_as(x)
-    clipped = torch.min(x - 1, zero)
-    return -torch.mean(clipped)
-def hinge_fake_loss(x: torch.Tensor) -> torch.Tensor:
-    """Hinge loss on fake logits: negated mean of `min(-x - 1, 0)`."""
-    zero = torch.tensor(0., device=x.device).expand_as(x)
-    clipped = torch.min(-x - 1, zero)
-    return -torch.mean(clipped)
-def mse_loss(x: torch.Tensor) -> torch.Tensor:
-    """Mean squared error between `x` and an all-ones target; a one-element zero tensor for empty `x`."""
-    if x.numel() != 0:
-        ones_target = torch.tensor(1., device=x.device).expand_as(x)
-        return F.mse_loss(x, ones_target)
-    return torch.tensor([0.0], device=x.device)
-def hinge_loss(x: torch.Tensor) -> torch.Tensor:
-    """Negated mean of `x`; a one-element zero tensor for empty `x`."""
-    if x.numel() != 0:
-        return -x.mean()
-    return torch.tensor([0.0], device=x.device)
-def hinge2_loss(x: torch.Tensor) -> torch.Tensor:
-    """Negated mean of `min(x - 1, 0)`; a one-element zero tensor for empty `x`.
-    The empty-input result is created on `x.device` so that it can be accumulated with
-    losses computed on that device.
-    """
-    if x.numel() == 0:
-        return torch.tensor([0.0], device=x.device)
-    return -torch.mean(torch.min(x - 1, torch.tensor(0., device=x.device).expand_as(x)))
-class FeatureMatchingLoss(nn.Module):
-    """Feature matching loss for adversarial training.
-    Sums `loss(fake, real)` over paired feature maps and optionally averages over their count.
-    Args:
-        loss (nn.Module): Loss to use for feature matching (default=torch.nn.L1Loss).
-        normalize (bool): Whether to normalize the loss by number of feature maps.
-    """
-    def __init__(self, loss: nn.Module = torch.nn.L1Loss(), normalize: bool = True):
-        super().__init__()
-        self.loss = loss
-        self.normalize = normalize
-    def forward(self, fmap_fake: tp.List[torch.Tensor], fmap_real: tp.List[torch.Tensor]) -> torch.Tensor:
-        """Return the accumulated feature matching loss between the two lists of feature maps.
-        Raises:
-            AssertionError: If the lists are empty, differ in length, or a pair differs in shape.
-        """
-        assert len(fmap_fake) == len(fmap_real) and len(fmap_fake) > 0
-        feat_loss = torch.tensor(0., device=fmap_fake[0].device)
-        n_fmaps = 0
-        for (feat_fake, feat_real) in zip(fmap_fake, fmap_real):
-            assert feat_fake.shape == feat_real.shape
-            n_fmaps += 1
-            feat_loss += self.loss(feat_fake, feat_real)
-        if self.normalize:
-            feat_loss /= n_fmaps
-        return feat_loss

audiocraft/audiocraft/data/__init__.py DELETED Viewed

@@ -1,10 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""Audio loading and writing support. Datasets for raw audio
-or also including some metadata."""
-# flake8: noqa
-from . import audio, audio_dataset, info_audio_dataset, music_dataset, sound_dataset, btc_chords

audiocraft/audiocraft/data/__pycache__/__init__.cpython-311.pyc DELETED Viewed

Binary file (493 Bytes)

audiocraft/audiocraft/data/__pycache__/audio.cpython-311.pyc DELETED Viewed

Binary file (14.9 kB)

audiocraft/audiocraft/data/__pycache__/audio_dataset.cpython-311.pyc DELETED Viewed

Binary file (36.7 kB)

audiocraft/audiocraft/data/__pycache__/audio_utils.cpython-311.pyc DELETED Viewed

Binary file (21.4 kB)

audiocraft/audiocraft/data/__pycache__/btc_chords.cpython-311.pyc DELETED Viewed

Binary file (23.4 kB)

audiocraft/audiocraft/data/__pycache__/chords.cpython-311.pyc DELETED Viewed

Binary file (23.4 kB)

audiocraft/audiocraft/data/__pycache__/info_audio_dataset.cpython-311.pyc DELETED Viewed

Binary file (7.63 kB)

audiocraft/audiocraft/data/__pycache__/music_dataset.cpython-311.pyc DELETED Viewed

Binary file (21.8 kB)

audiocraft/audiocraft/data/__pycache__/sound_dataset.cpython-311.pyc DELETED Viewed

Binary file (18.8 kB)

audiocraft/audiocraft/data/__pycache__/zip.cpython-311.pyc DELETED Viewed

Binary file (3.68 kB)

audiocraft/audiocraft/data/audio.py DELETED Viewed

@@ -1,257 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Audio IO methods are defined in this module (info, read, write),
-We rely on av library for faster read when possible, otherwise on torchaudio.
-"""
-from dataclasses import dataclass
-from pathlib import Path
-import logging
-import typing as tp
-import numpy as np
-import soundfile
-import torch
-from torch.nn import functional as F
-import torchaudio as ta
-import av
-from .audio_utils import f32_pcm, i16_pcm, normalize_audio
-_av_initialized = False
-def _init_av():
-    """On first call, raise the level of the 'libav.mp3' logger to ERROR; later calls are no-ops."""
-    global _av_initialized
-    if not _av_initialized:
-        logging.getLogger('libav.mp3').setLevel(logging.ERROR)
-        _av_initialized = True
-@dataclass(frozen=True)
-class AudioFileInfo:
-    """Immutable summary of an audio file, as returned by the `*_info` helpers."""
-    # Sample rate reported by the backend.
-    sample_rate: int
-    # Total duration (presumably in seconds -- TODO confirm against the backends).
-    duration: float
-    # Number of audio channels.
-    channels: int
-def _av_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
-    """Read sample rate, duration and channel count of the first audio stream using PyAV."""
-    _init_av()
-    with av.open(str(filepath)) as container:
-        audio_stream = container.streams.audio[0]
-        rate = audio_stream.codec_context.sample_rate
-        # The stream duration is expressed in `time_base` units.
-        length = float(audio_stream.duration * audio_stream.time_base)
-        return AudioFileInfo(rate, length, audio_stream.channels)
-def _soundfile_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
-    """Read sample rate, duration and channel count of a file using `soundfile.info`."""
-    sf_info = soundfile.info(filepath)
-    return AudioFileInfo(
-        sample_rate=sf_info.samplerate,
-        duration=sf_info.duration,
-        channels=sf_info.channels,
-    )
-def audio_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
-    """Return the `AudioFileInfo` of a file, using soundfile for .flac/.ogg and PyAV otherwise."""
-    # torchaudio no longer returns useful duration information for some formats like mp3s.
-    filepath = Path(filepath)
-    # ffmpeg has some weird issue with flac.
-    # TODO: Validate .ogg can be safely read with av_info
-    use_soundfile = filepath.suffix in ['.flac', '.ogg']
-    backend = _soundfile_info if use_soundfile else _av_info
-    return backend(filepath)
-def _av_read(filepath: tp.Union[str, Path], seek_time: float = 0, duration: float = -1.) -> tp.Tuple[torch.Tensor, int]:
-    """FFMPEG-based audio file reading using PyAV bindings.
-    Soundfile cannot read mp3 and av_read is more efficient than torchaudio.
-    Args:
-        filepath (str or Path): Path to audio file to read.
-        seek_time (float): Time at which to start reading in the file.
-        duration (float): Duration to read from the file. If set to -1, the whole file is read.
-    Returns:
-        tuple of torch.Tensor, int: Tuple containing audio data and sample rate
-    Raises:
-        AssertionError: If no frame could be decoded, or if the decoded audio does not
-            have `stream.channels` rows.
-    """
-    _init_av()
-    with av.open(str(filepath)) as af:
-        stream = af.streams.audio[0]
-        sr = stream.codec_context.sample_rate
-        # Any negative duration means "no limit" and is encoded as -1 frames.
-        num_frames = int(sr * duration) if duration >= 0 else -1
-        frame_offset = int(sr * seek_time)
-        # we need a small negative offset otherwise we get some edge artifact
-        # from the mp3 decoder.
-        af.seek(int(max(0, (seek_time - 0.1)) / stream.time_base), stream=stream)
-        frames = []
-        length = 0
-        for frame in af.decode(streams=stream.index):
-            # Position of this frame's first sample, in samples.
-            current_offset = int(frame.rate * frame.pts * frame.time_base)
-            # Leading samples to drop so that the output starts at `frame_offset`
-            # (we seeked slightly before the requested time).
-            strip = max(0, frame_offset - current_offset)
-            buf = torch.from_numpy(frame.to_ndarray())
-            if buf.shape[0] != stream.channels:
-                # presumably an interleaved (packed) layout; reshape to (channels, samples)
-                # -- TODO confirm against PyAV's `to_ndarray` layout.
-                buf = buf.view(-1, stream.channels).t()
-            buf = buf[:, strip:]
-            frames.append(buf)
-            length += buf.shape[1]
-            # NOTE: `num_frames == 0` (i.e. `duration == 0`) never stops early, so it
-            # behaves like "read until the end" here.
-            if num_frames > 0 and length >= num_frames:
-                break
-        assert frames
-        # If the above assert fails, it is likely because we seeked past the end of file point,
-        # in which case ffmpeg returns a single frame with only zeros, and a weird timestamp.
-        # This will need proper debugging, in due time.
-        wav = torch.cat(frames, dim=1)
-        assert wav.shape[0] == stream.channels
-        if num_frames > 0:
-            # The last decoded frame may overshoot the requested length.
-            wav = wav[:, :num_frames]
-        return f32_pcm(wav), sr
-def audio_read(filepath: tp.Union[str, Path], seek_time: float = 0.,
-               duration: float = -1., pad: bool = False) -> tp.Tuple[torch.Tensor, int]:
-    """Read audio by picking the most appropriate backend tool based on the audio format.
-    `.flac` and `.ogg` files are read with soundfile; whole-file reads of `.wav`/`.mp3`
-    go through torchaudio when its sox backend lists the format; everything else
-    falls back to `_av_read`.
-    Args:
-        filepath (str or Path): Path to audio file to read.
-        seek_time (float): Time at which to start reading in the file.
-        duration (float): Duration to read from the file. If set to -1, the whole file is read.
-        pad (bool): Pad output audio if not reaching expected duration.
-    Returns:
-        tuple of torch.Tensor, int: Tuple containing audio data and sample rate.
-    """
-    fp = Path(filepath)
-    if fp.suffix in ['.flac', '.ogg']:  # TODO: check if we can safely use av_read for .ogg
-        # There is some bug with ffmpeg and reading flac
-        info = _soundfile_info(filepath)
-        # Any non-positive duration means "read until the end of the file".
-        frames = -1 if duration <= 0 else int(duration * info.sample_rate)
-        frame_offset = int(seek_time * info.sample_rate)
-        wav, sr = soundfile.read(filepath, start=frame_offset, frames=frames, dtype=np.float32)
-        assert info.sample_rate == sr, f"Mismatch of sample rates {info.sample_rate} {sr}"
-        # Transpose what soundfile returned so that samples end up on the last axis.
-        wav = torch.from_numpy(wav).t().contiguous()
-        if len(wav.shape) == 1:
-            # A 1-D result (presumably a mono file) gets a leading axis of size 1.
-            wav = torch.unsqueeze(wav, 0)
-    elif (
-        fp.suffix in ['.wav', '.mp3'] and fp.suffix[1:] in ta.utils.sox_utils.list_read_formats()
-        and duration <= 0 and seek_time == 0
-    ):
-        # Torchaudio is faster if we load an entire file at once.
-        wav, sr = ta.load(fp)
-    else:
-        wav, sr = _av_read(filepath, seek_time, duration)
-    if pad and duration > 0:
-        expected_frames = int(duration * sr)
-        # NOTE(review): if `wav` is longer than `expected_frames` the pad amount is
-        # negative -- confirm this is intended.
-        wav = F.pad(wav, (0, expected_frames - wav.shape[-1]))
-    return wav, sr
-def audio_write(stem_name: tp.Union[str, Path],
-                wav: torch.Tensor, sample_rate: int,
-                format: str = 'wav', mp3_rate: int = 320, normalize: bool = True,
-                strategy: str = 'peak', peak_clip_headroom_db: float = 1,
-                rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
-                loudness_compressor: bool = False,
-                log_clipping: bool = True, make_parent_dir: bool = True,
-                add_suffix: bool = True) -> Path:
-    """Convenience function for saving audio to disk. Returns the filename the audio was written to.
-    Args:
-        stem_name (str or Path): Filename without extension which will be added automatically.
-        wav (torch.Tensor): Floating point audio data with 1 or 2 dimensions;
-            a 1-dimensional input gets a leading dimension added. All values must be finite.
-        sample_rate (int): Sample rate of audio data.
-        format (str): Either "wav" or "mp3".
-        mp3_rate (int): kbps when using mp3s.
-        normalize (bool): if `True` (default), normalizes according to the prescribed
-            strategy (see after). If `False`, the strategy is only used in case clipping
-            would happen.
-        strategy (str): Can be either 'clip', 'peak', or 'rms'. Default is 'peak',
-            i.e. audio is normalized by its largest value. RMS normalizes by root-mean-square
-            with extra headroom to avoid clipping. 'clip' just clips.
-        peak_clip_headroom_db (float): Headroom in dB when doing 'peak' or 'clip' strategy.
-        rms_headroom_db (float): Headroom in dB when doing 'rms' strategy. This must be much larger
-            than the `peak_clip` one to avoid further clipping.
-        loudness_headroom_db (float): Target loudness for loudness normalization.
-        loudness_compressor (bool): Uses tanh for soft clipping when strategy is 'loudness'.
-        log_clipping (bool): If True, basic logging on stderr when clipping still
-            occurs despite strategy (only for 'rms').
-        make_parent_dir (bool): Make parent directory if it doesn't exist.
-        add_suffix (bool): If False, no extension is appended to `stem_name`.
-    Returns:
-        Path: Path of the saved audio.
-    Raises:
-        ValueError: If `wav` has more than 2 dimensions.
-        RuntimeError: If `format` is neither "wav" nor "mp3".
-    """
-    assert wav.dtype.is_floating_point, "wav is not floating point"
-    if wav.dim() == 1:
-        wav = wav[None]
-    elif wav.dim() > 2:
-        raise ValueError("Input wav should be at most 2 dimension.")
-    assert wav.isfinite().all()
-    wav = normalize_audio(wav, normalize, strategy, peak_clip_headroom_db,
-                          rms_headroom_db, loudness_headroom_db, loudness_compressor,
-                          log_clipping=log_clipping, sample_rate=sample_rate,
-                          stem_name=str(stem_name))
-    # Extra keyword arguments forwarded to `ta.save`, depending on the format.
-    kwargs: dict = {}
-    if format == 'mp3':
-        suffix = '.mp3'
-        kwargs.update({"compression": mp3_rate})
-    elif format == 'wav':
-        # wav output is stored as 16 bit signed PCM.
-        wav = i16_pcm(wav)
-        suffix = '.wav'
-        kwargs.update({"encoding": "PCM_S", "bits_per_sample": 16})
-    else:
-        raise RuntimeError(f"Invalid format {format}. Only wav or mp3 are supported.")
-    if not add_suffix:
-        suffix = ''
-    path = Path(str(stem_name) + suffix)
-    if make_parent_dir:
-        path.parent.mkdir(exist_ok=True, parents=True)
-    try:
-        ta.save(path, wav, sample_rate, **kwargs)
-    except Exception:
-        if path.exists():
-            # we do not want to leave half written files around.
-            path.unlink()
-        raise
-    return path
-def audio_postproc(wav: torch.Tensor, sample_rate: int, normalize: bool = True,
-                strategy: str = 'peak', peak_clip_headroom_db: float = 1,
-                rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
-                loudness_compressor: bool = False, log_clipping: bool = True) -> torch.Tensor:
-    """Validate and normalize audio in memory. Nothing is written to disk.
-    This applies the same checks and the same `normalize_audio` call as `audio_write`,
-    but returns the processed tensor instead of saving it.
-    Args:
-        wav (torch.Tensor): Floating point audio data with 1 or 2 dimensions;
-            a 1-dimensional input gets a leading dimension added. All values must be finite.
-        sample_rate (int): Sample rate of audio data.
-        normalize (bool): if `True` (default), normalizes according to the prescribed
-            strategy (see after). If `False`, the strategy is only used in case clipping
-            would happen.
-        strategy (str): Can be either 'clip', 'peak', or 'rms'. Default is 'peak',
-            i.e. audio is normalized by its largest value. RMS normalizes by root-mean-square
-            with extra headroom to avoid clipping. 'clip' just clips.
-        peak_clip_headroom_db (float): Headroom in dB when doing 'peak' or 'clip' strategy.
-        rms_headroom_db (float): Headroom in dB when doing 'rms' strategy. This must be much larger
-            than the `peak_clip` one to avoid further clipping.
-        loudness_headroom_db (float): Target loudness for loudness normalization.
-        loudness_compressor (bool): Uses tanh for soft clipping when strategy is 'loudness'.
-        log_clipping (bool): If True, basic logging on stderr when clipping still
-            occurs despite strategy (only for 'rms').
-    Returns:
-        torch.Tensor: The normalized audio, as returned by `normalize_audio`.
-    Raises:
-        ValueError: If `wav` has more than 2 dimensions.
-    """
-    assert wav.dtype.is_floating_point, "wav is not floating point"
-    if wav.dim() == 1:
-        wav = wav[None]
-    elif wav.dim() > 2:
-        raise ValueError("Input wav should be at most 2 dimension.")
-    assert wav.isfinite().all()
-    # No file is involved here, hence no stem name to report in clipping logs.
-    wav = normalize_audio(wav, normalize, strategy, peak_clip_headroom_db,
-                          rms_headroom_db, loudness_headroom_db, loudness_compressor,
-                          log_clipping=log_clipping, sample_rate=sample_rate,
-                          stem_name=None)
-    return wav

audiocraft/audiocraft/data/audio_dataset.py DELETED Viewed

@@ -1,614 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""AudioDataset support. In order to handle a larger number of files
-without having to scan again the folders, we precompute some metadata
-(filename, sample rate, duration), and use that to efficiently sample audio segments.
-"""
-import argparse
-import copy
-from concurrent.futures import ThreadPoolExecutor, Future
-from dataclasses import dataclass, fields
-from contextlib import ExitStack
-from functools import lru_cache
-import gzip
-import json
-import logging
-import os
-from pathlib import Path
-import random
-import sys
-import typing as tp
-import torch
-import torch.nn.functional as F
-from .audio import audio_read, audio_info
-from .audio_utils import convert_audio
-from .zip import PathInZip
-try:
-    import dora
-except ImportError:
-    dora = None  # type: ignore
-@dataclass(order=True)
-class BaseInfo:
-    """Base dataclass providing conversion from and to plain dictionaries."""
-    @classmethod
-    def _dict2fields(cls, dictionary: dict):
-        """Keep only the entries of `dictionary` whose key is a field name of `cls`."""
-        field_names = (f.name for f in fields(cls))
-        return {name: dictionary[name] for name in field_names if name in dictionary}
-    @classmethod
-    def from_dict(cls, dictionary: dict):
-        """Build an instance from `dictionary`, ignoring keys that are not fields."""
-        return cls(**cls._dict2fields(dictionary))
-    def to_dict(self):
-        """Return a dictionary mapping every field name to its current value."""
-        return {f.name: getattr(self, f.name) for f in fields(self)}
-@dataclass(order=True)
-class AudioMeta(BaseInfo):
-    """Metadata describing one audio file, convertible from and to plain dictionaries."""
-    path: str
-    duration: float
-    sample_rate: int
-    bpm: float
-    # meter: int
-    amplitude: tp.Optional[float] = None
-    weight: tp.Optional[float] = None
-    # Optional list of phrase start times (None when the file has no phrase annotation).
-    phr_start: tp.Optional[tp.List[float]] = None
-    # info_path is used to load additional information about the audio file that is stored in zip files.
-    info_path: tp.Optional[PathInZip] = None
-    @classmethod
-    def from_dict(cls, dictionary: dict):
-        """Build an AudioMeta from `dictionary`, wrapping a non-None 'info_path' in `PathInZip`."""
-        base = cls._dict2fields(dictionary)
-        if 'info_path' in base and base['info_path'] is not None:
-            base['info_path'] = PathInZip(base['info_path'])
-        return cls(**base)
-    def to_dict(self):
-        """Return the fields as a dictionary, with a non-None 'info_path' converted to `str`."""
-        d = super().to_dict()
-        if d['info_path'] is not None:
-            d['info_path'] = str(d['info_path'])
-        return d
-@dataclass(order=True)
-class SegmentInfo(BaseInfo):
-    """Description of one audio segment extracted from the file described by `meta`."""
-    # Metadata of the source audio file.
-    meta: AudioMeta
-    # Position in the source file at which the segment starts.
-    seek_time: float
-    # The following values are given once the audio is processed, e.g.
-    # at the target sample rate and target number of channels.
-    n_frames: int      # actual number of frames without padding
-    total_frames: int  # total number of frames, padding included
-    sample_rate: int   # actual sample rate
-    channels: int      # number of audio channels.
-DEFAULT_EXTS = ['.wav', '.mp3', '.flac', '.ogg', '.m4a']
-logger = logging.getLogger(__name__)
-def _get_audio_meta(file_path: str, minimal: bool = True) -> AudioMeta:
-    """AudioMeta from a path to an audio file and its sidecar JSON file.
-    The JSON file is looked up next to the audio file, under the same name with the
-    extension replaced by `.json`. It must contain a "bpm" entry and may contain
-    a "phr_start" entry.
-    Args:
-        file_path (str): Resolved path of valid audio file.
-        minimal (bool): Whether to only load the minimal set of metadata (takes longer if not).
-    Returns:
-        AudioMeta: Audio file path and its metadata.
-    Raises:
-        OSError: If the sidecar JSON file cannot be opened.
-        KeyError: If the JSON file has no "bpm" entry.
-    """
-    info = audio_info(file_path)
-    amplitude: tp.Optional[float] = None
-    if not minimal:
-        wav, _ = audio_read(file_path)
-        amplitude = wav.abs().max().item()
-    # Swap only the final extension. A plain `.replace('.wav', '.json')` leaves non-wav
-    # paths unchanged (the audio file itself would then be parsed as JSON) and also
-    # rewrites any '.wav' occurring earlier in the path.
-    json_file = os.path.splitext(file_path)[0] + '.json'
-    with open(json_file, 'r') as f:
-        info_json = json.load(f)
-    # "phr_start" is optional; the "meter" entry is currently not stored in AudioMeta.
-    phr_start = info_json.get("phr_start")
-    return AudioMeta(file_path, info.duration, info.sample_rate, info_json["bpm"], amplitude, None, phr_start)
-def _resolve_audio_meta(m: AudioMeta, fast: bool = True) -> AudioMeta:
-    """If Dora is available as a dependency, try to resolve potential relative paths
-    in list of AudioMeta. This method is expected to be used when loading meta from file.
-    Args:
-        m (AudioMeta): Audio meta to resolve. It is modified in place.
-        fast (bool): If True, uses a really fast check for determining if a file
-            is already absolute or not. Only valid on Linux/Mac.
-    Returns:
-        AudioMeta: Audio meta with resolved path.
-    """
-    def is_abs(path) -> bool:
-        if fast:
-            # Cheap POSIX-only check; `startswith` also copes with an empty string.
-            return str(path).startswith('/')
-        return os.path.isabs(str(path))
-    if not dora:
-        return m
-    if not is_abs(m.path):
-        m.path = dora.git_save.to_absolute_path(m.path)
-    if m.info_path is not None and not is_abs(m.info_path.zip_path):
-        # Resolve the zip archive's own path, not the audio file path.
-        m.info_path.zip_path = dora.git_save.to_absolute_path(m.info_path.zip_path)
-    return m
-def find_audio_files(path: tp.Union[Path, str],
-                     exts: tp.List[str] = DEFAULT_EXTS,
-                     resolve: bool = True,
-                     minimal: bool = True,
-                     progress: bool = False,
-                     workers: int = 0) -> tp.List[AudioMeta]:
-    """Build a list of AudioMeta from a given path,
-    collecting relevant audio files and fetching meta info.
-    Files whose metadata cannot be obtained are reported on stderr and skipped.
-    Args:
-        path (str or Path): Path to folder containing audio files.
-        exts (list of str): List of file extensions to consider for audio files.
-            They are compared against the lowercased file suffix.
-        resolve (bool): Whether to pass each AudioMeta through `_resolve_audio_meta`.
-        minimal (bool): Whether to only load the minimal set of metadata (takes longer if not).
-        progress (bool): Whether to log progress on audio files collection.
-        workers (int): number of parallel workers, if 0, use only the current thread.
-    Returns:
-        list of AudioMeta: Sorted list of audio file path and its metadata.
-    """
-    audio_files = []
-    futures: tp.List[Future] = []
-    pool: tp.Optional[ThreadPoolExecutor] = None
-    # The ExitStack lets us enter the thread pool context only when one is requested.
-    with ExitStack() as stack:
-        if workers > 0:
-            pool = ThreadPoolExecutor(workers)
-            stack.enter_context(pool)
-        if progress:
-            print("Finding audio files...")
-        for root, folders, files in os.walk(path, followlinks=True):
-            for file in files:
-                full_path = Path(root) / file
-                if full_path.suffix.lower() in exts:
-                    audio_files.append(full_path)
-                    if pool is not None:
-                        # One future per collected file, so `futures[i]` matches `audio_files[i]`.
-                        futures.append(pool.submit(_get_audio_meta, str(audio_files[-1]), minimal))
-                    if progress:
-                        print(format(len(audio_files), " 8d"), end='\r', file=sys.stderr)
-        if progress:
-            print("Getting audio metadata...")
-        meta: tp.List[AudioMeta] = []
-        for idx, file_path in enumerate(audio_files):
-            try:
-                if pool is None:
-                    m = _get_audio_meta(str(file_path), minimal)
-                else:
-                    # Exceptions raised in the worker are re-raised here by `result()`.
-                    m = futures[idx].result()
-                if resolve:
-                    m = _resolve_audio_meta(m)
-            except Exception as err:
-                # Best effort: report the failing file and carry on with the others.
-                print("Error with", str(file_path), err, file=sys.stderr)
-                continue
-            meta.append(m)
-            if progress:
-                print(format((1 + idx) / len(audio_files), " 3.1%"), end='\r', file=sys.stderr)
-    # AudioMeta is an ordered dataclass, so this sorts by its fields, `path` first.
-    meta.sort()
-    return meta
-def load_audio_meta(path: tp.Union[str, Path],
-                    resolve: bool = True, fast: bool = True) -> tp.List[AudioMeta]:
-    """Load a list of AudioMeta from a file holding one JSON object per line.
-    The file is read through gzip when its name ends with `.gz` (case insensitive).
-    Args:
-        path (str or Path): Path to JSON file.
-        resolve (bool): Whether to resolve the path from AudioMeta (default=True).
-        fast (bool): Forwarded to `_resolve_audio_meta`.
-    Returns:
-        list of AudioMeta: One AudioMeta per line of the file, in file order.
-    """
-    is_gzipped = str(path).lower().endswith('.gz')
-    opener = gzip.open if is_gzipped else open
-    with opener(path, 'rb') as fp:  # type: ignore
-        raw_lines = fp.readlines()
-    entries = []
-    for raw in raw_lines:
-        entry = AudioMeta.from_dict(json.loads(raw))
-        if resolve:
-            entry = _resolve_audio_meta(entry, fast=fast)
-        entries.append(entry)
-    return entries
-def save_audio_meta(path: tp.Union[str, Path], meta: tp.List[AudioMeta]):
-    """Write the audio metadata to `path`, one UTF-8 encoded JSON object per line.
-    The parent directory is created if needed, and the output is gzip-compressed
-    when the file name ends with `.gz` (case insensitive).
-    Args:
-        path (str or Path): Path to JSON file.
-        meta (list of AudioMeta): List of audio meta to save.
-    """
-    Path(path).parent.mkdir(exist_ok=True, parents=True)
-    is_gzipped = str(path).lower().endswith('.gz')
-    opener = gzip.open if is_gzipped else open
-    with opener(path, 'wb') as fp:  # type: ignore
-        for item in meta:
-            line = json.dumps(item.to_dict()) + '\n'
-            fp.write(line.encode('utf-8'))
-class AudioDataset:
-    """Base audio dataset.
-    The dataset takes a list of AudioMeta and create a dataset composed of segments of audio
-    and potentially additional information, by creating random segments from the list of audio
-    files referenced in the metadata and applying minimal data pre-processing such as resampling,
-    mixing of channels, padding, etc.
-    If no segment_duration value is provided, the AudioDataset will return the full wav for each
-    audio file. Otherwise, it will randomly sample audio files and create a segment of the specified
-    duration, applying padding if required.
-    By default, only the torch Tensor corresponding to the waveform is returned. Setting return_info=True
-    allows to return a tuple containing the torch Tensor and additional metadata on the segment and the
-    original audio meta.
-    Note that you can call `start_epoch(epoch)` in order to get
-    a deterministic "randomization" for `shuffle=True`.
-    For a given epoch and dataset index, this will always return the same extract.
-    You can get back some diversity by setting the `shuffle_seed` param.
-    Args:
-        meta (list of AudioMeta): List of audio files metadata.
-        segment_duration (float, optional): Optional segment duration of audio to load.
-            If not specified, the dataset will load the full audio segment from the file.
-        shuffle (bool): Set to `True` to have the data reshuffled at every epoch.
-        sample_rate (int): Target sample rate of the loaded audio samples.
-        channels (int): Target number of channels of the loaded audio samples.
-        sample_on_duration (bool): Set to `True` to sample segments with probability
-            dependent on audio file duration. This is only used if `segment_duration` is provided.
-        sample_on_weight (bool): Set to `True` to sample segments using the `weight` entry of
-            `AudioMeta`. If `sample_on_duration` is also True, the actual weight will be the product
-            of the file duration and file weight. This is only used if `segment_duration` is provided.
-        min_segment_ratio (float): Minimum segment ratio to use when the audio file
-            is shorter than the desired segment.
-        max_read_retry (int): Maximum number of retries to sample an audio segment from the dataset.
-        return_info (bool): Whether to return the wav only or return wav along with segment info and metadata.
-        min_audio_duration (float, optional): Minimum audio file duration, in seconds, if provided
-            audio shorter than this will be filtered out.
-        max_audio_duration (float, optional): Maximal audio file duration in seconds, if provided
-            audio longer than this will be filtered out.
-        shuffle_seed (int): can be used to further randomize
-        load_wav (bool): if False, skip loading the wav but returns a tensor of 0
-            with the expected segment_duration (which must be provided if load_wav is False).
-        permutation_on_files (bool): only if `sample_on_weight` and `sample_on_duration`
-            are False. Will ensure a permutation on files when going through the dataset.
-            In that case the epoch number must be provided in order for the model
-            to continue the permutation across epochs. In that case, it is assumed
-            that `num_samples = total_batch_size * num_updates_per_epoch`, with
-            `total_batch_size` the overall batch size accounting for all gpus.
-    """
-    def __init__(self,
-                 meta: tp.List[AudioMeta],
-                 segment_duration: tp.Optional[float] = None,
-                 shuffle: bool = True,
-                 num_samples: int = 10_000,
-                 sample_rate: int = 48_000,
-                 channels: int = 2,
-                 pad: bool = True,
-                 sample_on_duration: bool = True,
-                 sample_on_weight: bool = True,
-                 min_segment_ratio: float = 1,
-                 max_read_retry: int = 10,
-                 return_info: bool = False,
-                 min_audio_duration: tp.Optional[float] = None,
-                 max_audio_duration: tp.Optional[float] = None,
-                 shuffle_seed: int = 0,
-                 load_wav: bool = True,
-                 permutation_on_files: bool = False,
-                 ):
-        assert len(meta) > 0, "No audio meta provided to AudioDataset. Please check loading of audio meta."
-        assert segment_duration is None or segment_duration > 0
-        assert segment_duration is None or min_segment_ratio >= 0
-        self.segment_duration = segment_duration
-        self.min_segment_ratio = min_segment_ratio
-        self.max_audio_duration = max_audio_duration
-        self.min_audio_duration = min_audio_duration
-        if self.min_audio_duration is not None and self.max_audio_duration is not None:
-            assert self.min_audio_duration <= self.max_audio_duration
-        self.meta: tp.List[AudioMeta] = self._filter_duration(meta)
-        assert len(self.meta)  # Fail fast if all data has been filtered.
-        self.total_duration = sum(d.duration for d in self.meta)
-        if segment_duration is None:
-            num_samples = len(self.meta)
-        self.num_samples = num_samples
-        self.shuffle = shuffle
-        self.sample_rate = sample_rate
-        self.channels = channels
-        self.pad = pad
-        self.sample_on_weight = sample_on_weight
-        self.sample_on_duration = sample_on_duration
-        self.sampling_probabilities = self._get_sampling_probabilities()
-        self.max_read_retry = max_read_retry
-        self.return_info = return_info
-        self.shuffle_seed = shuffle_seed
-        self.current_epoch: tp.Optional[int] = None
-        self.load_wav = load_wav
-        if not load_wav:
-            assert segment_duration is not None
-        self.permutation_on_files = permutation_on_files
-        if permutation_on_files:
-            assert not self.sample_on_duration
-            assert not self.sample_on_weight
-            assert self.shuffle
-    def start_epoch(self, epoch: int):
-        self.current_epoch = epoch
-    def __len__(self):
-        return self.num_samples
-    def _get_sampling_probabilities(self, normalized: bool = True):
-        """Return the sampling probabilities for each file inside `self.meta`."""
-        scores: tp.List[float] = []
-        for file_meta in self.meta:
-            score = 1.
-            if self.sample_on_weight and file_meta.weight is not None:
-                score *= file_meta.weight
-            if self.sample_on_duration:
-                score *= file_meta.duration
-            scores.append(score)
-        probabilities = torch.tensor(scores)
-        if normalized:
-            probabilities /= probabilities.sum()
-        return probabilities
-    @staticmethod
-    @lru_cache(16)
-    def _get_file_permutation(num_files: int, permutation_index: int, base_seed: int):
-        # Used to keep the most recent files permutation in memory implicitely.
-        # will work unless someone is using a lot of Datasets in parallel.
-        rng = torch.Generator()
-        rng.manual_seed(base_seed + permutation_index)
-        return torch.randperm(num_files, generator=rng)
-    def sample_file(self, index: int, rng: torch.Generator) -> AudioMeta:
-        """Sample a given file from `self.meta`. Can be overridden in subclasses.
-        This is only called if `segment_duration` is not None.
-        You must use the provided random number generator `rng` for reproducibility.
-        You can further make use of the index accessed.
-        """
-        if self.permutation_on_files:
-            assert self.current_epoch is not None
-            total_index = self.current_epoch * len(self) + index
-            permutation_index = total_index // len(self.meta)
-            relative_index = total_index % len(self.meta)
-            permutation = AudioDataset._get_file_permutation(
-                len(self.meta), permutation_index, self.shuffle_seed)
-            file_index = permutation[relative_index]
-            return self.meta[file_index]
-        if not self.sample_on_weight and not self.sample_on_duration:
-            file_index = int(torch.randint(len(self.sampling_probabilities), (1,), generator=rng).item())
-        else:
-            file_index = int(torch.multinomial(self.sampling_probabilities, 1, generator=rng).item())
-        return self.meta[file_index]
-    def _audio_read(self, path: str, seek_time: float = 0, duration: float = -1):
-        # Override this method in subclass if needed.
-        if self.load_wav:
-            return audio_read(path, seek_time, duration, pad=False)
-        else:
-            assert self.segment_duration is not None
-            n_frames = int(self.sample_rate * self.segment_duration)
-            return torch.zeros(self.channels, n_frames), self.sample_rate
-    def __getitem__(self, index: int) -> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, SegmentInfo]]:
-        """Load one example: a whole file, or a randomly positioned fixed-length segment.
-        With `segment_duration=None` the file at `self.meta[index]` is read in full. Otherwise a
-        file is drawn with `sample_file` and a segment of `segment_duration` is read at a random
-        seek time, retrying up to `max_read_retry` times when reading fails.
-        Args:
-            index (int): Item index; also used to seed the per-item random generator.
-        Returns:
-            The audio tensor, or `(audio, SegmentInfo)` when `return_info` is set.
-        Raises:
-            Exception: Re-raises the last read error once all retries are exhausted.
-        """
-        if self.segment_duration is None:
-            # No segmenting: read the whole file and describe it as a single full-length segment.
-            file_meta = self.meta[index]
-            out, sr = audio_read(file_meta.path)
-            out = convert_audio(out, sr, self.sample_rate, self.channels)
-            n_frames = out.shape[-1]
-            segment_info = SegmentInfo(file_meta, seek_time=0., n_frames=n_frames, total_frames=n_frames,
-                                       sample_rate=self.sample_rate, channels=out.shape[0])
-        else:
-            # A dedicated generator keeps the sampling for this item reproducible from its seed.
-            rng = torch.Generator()
-            if self.shuffle:
-                # We use index, plus extra randomness, either totally random if we don't know the epoch.
-                # otherwise we make use of the epoch number and optional shuffle_seed.
-                if self.current_epoch is None:
-                    rng.manual_seed(index + self.num_samples * random.randint(0, 2**24))
-                else:
-                    rng.manual_seed(index + self.num_samples * (self.current_epoch + self.shuffle_seed))
-            else:
-                # We only use index
-                rng.manual_seed(index)
-            # NOTE(review): if max_read_retry is 0 this loop never runs and `out` / `segment_info`
-            # are unbound when reaching the return statements below.
-            for retry in range(self.max_read_retry):
-                file_meta = self.sample_file(index, rng)
-                # We add some variance in the file position even if audio file is smaller than segment
-                # without ending up with empty segments
-                # sample with phrase
-                if file_meta.phr_start is not None:
-                    # max_seek = max(0, len(file_meta.phr_start[:-1]))
-                    # max_seek counts the phrase starts that leave room for a full segment, then a
-                    # random index below that count is looked up in the *unfiltered* phr_start list.
-                    # NOTE(review): this only picks a fitting start when the fitting starts form a
-                    # prefix of phr_start (e.g. sorted starts); with max_seek == 0 index 0 is used,
-                    # and an empty phr_start raises IndexError (caught by the retry logic only
-                    # if it happens inside the try below, which it does not) - confirm intent.
-                    max_seek = max(0, len([start for start in file_meta.phr_start if start + self.segment_duration <= file_meta.duration])) # sample with time
-                    seek_time = file_meta.phr_start[int(torch.rand(1, generator=rng).item() * max_seek)] # choose from phrase
-                else:
-                    # Uniform seek position, leaving at least min_segment_ratio of a segment to read.
-                    max_seek = max(0, file_meta.duration - self.segment_duration * self.min_segment_ratio)
-                    seek_time = torch.rand(1, generator=rng).item() * max_seek # can be change to choose phrase start
-                    if file_meta.duration == self.segment_duration:
-                        # The file is exactly one segment long: always read it from the start.
-                        seek_time = 0
-                # phr_dur = 60./file_meta.bpm * (file_meta.meter * 4.) # if meter=4 then 16 beats per phrase
-                try:
-                    out, sr = audio_read(file_meta.path, seek_time, self.segment_duration, pad=False)
-                    # out, sr = audio_read(file_meta.path, seek_time, phr_dur, pad=False) # use phrase trunk as input
-                    out = convert_audio(out, sr, self.sample_rate, self.channels)
-                    n_frames = out.shape[-1]
-                    target_frames = int(self.segment_duration * self.sample_rate)
-                    if self.pad:
-                        # Right-pad the read audio up to the nominal segment length.
-                        out = F.pad(out, (0, target_frames - n_frames))
-                    # n_frames is the amount actually read, total_frames the nominal segment length.
-                    segment_info = SegmentInfo(file_meta, seek_time, n_frames=n_frames, total_frames=target_frames,
-                                               sample_rate=self.sample_rate, channels=out.shape[0])
-                except Exception as exc:
-                    logger.warning("Error opening file %s: %r", file_meta.path, exc)
-                    # Give up (and surface the error) only after the last allowed attempt.
-                    if retry == self.max_read_retry - 1:
-                        raise
-                else:
-                    break
-        if self.return_info:
-            # Returns the wav and additional information on the wave segment
-            return out, segment_info
-        else:
-            return out
-    def collater(self, samples):
-        """Collate a list of dataset items into a batch.
-        The collater function has to be provided to the dataloader
-        if AudioDataset has return_info=True in order to properly collate
-        the samples of a batch.
-        Args:
-            samples (list): Items produced by `__getitem__`: tensors, or `(tensor, SegmentInfo)`
-                pairs when `return_info` is set.
-        Returns:
-            The stacked audio tensor, or `(stacked audio, list of SegmentInfo copies)` when
-            `return_info` is set.
-        """
-        if self.segment_duration is None and len(samples) > 1:
-            assert self.pad, "Must allow padding when batching examples of different durations."
-        # In this case the audio reaching the collater is of variable length as segment_duration=None.
-        to_pad = self.segment_duration is None and self.pad
-        if self.return_info:
-            if len(samples) > 0:
-                assert len(samples[0]) == 2
-                assert isinstance(samples[0][0], torch.Tensor)
-                assert isinstance(samples[0][1], SegmentInfo)
-            wavs = [wav for wav, _ in samples]
-            # Copy the infos so that updating total_frames does not touch the caller's objects.
-            segment_infos = [copy.deepcopy(info) for _, info in samples]
-        else:
-            assert isinstance(samples[0], torch.Tensor)
-            wavs = list(samples)
-            segment_infos = None
-        if to_pad:
-            # Each wav could be of a different duration as they are not segmented.
-            # The maximum is taken over the wav tensors themselves: unpacking `(wav, _)` from plain
-            # tensors (return_info=False) would split them along the channel axis instead.
-            max_len = max(wav.shape[-1] for wav in wavs)
-            wavs = [F.pad(wav, (0, max_len - wav.shape[-1])) for wav in wavs]
-            if segment_infos is not None:
-                for info in segment_infos:
-                    # Determines the total length of the signal with padding, so we update here as we pad.
-                    info.total_frames = max_len
-        batch = torch.stack(wavs)
-        if self.return_info:
-            return batch, segment_infos
-        return batch
-    def _filter_duration(self, meta: tp.List[AudioMeta]) -> tp.List[AudioMeta]:
-        """Filters out audio files with audio durations that will not allow to sample examples from them.
-        Args:
-            meta (list of AudioMeta): Candidate files; only their `duration` attribute is read.
-        Returns:
-            list of AudioMeta: Entries whose duration lies within
-                `[min_audio_duration, max_audio_duration]` (each bound is skipped when `None`).
-        """
-        orig_len = len(meta)
-        if orig_len == 0:
-            # Nothing to filter, and the removed-percentage below would divide by zero.
-            return meta
-        # Filter data that is too short.
-        if self.min_audio_duration is not None:
-            meta = [m for m in meta if m.duration >= self.min_audio_duration]
-        # Filter data that is too long.
-        if self.max_audio_duration is not None:
-            meta = [m for m in meta if m.duration <= self.max_audio_duration]
-        filtered_len = len(meta)
-        removed_percentage = 100*(1-float(filtered_len)/orig_len)
-        msg = 'Removed %.2f percent of the data because it was too short or too long.' % removed_percentage
-        # Small losses are only worth a debug line; larger ones deserve a visible warning.
-        if removed_percentage < 10:
-            logging.debug(msg)
-        else:
-            logging.warning(msg)
-        return meta
-    @classmethod
-    def from_meta(cls, root: tp.Union[str, Path], **kwargs):
-        """Build an AudioDataset from a jsonl manifest, or from a directory holding one.
-        Args:
-            root (str or Path): Manifest file, or a directory containing `data.jsonl` or
-                `data.jsonl.gz` (looked up in that order).
-            kwargs: Additional keyword arguments for the AudioDataset.
-        Raises:
-            ValueError: If `root` is a directory that holds neither manifest file.
-        """
-        manifest = Path(root)
-        if manifest.is_dir():
-            candidates = (manifest / name for name in ('data.jsonl', 'data.jsonl.gz'))
-            # The generator is lazy, so the .gz variant is only probed when the plain file is missing.
-            found = next((cand for cand in candidates if cand.exists()), None)
-            if found is None:
-                raise ValueError("Don't know where to read metadata from in the dir. "
-                                 "Expecting either a data.jsonl or data.jsonl.gz file but none found.")
-            manifest = found
-        return cls(load_audio_meta(manifest), **kwargs)
-    @classmethod
-    def from_path(cls, root: tp.Union[str, Path], minimal_meta: bool = True,
-                  exts: tp.List[str] = DEFAULT_EXTS, **kwargs):
-        """Build an AudioDataset from a manifest file or by scanning a folder for audio files.
-        Args:
-            root (str or Path): A file (loaded as a manifest) or a folder that is scanned,
-                possibly recursively, for audio files.
-            minimal_meta (bool): Whether to only load minimal metadata or not.
-            exts (list of str): Extensions for audio files.
-            kwargs: Additional keyword arguments for the AudioDataset.
-        """
-        source = Path(root)
-        meta = (load_audio_meta(source, resolve=True) if source.is_file()
-                else find_audio_files(source, exts, minimal=minimal_meta, resolve=True))
-        return cls(meta, **kwargs)
-def main():
-    """Command-line entry point: scan a folder for audio files and save their metadata."""
-    logging.basicConfig(stream=sys.stderr, level=logging.INFO)
-    cli = argparse.ArgumentParser(prog='audio_dataset',
-                                  description='Generate .jsonl files by scanning a folder.')
-    cli.add_argument('root', help='Root folder with all the audio files')
-    cli.add_argument('output_meta_file', help='Output file to store the metadata, ')
-    # --complete flips `minimal` off, i.e. the full metadata is computed.
-    cli.add_argument('--complete', action='store_false', dest='minimal', default=True,
-                     help='Retrieve all metadata, even the one that are expansive '
-                          'to compute (e.g. normalization).')
-    cli.add_argument('--resolve', action='store_true', default=False,
-                     help='Resolve the paths to be absolute and with no symlinks.')
-    cli.add_argument('--workers', default=10, type=int, help='Number of workers.')
-    opts = cli.parse_args()
-    entries = find_audio_files(opts.root, DEFAULT_EXTS, progress=True,
-                               resolve=opts.resolve, minimal=opts.minimal, workers=opts.workers)
-    save_audio_meta(opts.output_meta_file, entries)
-# Run the manifest-generation CLI only when this file is executed as a script.
-if __name__ == '__main__':
-    main()

audiocraft/audiocraft/data/audio_utils.py DELETED Viewed

@@ -1,385 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""Various utilities for audio convertion (pcm format, sample rate and channels),
-and volume normalization."""
-import sys
-import typing as tp
-import julius
-import torch
-import torchaudio
-import numpy as np
-from .chords import Chords
-# Module-level Chords instance shared by the chord-to-chroma helpers in this file.
-chords = Chords() # created once, at import time
-def convert_audio_channels(wav: torch.Tensor, channels: int = 2) -> torch.Tensor:
-    """Convert audio to the given number of channels.
-    The channel axis is the second-to-last dimension; any leading dimensions are kept.
-    Args:
-        wav (torch.Tensor): Audio wave of shape [..., C, T].
-        channels (int): Expected number of channels as output.
-    Returns:
-        torch.Tensor: Audio wave of shape [..., channels, T].
-    Raises:
-        ValueError: If the input has more than one channel but fewer than requested.
-    """
-    *lead_dims, in_channels, n_samples = wav.shape
-    if in_channels == channels:
-        # Already in the requested layout.
-        return wav
-    if channels == 1:
-        # Mono requested from a multi-channel stream: average all channels.
-        return wav.mean(dim=-2, keepdim=True)
-    if in_channels == 1:
-        # Several channels requested from a mono stream: replicate it over all channels.
-        return wav.expand(*lead_dims, channels, n_samples)
-    if in_channels < channels:
-        # No obvious way to invent the missing channels of a non-mono stream.
-        raise ValueError('The audio file has less channels than requested but is not mono.')
-    # More channels than requested: keep the first ones.
-    return wav[..., :channels, :]
-def convert_audio(wav: torch.Tensor, from_rate: float,
-                  to_rate: float, to_channels: int) -> torch.Tensor:
-    """Resample audio to `to_rate`, then convert it to `to_channels` channels.
-    Both rates are truncated to integers before resampling.
-    """
-    resampled = julius.resample_frac(wav, int(from_rate), int(to_rate))
-    return convert_audio_channels(resampled, to_channels)
-def normalize_loudness(wav: torch.Tensor, sample_rate: int, loudness_headroom_db: float = 14,
-                       loudness_compressor: bool = False, energy_floor: float = 2e-3):
-    """Rescale a signal so its measured loudness reaches `-loudness_headroom_db`.
-    Loudness is measured with `torchaudio.transforms.Loudness`.
-    Args:
-        wav (torch.Tensor): Input multichannel audio data.
-        sample_rate (int): Sample rate.
-        loudness_headroom_db (float): The output loudness target is minus this value, in dB.
-        loudness_compressor (bool): Uses tanh for soft clipping.
-        energy_floor (float): anything below that RMS level will not be rescaled.
-    Returns:
-        torch.Tensor: Loudness normalized output data (the input itself when below the floor).
-    """
-    rms = wav.pow(2).mean().sqrt().item()
-    if rms < energy_floor:
-        # Too quiet to measure meaningfully: hand the input back untouched.
-        return wav
-    meter = torchaudio.transforms.Loudness(sample_rate)
-    measured_db = meter(wav).item()
-    # Gain (in dB) that moves the measured loudness onto the target level.
-    gain_db = -loudness_headroom_db - measured_db
-    scaled = (10.0 ** (gain_db / 20.0)) * wav
-    if loudness_compressor:
-        scaled = torch.tanh(scaled)
-    assert scaled.isfinite().all(), (measured_db, wav.pow(2).mean().sqrt())
-    return scaled
-def _clip_wav(wav: torch.Tensor, log_clipping: bool = False, stem_name: tp.Optional[str] = None) -> None:
-    """Clamp `wav` to [-1, 1] in place, optionally reporting on stderr how much was clipped."""
-    peak = wav.abs().max()
-    if log_clipping:
-        if peak > 1:
-            # Fraction of samples whose magnitude exceeds full scale.
-            clipped_fraction = (wav.abs() > 1).float().mean().item()
-            print(f"CLIPPING {stem_name or ''} happening with proba (a bit of clipping is okay):",
-                  clipped_fraction, "maximum scale: ", peak.item(), file=sys.stderr)
-    wav.clamp_(-1, 1)
-def normalize_audio(wav: torch.Tensor, normalize: bool = True,
-                    strategy: str = 'peak', peak_clip_headroom_db: float = 1,
-                    rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
-                    loudness_compressor: bool = False, log_clipping: bool = False,
-                    sample_rate: tp.Optional[int] = None,
-                    stem_name: tp.Optional[str] = None) -> torch.Tensor:
-    """Normalize the audio according to the prescribed strategy (see after).
-    Args:
-        wav (torch.Tensor): Audio data.
-        normalize (bool): if `True` (default), normalizes according to the prescribed
-            strategy (see after). If `False`, the strategy is only used in case clipping
-            would happen.
-        strategy (str): Can be either 'clip', 'peak', 'rms' or 'loudness'. Default is 'peak',
-            i.e. audio is normalized by its largest value. RMS normalizes by root-mean-square
-            with extra headroom to avoid clipping. 'clip' just clips. 'loudness' delegates to
-            `normalize_loudness`. An empty string or 'none' leaves the audio untouched and
-            only asserts that it stays below full scale.
-        peak_clip_headroom_db (float): Headroom in dB when doing 'peak' or 'clip' strategy.
-        rms_headroom_db (float): Headroom in dB when doing 'rms' strategy. This must be much larger
-            than the `peak_clip` one to avoid further clipping.
-        loudness_headroom_db (float): Target loudness for loudness normalization.
-        loudness_compressor (bool): If True, uses tanh based soft clipping.
-        log_clipping (bool): If True, basic logging on stderr when clipping still
-            occurs despite strategy (only for 'rms' and 'loudness').
-        sample_rate (int): Sample rate for the audio data (required for loudness).
-        stem_name (str, optional): Stem name for clipping logging.
-    Returns:
-        torch.Tensor: Normalized audio. A silent input is returned unscaled by the 'peak'
-            and 'rms' strategies.
-    """
-    scale_peak = 10 ** (-peak_clip_headroom_db / 20)
-    scale_rms = 10 ** (-rms_headroom_db / 20)
-    if strategy == 'peak':
-        peak = wav.abs().max()
-        # A silent signal has no peak to scale by; dividing by it would yield NaNs.
-        if peak > 0:
-            rescaling = scale_peak / peak
-            if normalize or rescaling < 1:
-                wav = wav * rescaling
-    elif strategy == 'clip':
-        wav = wav.clamp(-scale_peak, scale_peak)
-    elif strategy == 'rms':
-        mono = wav.mean(dim=0)
-        rms = mono.pow(2).mean().sqrt()
-        # Same guard as for 'peak': zero energy cannot be rescaled.
-        if rms > 0:
-            rescaling = scale_rms / rms
-            if normalize or rescaling < 1:
-                wav = wav * rescaling
-        _clip_wav(wav, log_clipping=log_clipping, stem_name=stem_name)
-    elif strategy == 'loudness':
-        assert sample_rate is not None, "Loudness normalization requires sample rate."
-        wav = normalize_loudness(wav, sample_rate, loudness_headroom_db, loudness_compressor)
-        _clip_wav(wav, log_clipping=log_clipping, stem_name=stem_name)
-    else:
-        # Validate the strategy name first so a typo is reported as such, not as a level issue.
-        assert strategy == '' or strategy == 'none', f"Unexpected strategy: '{strategy}'"
-        assert wav.abs().max() < 1
-    return wav
-def f32_pcm(wav: torch.Tensor) -> torch.Tensor:
-    """Convert audio to float 32 bits PCM format.
-    Floating point input is returned as is; int16 and int32 input is scaled by its full range.
-    Raises:
-        ValueError: For any other dtype.
-    """
-    if wav.dtype.is_floating_point:
-        return wav
-    # Divisor mapping each supported integer format onto [-1, 1).
-    full_scale = {torch.int16: 2**15, torch.int32: 2**31}.get(wav.dtype)
-    if full_scale is None:
-        raise ValueError(f"Unsupported wav dtype: {wav.dtype}")
-    return wav.float() / full_scale
-def i16_pcm(wav: torch.Tensor) -> torch.Tensor:
-    """Convert audio to int 16 bits PCM format.
-    ..Warning:: There exist many formulas for doing this conversion. None is perfect
-    due to the asymmetry of the int16 range. One either has possible clipping, DC offset,
-    or inconsistencies with f32_pcm. If the given wav doesn't have enough headroom,
-    it is possible that `i16_pcm(f32_pcm(wav)) != wav`.
-    """
-    if not wav.dtype.is_floating_point:
-        # Integer input must already be int16; it is passed through unchanged.
-        assert wav.dtype == torch.int16
-        return wav
-    assert wav.abs().max() <= 1
-    scaled = (wav * 2 ** 15).round()
-    if scaled.max() >= 2 ** 15:  # clipping would occur
-        # Fall back to the slightly smaller scale so +1.0 still fits in int16.
-        scaled = (wav * (2 ** 15 - 1)).round()
-    return scaled.short()
-def convert_txtchord2chroma_orig(text_chords, bpms, meters, gen_sec):
-    """Batch version: turn several textual chord progressions into frame-wise chroma tensors.
-    Each progression is a space-separated list of bars, each bar a comma-separated list of chord
-    labels. The progression is repeated until `total_len` frames are filled.
-    Args:
-        text_chords: Iterable of chord progression strings.
-        bpms: Iterable of tempi, zipped with `text_chords`.
-        meters: Iterable of meters, zipped with `text_chords`.
-        gen_sec: Length to generate; `total_len = int(gen_sec * 32000 / 640)` frames.
-    Returns:
-        torch.Tensor: Stacked chromas of shape [len(text_chords), total_len, 12].
-    """
-    chromas = []
-    # total_len = int(gen_sec * 44100 / 512)
-    # 32000 / 640 = 50 frames per unit of gen_sec (presumably sample rate / hop size - confirm).
-    total_len = int(gen_sec * 32000 / 640)
-    for chord, bpm, meter in zip(text_chords, bpms, meters):
-        # Frames covered by `meter * 4` beats at this tempo.
-        phr_len = int(60. / bpm * (meter * 4) * 32000 / 640)
-        # phr_len = int(60. / bpm * (meter * 4) * 44100 / 2048)
-        chroma = torch.zeros([total_len, 12])
-        count = 0
-        offset = 0
-        stext = chord.split(" ")
-        timebin = phr_len // 4 # frames per bar
-        # NOTE(review): if `add_step` is 0 for every chord (timebin smaller than the number of
-        # chords in each bar) `count` never advances and this loop does not terminate.
-        while count < total_len:
-            for tokens in stext:
-                if count >= total_len:
-                    break
-                stoken = tokens.split(',')
-                for token in stoken:
-                    # NOTE(review): timebin is an integer, so the rounding remainder carried in
-                    # `offset` is always zero here; the carry logic has no effect as written.
-                    off_timebin = timebin + offset
-                    rounded_timebin = round(off_timebin)
-                    offset = off_timebin - rounded_timebin
-                    offset = offset/len(stoken)
-                    # Chords sharing a bar split its frames evenly (integer division).
-                    add_step = rounded_timebin//len(stoken)
-                    # Presumably chords.chord returns (root, bass, intervals, ...); element 2 is
-                    # rotated by element 0 to place the intervals relative to the root.
-                    mhot = chords.chord(token)
-                    rolled = np.roll(mhot[2], mhot[0])
-                    # `i` always equals `count` here, so this fills consecutive frames until the
-                    # chord's share is used up or the output is full.
-                    for i in range(count, count + add_step):
-                        if count >= total_len:
-                            break
-                        chroma[i] = torch.Tensor(rolled)
-                        count += 1
-        chromas.append(chroma)
-    chroma = torch.stack(chromas)
-    return chroma
-def convert_txtchord2chroma(chord, bpm, meter, gen_sec):
-    """Turn a textual chord progression into a frame-wise chroma tensor.
-    The progression is a space-separated list of bars, each bar a comma-separated list of chord
-    labels; it is repeated until the output is full.
-    Args:
-        chord: Chord progression string.
-        bpm: Tempo used to size one bar in frames.
-        meter: Meter used to size one bar in frames.
-        gen_sec: Length to generate; the output has `int(gen_sec * 32000 / 640)` frames.
-    Returns:
-        torch.Tensor: Chroma of shape [frames, 12].
-    """
-    n_frames = int(gen_sec * 32000 / 640)
-    phrase_frames = int(60. / bpm * (meter * 4) * 32000 / 640)
-    bar_frames = phrase_frames // 4 # frames per bar
-    bars = chord.split(" ")
-    chroma = torch.zeros([n_frames, 12])
-    filled = 0
-    carry = 0
-    while filled < n_frames:
-        for bar in bars:
-            if filled >= n_frames:
-                break
-            symbols = bar.split(',')
-            for symbol in symbols:
-                exact = bar_frames + carry
-                whole = round(exact)
-                carry = (exact - whole) / len(symbols)
-                # Chords sharing a bar split its frames evenly (integer division).
-                span = whole // len(symbols)
-                parsed = chords.chord(symbol)
-                pitch_classes = np.roll(parsed[2], parsed[0])
-                # Write the chord's share of frames in one slice, clipped at the end of the output.
-                n_write = min(span, n_frames - filled)
-                if n_write > 0:
-                    chroma[filled:filled + n_write] = torch.Tensor(pitch_classes)
-                    filled += n_write
-    return chroma
-def convert_txtchord2chroma_24(chord, bpm, meter, gen_sec):
-    """Turn a textual chord progression into a frame-wise 24-dimensional chord feature.
-    Each frame is the concatenation of a 12-slot one-hot root vector and the 12-slot interval
-    vector returned by `chords.chord`. The progression (space-separated bars, comma-separated
-    chords within a bar) is repeated until `total_len` frames are filled.
-    Args:
-        chord: Chord progression string.
-        bpm: Tempo used to size one bar in frames.
-        meter: Meter used to size one bar in frames.
-        gen_sec: Length to generate; `total_len = int(gen_sec * 32000 / 640)` frames.
-    Returns:
-        torch.Tensor: Feature of shape [total_len, 24].
-    """
-    total_len = int(gen_sec * 32000 / 640)
-    # Frames covered by `meter * 4` beats at this tempo.
-    phr_len = int(60. / bpm * (meter * 4) * 32000 / 640)
-    # phr_len = int(60. / bpm * (meter * 4) * 44100 / 2048)
-    chroma = torch.zeros([total_len, 24])
-    count = 0
-    offset = 0
-    stext = chord.split(" ")
-    timebin = phr_len // 4 # frames per bar
-    # NOTE(review): if `add_step` is 0 for every chord, `count` never advances and this loop
-    # does not terminate.
-    while count < total_len:
-        for tokens in stext:
-            if count >= total_len:
-                break
-            stoken = tokens.split(',')
-            for token in stoken:
-                # NOTE(review): timebin is an integer, so `offset` is always zero here.
-                off_timebin = timebin + offset
-                rounded_timebin = round(off_timebin)
-                offset = off_timebin - rounded_timebin
-                offset = offset/len(stoken)
-                # Chords sharing a bar split its frames evenly (integer division).
-                add_step = rounded_timebin//len(stoken)
-                root, bass, ivs_vec, _ = chords.chord(token)
-                root_vec = torch.zeros(12)
-                # NOTE(review): a negative root (e.g. -1, as used by the no-chord tuples in
-                # btc_chords) would index from the end and mark slot 11 - confirm this is intended.
-                root_vec[root] = 1
-                final_vec = np.concatenate([root_vec, ivs_vec]) # [C]
-                # `i` always equals `count` here; consecutive frames are filled until the chord's
-                # share is used up or the output is full.
-                for i in range(count, count + add_step):
-                    if count >= total_len:
-                        break
-                    chroma[i] = torch.Tensor(final_vec)
-                    count += 1
-    return chroma
-def get_chroma_chord_from_lab(chord_path, gen_sec):
-    """Build a frame-wise chroma matrix from a chord annotation file.
-    Every line with exactly three whitespace-separated fields is read as `start end label`;
-    other lines are ignored. The chord's 12-slot vector is written to all frames between the
-    start and end positions.
-    Args:
-        chord_path: Path of the annotation file.
-        gen_sec: Length to cover; the output has `int(gen_sec * 32000 / 640)` frames.
-    Returns:
-        torch.Tensor: Chroma of shape [12, frames].
-    """
-    frame_rate = 32000/640
-    n_frames = int(gen_sec * 32000 / 640)
-    chord_grid = np.zeros((12, n_frames))
-    with open(chord_path, 'r') as lab_file:
-        for row in lab_file:
-            fields = row.split()
-            if len(fields) != 3:
-                continue
-            start_sec, end_sec = float(fields[0]), float(fields[1])
-            start_frame = int(start_sec*frame_rate)
-            end_frame = int(end_sec*frame_rate)
-            parsed = chords.chord(fields[2])
-            # Column vector [12, 1] so it broadcasts over the frame range.
-            column = np.roll(parsed[2], parsed[0])[..., None]
-            chord_grid[:, start_frame:end_frame] = column
-    return torch.from_numpy(chord_grid)
-def get_chroma_chord_from_text(text_chord, bpm, meter, gen_sec):
-    """Build a frame-wise chroma matrix from a textual chord progression.
-    The progression is a space-separated list of bars, each bar a comma-separated list of chord
-    labels; it is repeated until the output is full.
-    Args:
-        text_chord: Chord progression string.
-        bpm: Tempo used to size one bar in frames.
-        meter: Meter used to size one bar in frames.
-        gen_sec: Length to generate; the output has `int(gen_sec * 32000 / 640)` frames.
-    Returns:
-        torch.Tensor: Chroma of shape [12, frames].
-    Raises:
-        ValueError: If a full pass over the progression fills no frame (a bar is shorter than
-            the number of chords it holds), which would otherwise loop forever.
-    """
-    total_len = int(gen_sec * 32000 / 640)
-    phr_len = int(60. / bpm * (meter * 4) * 32000 / 640)
-    chroma = np.zeros([12, total_len])
-    count = 0
-    offset = 0
-    # The parameter is `text_chord`; the previous body read an undefined name `chord`.
-    stext = text_chord.split(" ")
-    timebin = phr_len // 4 # frames per bar
-    while count < total_len:
-        pass_start = count
-        for tokens in stext:
-            if count >= total_len:
-                break
-            stoken = tokens.split(',')
-            for token in stoken:
-                off_timebin = timebin + offset
-                rounded_timebin = round(off_timebin)
-                offset = off_timebin - rounded_timebin
-                offset = offset/len(stoken)
-                # Chords sharing a bar split its frames evenly (integer division).
-                add_step = rounded_timebin//len(stoken)
-                mhot = chords.chord(token)
-                final_vec = np.roll(mhot[2], mhot[0])
-                # Fill the chord's frames in one slice, clipped at the end of the output; the
-                # [12, 1] column broadcasts over the selected frames.
-                n_write = min(add_step, total_len - count)
-                if n_write > 0:
-                    chroma[:, count:count + n_write] = final_vec[..., None]
-                    count += n_write
-        if count == pass_start:
-            raise ValueError("Chord progression does not advance: bars are too short "
-                             "for the number of chords they contain.")
-    # Return the matrix that was filled above (previously an undefined `feat_chord`).
-    return torch.from_numpy(chroma)
-def get_beat_from_npy(beat_path, gen_sec):
-    """Build a frame-wise beat/downbeat activation from a saved beat array.
-    The file is expected to hold a 2-D array whose column 0 is the beat time and column 1 the
-    beat position within the bar (1 marks a bar start).
-    Args:
-        beat_path: Path of the `.npy` file.
-        gen_sec: Length to cover; beats at or after this time are ignored and the output has
-            `int(gen_sec * 32000 / 640)` frames.
-    Returns:
-        tuple: `(beat_events, bpm, meter)` where `beat_events` is a [1, frames] tensor (smoothed
-            beat activations plus bar-start impulses), `bpm` is `60 //` the mean gap between
-            consecutive beat times, and `meter` is the largest beat position found.
-    """
-    # Frame rate of the feature grid; `feat_hz` and `duration` were undefined names before.
-    feat_hz = 32000 / 640
-    total_len = int(gen_sec * 32000 / 640)
-    duration = gen_sec
-    # NOTE(security): allow_pickle=True lets np.load unpickle arbitrary objects; only load
-    # beat files from trusted sources.
-    beats_np = np.load(beat_path, allow_pickle=True)
-    feat_beats = np.zeros((2, total_len))
-    meter = int(max(beats_np.T[1]))
-    beat_time = beats_np[:, 0]
-    bar_time = beats_np[np.where(beats_np[:, 1] == 1)[0], 0]
-    # Keep only in-range beats; the extra `< total_len` check covers a non-integer frame count,
-    # where a time just below `duration` can map to one frame past the end.
-    beat_frame = [f for f in (int(t * feat_hz) for t in beat_time if 0 <= t < duration) if f < total_len]
-    bar_frame = [f for f in (int(t * feat_hz) for t in bar_time if 0 <= t < duration) if f < total_len]
-    feat_beats[0, beat_frame] = 1
-    feat_beats[1, bar_frame] = 1
-    kernel = np.array([0.05, 0.1, 0.3, 0.9, 0.3, 0.1, 0.05])
-    feat_beats[0] = np.convolve(feat_beats[0], kernel, 'same') # apply soft kernel
-    beat_events = feat_beats[0] + feat_beats[1]
-    beat_events = torch.tensor(beat_events).unsqueeze(0) # [T] -> [1, T]
-    bpm = 60 // np.mean([j-i for i, j in zip(beat_time[:-1], beat_time[1:])])
-    return beat_events, bpm, meter
-def get_beat_from_bpm(bpm, meter, gen_sec):
-    """Build a frame-wise beat/downbeat activation for a constant tempo.
-    Args:
-        bpm: Tempo in beats per minute.
-        meter: Number of beats per bar; every `meter`-th beat is marked as a bar start.
-        gen_sec: Length to cover; the output has `int(gen_sec * 32000 / 640)` frames.
-    Returns:
-        tuple: `(beat_events, beat_time, meter)` where `beat_events` is a [1, frames] tensor
-            (smoothed beat activations plus bar-start impulses) and `beat_time` holds the beat
-            times `0, 60/bpm, ...` below `gen_sec`.
-    """
-    # `feat_hz`, `duration` and `n_frames_feat` were undefined names before; they are derived
-    # here from the function's own arguments.
-    feat_hz = 32000 / 640
-    total_len = int(gen_sec * 32000 / 640)
-    feat_beats = np.zeros((2, total_len))
-    beat_time_gap = 60 / bpm
-    beat_gap = beat_time_gap * feat_hz
-    beat_time = np.arange(0, gen_sec, beat_time_gap)
-    beat_frame = np.round(np.arange(0, total_len, beat_gap)).astype(int)
-    # Rounding can push the last beat onto index `total_len`; drop anything out of range.
-    beat_frame = beat_frame[beat_frame < total_len]
-    bar_frame = beat_frame[::meter]
-    feat_beats[0, beat_frame] = 1
-    feat_beats[1, bar_frame] = 1
-    kernel = np.array([0.05, 0.1, 0.3, 0.9, 0.3, 0.1, 0.05])
-    feat_beats[0] = np.convolve(feat_beats[0], kernel, 'same') # apply soft kernel
-    beat_events = feat_beats[0] + feat_beats[1]
-    beat_events = torch.tensor(beat_events).unsqueeze(0) # [T] -> [1, T]
-    return beat_events, beat_time, meter

audiocraft/audiocraft/data/btc_chords.py DELETED Viewed

@@ -1,524 +0,0 @@
-# encoding: utf-8
-"""
-This module contains chord evaluation functionality.
-It provides the evaluation measures used for the MIREX ACE task, and
-tries to follow [1]_ and [2]_ as closely as possible.
-Notes
------
-This implementation tries to follow the references and their implementation
-(e.g., https://github.com/jpauwels/MusOOEvaluator for [2]_). However, there
-are some known (and possibly some unknown) differences. If you find one not
-listed in the following, please file an issue:
- - Detected chord segments are adjusted to fit the length of the annotations.
-   In particular, this means that, if necessary, filler segments of 'no chord'
-   are added at beginnings and ends. This can result in different segmentation
-   scores compared to the original implementation.
-References
-----------
-.. [1] Christopher Harte, "Towards Automatic Extraction of Harmony Information
-       from Music Signals." Dissertation,
-       Department for Electronic Engineering, Queen Mary University of London,
-       2010.
-.. [2] Johan Pauwels and Geoffroy Peeters.
-       "Evaluating Automatically Estimated Chord Sequences."
-       In Proceedings of ICASSP 2013, Vancouver, Canada, 2013.
-"""
-import numpy as np
-import pandas as pd
-# Structured dtype describing one chord: integer root and bass, a 12-slot integer interval
-# vector, and a boolean major flag.
-CHORD_DTYPE = [('root', np.int_),
-               ('bass', np.int_),
-               ('intervals', np.int_, (12,)),
-               ('is_major',np.bool_)]
-# Structured dtype for a chord annotation: start, end and the chord itself.
-CHORD_ANN_DTYPE = [('start', np.float32),
-                   ('end', np.float32),
-                   ('chord', CHORD_DTYPE)]
-# Placeholder for "no chord": root and bass of -1 with an all-zero interval vector.
-NO_CHORD = (-1, -1, np.zeros(12, dtype=np.int_), False)
-# Placeholder for an unknown chord: root and bass of -1 with an interval vector filled with -1.
-UNKNOWN_CHORD = (-1, -1, np.ones(12, dtype=np.int_) * -1, False)
-# Pitch-class names indexed by semitone above C (sharps only).
-PITCH_CLASS = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
-def idx_to_chord(idx):
-    """Map a chord class index to a short label.
-    Index 24 gives "-", index 25 gives the Greek letter epsilon. Any other index is read as
-    `2 * root + quality`: the root name from PITCH_CLASS followed by "M" (even index) or
-    "m" (odd index).
-    """
-    if idx == 24:
-        return "-"
-    if idx == 25:
-        return "\u03B5"
-    suffix = "M" if idx % 2 == 0 else "m"
-    return PITCH_CLASS[idx // 2] + suffix
-class Chords:
-    def __init__(self):
-        """Build the lookup table from chord shorthand names to 12-slot interval vectors."""
-        # (shorthand, interval list) pairs; '6', '4', 'add9', '7sus2' and '7sus4' are custom
-        # additions to the table.
-        shorthand_specs = (
-            ('maj', '(1,3,5)'),
-            ('min', '(1,b3,5)'),
-            ('dim', '(1,b3,b5)'),
-            ('aug', '(1,3,#5)'),
-            ('maj7', '(1,3,5,7)'),
-            ('min7', '(1,b3,5,b7)'),
-            ('7', '(1,3,5,b7)'),
-            ('6', '(1,6)'),
-            ('5', '(1,5)'),
-            ('4', '(1,4)'),
-            ('1', '(1)'),
-            ('dim7', '(1,b3,b5,bb7)'),
-            ('hdim7', '(1,b3,b5,b7)'),
-            ('minmaj7', '(1,b3,5,7)'),
-            ('maj6', '(1,3,5,6)'),
-            ('min6', '(1,b3,5,6)'),
-            ('9', '(1,3,5,b7,9)'),
-            ('maj9', '(1,3,5,7,9)'),
-            ('min9', '(1,b3,5,b7,9)'),
-            ('add9', '(1,3,5,9)'),
-            ('sus2', '(1,2,5)'),
-            ('sus4', '(1,4,5)'),
-            ('7sus2', '(1,2,5,b7)'),
-            ('7sus4', '(1,4,5,b7)'),
-            ('11', '(1,3,5,b7,9,11)'),
-            ('min11', '(1,b3,5,b7,9,11)'),
-            ('13', '(1,3,5,b7,13)'),
-            ('maj13', '(1,3,5,7,13)'),
-            ('min13', '(1,b3,5,b7,13)'),
-        )
-        self._shorthands = {name: self.interval_list(spec) for name, spec in shorthand_specs}
-    def chords(self, labels):
-        """
-        Convert a list of chord labels into a structured array of numeric chords.
-        Each distinct label is parsed only once.
-        Parameters
-        ----------
-        labels : list
-            List of chord labels (str).
-        Returns
-        -------
-        chords : numpy.array
-            Structured array of dtype CHORD_DTYPE, one entry per label.
-        """
-        parsed = np.zeros(len(labels), dtype=CHORD_DTYPE)
-        seen = {}
-        for pos, label in enumerate(labels):
-            if label not in seen:
-                seen[label] = self.chord(label)
-            parsed[pos] = seen[label]
-        return parsed
-    def label_error_modify(self, label):
-        """Repair chord labels that lack the ':' between root and quality.
-        Four known malformed labels are mapped explicitly; otherwise a ':' is inserted in front
-        of the first 'min' when the label contains no ':' at all. Other labels are returned
-        unchanged.
-        """
-        known_fixes = {
-            'Emin/4': 'E:min/4',
-            'A7/3': 'A:7/3',
-            'Bb7/3': 'Bb:7/3',
-            'Bb7/5': 'Bb:7/5',
-        }
-        if label in known_fixes:
-            return known_fixes[label]
-        if ':' not in label and 'min' in label:
-            head, quality, tail = label.partition('min')
-            return head + ':' + quality + tail
-        return label
-    def chord(self, label):
-        """
-        Parse a chord label of the form `root[:quality][/bass]` into numbers.
-        'N' and 'X' map to the NO_CHORD and UNKNOWN_CHORD placeholders. A missing quality
-        defaults to 'maj' and a missing bass to 0. The bass slot is always set in the
-        interval vector.
-        Parameters
-        ----------
-        label : str
-            Chord label.
-        Returns
-        -------
-        chord : tuple
-            (root, bass, intervals array, is_major), where is_major is False exactly when
-            the quality string contains 'min'.
-        """
-        if label == 'N':
-            return NO_CHORD
-        if label == 'X':
-            return UNKNOWN_CHORD
-        label = self.label_error_modify(label)
-        colon = label.find(':')
-        slash = label.find('/')
-        # Everything after the first '/' names the bass; root/quality end where the bass starts.
-        bass_str = label[slash + 1:] if slash != -1 else ''
-        body_end = slash if slash != -1 else len(label)
-        if colon == -1:
-            root_str = label[:body_end]
-            quality_str = 'maj'
-        else:
-            root_str = label[:colon]
-            quality_str = label[colon + 1:body_end]
-        root = self.pitch(root_str)
-        bass = self.interval(bass_str) if bass_str else 0
-        ivs = self.chord_intervals(quality_str)
-        ivs[bass] = 1
-        is_major = 'min' not in quality_str
-        return root, bass, ivs, is_major
-    # Per-degree extra semitone steps; their running sum added to the degree index yields the
-    # semitone offset of each natural scale degree.
-    _l = [0, 1, 1, 0, 1, 1, 1]
-    # Semitone offsets for 14 consecutive scale degrees (two octaves):
-    # [0, 2, 4, 5, 7, 9, 11, 12, 14, 16, 17, 19, 21, 23].
-    _chroma_id = (np.arange(len(_l) * 2) + 1) + np.array(_l + _l).cumsum() - 1
-    def modify(self, base_pitch, modifier):
-        """
-        Shift a pitch class by a string of accidentals.
-        Every 'b' lowers the pitch by one semitone and every '#' raises it by one.
-        Parameters
-        ----------
-        base_pitch : int
-            Pitch class as integer.
-        modifier : str
-            String of modifiers ('b' or '#').
-        Returns
-        -------
-        modified_pitch : int
-            Shifted pitch (not wrapped to a 0-11 range).
-        Raises
-        ------
-        ValueError
-            If the modifier contains any other character.
-        """
-        semitone_shift = {'b': -1, '#': 1}
-        for symbol in modifier:
-            if symbol not in semitone_shift:
-                raise ValueError('Unknown modifier: {}'.format(symbol))
-            base_pitch += semitone_shift[symbol]
-        return base_pitch
-    def pitch(self, pitch_str):
-        """
-        Convert a pitch name (note letter followed by accidentals) to a pitch class.
-        Parameters
-        ----------
-        pitch_str : str
-            String representation of a pitch class, e.g. 'C', 'F#' or 'Bb'.
-        Returns
-        -------
-        pitch : int
-            Pitch class in the range 0-11.
-        """
-        letter, accidentals = pitch_str[0], pitch_str[1:]
-        # Position of the letter in the C-based scale, then its semitone offset.
-        natural = self._chroma_id[(ord(letter) - ord('C')) % 7]
-        return self.modify(natural, accidentals) % 12
-    def interval(self, interval_str):
-        """
-        Convert a string representation of a musical interval into a pitch class
-        (e.g. a minor seventh 'b7' into 10, because it is 10 semitones above its
-        base note).
-        Parameters
-        ----------
-        interval_str : str
-            Musical interval: optional accidentals ('b' / '#') followed by a
-            scale-degree number, e.g. '5', 'b3', '#11'.
-        Returns
-        -------
-        pitch_class : int
-            Number of semitones to base note of interval, in the range 0-11.
-        Raises
-        ------
-        ValueError
-            If `interval_str` contains no degree number, if the text from the
-            first digit onwards is not an integer, or if the prefix contains
-            characters other than 'b' and '#'.
-        """
-        for i, c in enumerate(interval_str):
-            if c.isdigit():
-                # everything from the first digit on is the scale degree,
-                # everything before it is the accidental prefix
-                degree = int(interval_str[i:])
-                return self.modify(self._chroma_id[degree - 1],
-                                   interval_str[:i]) % 12
-        # Falling through used to return None implicitly; callers use the result
-        # as a NumPy index, where None selects (and overwrites) the whole array.
-        raise ValueError(
-            'Interval has no degree number: {!r}'.format(interval_str))
-    def interval_list(self, intervals_str, given_pitch_classes=None):
-        """
-        Convert a parenthesised list of intervals given as string to a binary
-        pitch class representation. For example, '(b3, 5)' would become
-        [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0].
-        Parameters
-        ----------
-        intervals_str : str
-            Comma-separated intervals enclosed in one leading and one trailing
-            character (normally parentheses), e.g. '(b3, 5)'. An entry
-            prefixed with '*' clears that interval instead of setting it.
-        given_pitch_classes : None or numpy array
-            If None, start with empty pitch class array, if numpy array of length
-            12, this array will be modified in place.
-        Returns
-        -------
-        pitch_classes : numpy array
-            Binary pitch class representation of intervals.
-        """
-        if given_pitch_classes is None:
-            given_pitch_classes = np.zeros(12, dtype=np.int_)
-        # the first and last characters are the enclosing parentheses
-        for int_def in intervals_str[1:-1].split(','):
-            int_def = int_def.strip()
-            if not int_def:
-                # an empty list '()' or a stray comma yields an empty entry;
-                # skip it instead of failing on int_def[0]
-                continue
-            if int_def[0] == '*':
-                given_pitch_classes[self.interval(int_def[1:])] = 0
-            else:
-                given_pitch_classes[self.interval(int_def)] = 1
-        return given_pitch_classes
-    # mapping of shorthand interval notations to the actual interval representation
-    def chord_intervals(self, quality_str):
-        """
-        Convert a chord quality string to a pitch class representation. For
-        example, 'maj' becomes [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0].
-        A quality is either a bare shorthand ('maj'), a shorthand followed by
-        an interval list ('maj(*3,b7)'), or an interval list alone ('(1,5)').
-        Parameters
-        ----------
-        quality_str : str
-            String defining the chord quality.
-        Returns
-        -------
-        pitch_classes : numpy array
-            Binary pitch class representation of chord quality (a fresh array;
-            the stored shorthand arrays are copied before being modified).
-        """
-        shorthand, paren, _ = quality_str.partition('(')
-        if not paren:
-            # no interval list at all: the whole string is a shorthand
-            return self._shorthands[quality_str].copy()
-        if shorthand:
-            base = self._shorthands[shorthand].copy()
-        else:
-            base = np.zeros(12, dtype=np.int_)
-        return self.interval_list(quality_str[len(shorthand):], base)
-    def load_chords(self, filename):
-        """
-        Load chord segments from a text file.
-        Only lines that split into exactly three whitespace-separated fields
-        (start, end, chord label) are used; all other lines are ignored.
-        The chord labels must follow the syntax defined in [1]_.
-        Parameters
-        ----------
-        filename : str
-            File containing chord segments.
-        Returns
-        -------
-        crds : numpy structured array
-            Structured array with columns "start", "end", and "chord",
-            containing the beginning, end, and chord definition of chord
-            segments.
-        References
-        ----------
-        .. [1] Christopher Harte, "Towards Automatic Extraction of Harmony
-               Information from Music Signals." Dissertation,
-               Department for Electronic Engineering, Queen Mary University of
-               London, 2010.
-        """
-        start, end, chord_labels = [], [], []
-        with open(filename, 'r') as lab_file:
-            for raw_line in lab_file:
-                fields = raw_line.split()
-                if len(fields) != 3:
-                    continue
-                onset, offset, label = fields
-                start.append(float(onset))
-                end.append(float(offset))
-                chord_labels.append(label)
-        crds = np.zeros(len(start), dtype=CHORD_ANN_DTYPE)
-        crds['start'] = start
-        crds['end'] = end
-        crds['chord'] = self.chords(chord_labels)
-        return crds
-    def reduce_to_triads(self, chords, keep_bass=False):
-        """
-        Reduce chords to triads.
-        The function follows the reduction rules implemented in [1]_. If a
-        chord does not contain a third, major second or fourth, it is reduced to
-        a power chord. If it contains neither a third nor a fifth, it is
-        reduced to a single note "chord".
-        Parameters
-        ----------
-        chords : numpy structured array
-            Chords to be reduced.
-        keep_bass : bool
-            Indicates whether to keep the bass note or set it to 0.
-        Returns
-        -------
-        reduced_chords : numpy structured array
-            Chords reduced to triads (a copy; the input is not modified).
-        References
-        ----------
-        .. [1] Johan Pauwels and Geoffroy Peeters.
-               "Evaluating Automatically Estimated Chord Sequences."
-               In Proceedings of ICASSP 2013, Vancouver, Canada, 2013.
-        """
-        # one boolean mask per interval of interest (column = semitones above root)
-        unison = chords['intervals'][:, 0].astype(bool)
-        maj_sec = chords['intervals'][:, 2].astype(bool)
-        min_third = chords['intervals'][:, 3].astype(bool)
-        maj_third = chords['intervals'][:, 4].astype(bool)
-        perf_fourth = chords['intervals'][:, 5].astype(bool)
-        dim_fifth = chords['intervals'][:, 6].astype(bool)
-        perf_fifth = chords['intervals'][:, 7].astype(bool)
-        aug_fifth = chords['intervals'][:, 8].astype(bool)
-        # NOTE(review): NO_CHORD is defined outside this view; this takes its
-        # last element - confirm that element is the interval vector.
-        no_chord = (chords['intervals'] == NO_CHORD[-1]).all(axis=1)
-        reduced_chords = chords.copy()
-        ivs = reduced_chords['intervals']
-        # The assignments below are ordered from least to most specific:
-        # a later line overwrites rows already set by an earlier one.
-        ivs[~no_chord] = self.interval_list('(1)')
-        ivs[unison & perf_fifth] = self.interval_list('(1,5)')
-        ivs[~perf_fourth & maj_sec] = self._shorthands['sus2']
-        ivs[perf_fourth & ~maj_sec] = self._shorthands['sus4']
-        ivs[min_third] = self._shorthands['min']
-        ivs[min_third & aug_fifth & ~perf_fifth] = self.interval_list('(1,b3,#5)')
-        ivs[min_third & dim_fifth & ~perf_fifth] = self._shorthands['dim']
-        ivs[maj_third] = self._shorthands['maj']
-        ivs[maj_third & dim_fifth & ~perf_fifth] = self.interval_list('(1,3,b5)')
-        ivs[maj_third & aug_fifth & ~perf_fifth] = self._shorthands['aug']
-        if not keep_bass:
-            reduced_chords['bass'] = 0
-        else:
-            # remove bass notes if they are not part of the intervals anymore
-            # (multiplying by the 0/1 interval entry at the bass position)
-            reduced_chords['bass'] *= ivs[range(len(reduced_chords)),
-                                          reduced_chords['bass']]
-        # keep -1 in bass for no chords
-        reduced_chords['bass'][no_chord] = -1
-        return reduced_chords
-    def convert_to_id(self, root, is_major):
-        """
-        Map a (root, is_major) pair to a chord id.
-        A root of -1 maps to 24; otherwise the id is ``root * 2`` for a major
-        chord and ``root * 2 + 1`` for any other chord.
-        """
-        if root == -1:
-            return 24
-        chord_id = root * 2
-        if not is_major:
-            chord_id = chord_id + 1
-        return chord_id
-    def get_converted_chord(self, filename):
-        """
-        Load a chord file, reduce its chords to triads and return a DataFrame
-        holding the columns produced by `assign_chord_id` plus the 'start' and
-        'end' values of each segment.
-        """
-        annotations = self.load_chords(filename)
-        reduced = self.reduce_to_triads(annotations['chord'])
-        frame = self.assign_chord_id(reduced)
-        frame['start'] = annotations['start']
-        frame['end'] = annotations['end']
-        return frame
-    def assign_chord_id(self, entry):
-        """
-        Build a DataFrame with the 'root' and 'is_major' fields of `entry` and
-        add a 'chord_id' column computed row by row with `convert_to_id`.
-        """
-        # maj, min chord only
-        # if you want to add other chord, change this part and get_converted_chord(reduce_to_triads)
-        def _row_to_id(row):
-            return self.convert_to_id(row['root'], row['is_major'])
-        frame = pd.DataFrame(data=entry[['root', 'is_major']])
-        frame['chord_id'] = frame.apply(_row_to_id, axis=1)
-        return frame
-    def convert_to_id_voca(self, root, quality):
-        """
-        Map a (root, quality) pair to a chord id in the large vocabulary.
-        A root of -1 maps to 169. A known quality maps to ``root * 14`` plus
-        the position of the quality in the table below; any other quality
-        maps to 168.
-        """
-        if root == -1:
-            return 169
-        # position in this tuple is the per-root offset of the quality
-        known_qualities = (
-            'min', 'maj', 'dim', 'aug', 'min6', 'maj6', 'min7',
-            'minmaj7', 'maj7', '7', 'dim7', 'hdim7', 'sus2', 'sus4',
-        )
-        for offset, name in enumerate(known_qualities):
-            if quality == name:
-                return root * 14 + offset
-        return 168
-    def lab_file_error_modify(self, ref_labels):
-        """
-        Repair known malformed chord labels in place and return the same list.
-        Labels ending in ':4', ':6' or ':6/2' are rewritten to the 'sus4' /
-        'maj6' shorthands, a few specific labels missing the ':' separator are
-        replaced, and any other label without ':' that contains 'min' gets a
-        ':' inserted before 'min'.
-        """
-        missing_colon = {
-            'Emin/4': 'E:min/4',
-            'A7/3': 'A:7/3',
-            'Bb7/3': 'Bb:7/3',
-            'Bb7/5': 'Bb:7/5',
-        }
-        for idx, label in enumerate(ref_labels):
-            if label.endswith(':4'):
-                fixed = label.replace(':4', ':sus4')
-            elif label.endswith(':6'):
-                fixed = label.replace(':6', ':maj6')
-            elif label.endswith(':6/2'):
-                fixed = label.replace(':6/2', ':maj6/2')
-            elif label in missing_colon:
-                fixed = missing_colon[label]
-            elif label.find(':') == -1 and label.find('min') != -1:
-                min_pos = label.find('min')
-                fixed = label[:min_pos] + ':' + label[min_pos:]
-            else:
-                continue
-            ref_labels[idx] = fixed
-        return ref_labels

audiocraft/audiocraft/data/chords.py DELETED Viewed

@@ -1,524 +0,0 @@
-# encoding: utf-8
-"""
-This module contains chord evaluation functionality.
-It provides the evaluation measures used for the MIREX ACE task, and
-tries to follow [1]_ and [2]_ as closely as possible.
-Notes
------
-This implementation tries to follow the references and their implementation
-(e.g., https://github.com/jpauwels/MusOOEvaluator for [2]_). However, there
-are some known (and possibly some unknown) differences. If you find one not
-listed in the following, please file an issue:
- - Detected chord segments are adjusted to fit the length of the annotations.
-   In particular, this means that, if necessary, filler segments of 'no chord'
-   are added at beginnings and ends. This can result in different segmentation
-   scores compared to the original implementation.
-References
-----------
-.. [1] Christopher Harte, "Towards Automatic Extraction of Harmony Information
-       from Music Signals." Dissertation,
-       Department for Electronic Engineering, Queen Mary University of London,
-       2010.
-.. [2] Johan Pauwels and Geoffroy Peeters.
-       "Evaluating Automatically Estimated Chord Sequences."
-       In Proceedings of ICASSP 2013, Vancouver, Canada, 2013.
-"""
-import numpy as np
-import pandas as pd
-# Structured dtype of one chord: root pitch class, bass interval, a 12-bin
-# interval vector, and a flag telling major from non-major chords.
-CHORD_DTYPE = [('root', np.int_),
-               ('bass', np.int_),
-               ('intervals', np.int_, (12,)),
-               ('is_major',np.bool_)]
-# Structured dtype of one annotated segment: start, end, and the chord itself.
-CHORD_ANN_DTYPE = [('start', np.float32),
-                   ('end', np.float32),
-                   ('chord', CHORD_DTYPE)]
-# Sentinel for "no chord": root and bass are -1, every interval bin is 0.
-NO_CHORD = (-1, -1, np.zeros(12, dtype=np.int_), False)
-# Sentinel for an unknown chord: root and bass are -1, every interval bin is -1.
-UNKNOWN_CHORD = (-1, -1, np.ones(12, dtype=np.int_) * -1, False)
-# Pitch-class names indexed by semitones above C (sharp spellings only).
-PITCH_CLASS = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
-def idx_to_chord(idx):
-    """
-    Turn a chord id into a short display string.
-    Id 24 gives "-" and id 25 gives the Greek letter epsilon. Any other id is
-    split into a root (``idx // 2``) looked up in PITCH_CLASS and a suffix:
-    "M" for even ids, "m" for odd ids.
-    """
-    if idx == 24:
-        return "-"
-    if idx == 25:
-        return u"\u03B5"
-    root_name = PITCH_CLASS[idx // 2]
-    suffix = "M" if idx % 2 == 0 else "m"
-    return root_name + suffix
-class Chords:
-    def __init__(self):
-        """
-        Build the table mapping chord-quality shorthands to their 12-bin
-        interval vectors by parsing each shorthand's interval list.
-        """
-        shorthand_intervals = {
-            'maj': '(1,3,5)',
-            'min': '(1,b3,5)',
-            'dim': '(1,b3,b5)',
-            'aug': '(1,3,#5)',
-            'maj7': '(1,3,5,7)',
-            'min7': '(1,b3,5,b7)',
-            '7': '(1,3,5,b7)',
-            '6': '(1,6)',  # custom
-            '5': '(1,5)',
-            '4': '(1,4)',  # custom
-            '1': '(1)',
-            'dim7': '(1,b3,b5,bb7)',
-            'hdim7': '(1,b3,b5,b7)',
-            'minmaj7': '(1,b3,5,7)',
-            'maj6': '(1,3,5,6)',
-            'min6': '(1,b3,5,6)',
-            '9': '(1,3,5,b7,9)',
-            'maj9': '(1,3,5,7,9)',
-            'min9': '(1,b3,5,b7,9)',
-            'add9': '(1,3,5,9)',  # custom
-            'sus2': '(1,2,5)',
-            'sus4': '(1,4,5)',
-            '7sus2': '(1,2,5,b7)',  # custom
-            '7sus4': '(1,4,5,b7)',  # custom
-            '11': '(1,3,5,b7,9,11)',
-            'min11': '(1,b3,5,b7,9,11)',
-            '13': '(1,3,5,b7,13)',
-            'maj13': '(1,3,5,7,13)',
-            'min13': '(1,b3,5,b7,13)',
-        }
-        self._shorthands = {
-            name: self.interval_list(spec)
-            for name, spec in shorthand_intervals.items()
-        }
-    def chords(self, labels):
-        """
-        Transform a list of chord labels into an array of internal numeric
-        representations. Each distinct label is parsed only once.
-        Parameters
-        ----------
-        labels : list
-            List of chord labels (str).
-        Returns
-        -------
-        chords : numpy.array
-            Structured array with columns 'root', 'bass', 'intervals' and
-            'is_major', containing a numeric representation of chords.
-        """
-        encoded = np.zeros(len(labels), dtype=CHORD_DTYPE)
-        parsed = {}
-        for position, label in enumerate(labels):
-            if label not in parsed:
-                parsed[label] = self.chord(label)
-            encoded[position] = parsed[label]
-        return encoded
-    def label_error_modify(self, label):
-        """
-        Repair a chord label that lacks the ':' between root and quality.
-        A few specific labels are replaced outright; any other label without
-        ':' that contains 'min' gets a ':' inserted before 'min'. All other
-        labels are returned unchanged.
-        """
-        known_fixes = {
-            'Emin/4': 'E:min/4',
-            'A7/3': 'A:7/3',
-            'Bb7/3': 'Bb:7/3',
-            'Bb7/5': 'Bb:7/5',
-        }
-        if label in known_fixes:
-            return known_fixes[label]
-        min_pos = label.find('min')
-        if label.find(':') == -1 and min_pos != -1:
-            return label[:min_pos] + ':' + label[min_pos:]
-        return label
-    def chord(self, label):
-        """
-        Transform a chord label into the internal numeric representation of
-        (root, bass, intervals array, is_major).
-        'N' and 'X' return the NO_CHORD and UNKNOWN_CHORD sentinels. Otherwise
-        the label is split as ``root[:quality][/bass]``; a label without ':'
-        is treated as a 'maj' chord.
-        Parameters
-        ----------
-        label : str
-            Chord label.
-        Returns
-        -------
-        chord : tuple
-            Numeric representation of the chord:
-            (root, bass, intervals array, is_major). `is_major` is False
-            exactly when the quality string contains 'min'.
-        """
-        if label == 'N':
-            return NO_CHORD
-        if label == 'X':
-            return UNKNOWN_CHORD
-        label = self.label_error_modify(label)
-        colon = label.find(':')
-        slash = label.find('/')
-        has_bass = slash != -1
-        bass_str = label[slash + 1:] if has_bass else ''
-        if colon == -1:
-            quality_str = 'maj'
-            root_str = label[:slash] if has_bass else label
-        else:
-            root_str = label[:colon]
-            quality_str = label[colon + 1:slash] if has_bass else label[colon + 1:]
-        root = self.pitch(root_str)
-        bass = self.interval(bass_str) if bass_str else 0
-        ivs = self.chord_intervals(quality_str)
-        # the bass note always counts as part of the chord
-        ivs[bass] = 1
-        is_major = 'min' not in quality_str
-        return root, bass, ivs, is_major
-    # Extra semitone added by the step onto each major-scale degree: 1 for a
-    # whole step, 0 for a half step; the leading 0 stands for the root itself.
-    _l = [0, 1, 1, 0, 1, 1, 1]
-    # Semitone offset of every scale degree across two octaves:
-    # [0, 2, 4, 5, 7, 9, 11, 12, 14, 16, 17, 19, 21, 23].
-    _chroma_id = np.arange(2 * len(_l)) + np.cumsum(_l * 2)
-    def modify(self, base_pitch, modifier):
-        """
-        Shift an integer pitch by a string of accidentals.
-        Each 'b' lowers the pitch by one semitone and each '#' raises it by
-        one semitone. The result is not wrapped into the 0-11 range here.
-        Parameters
-        ----------
-        base_pitch : int
-            Pitch class as integer.
-        modifier : str
-            Sequence of accidentals ('b' or '#'); may be empty.
-        Returns
-        -------
-        modified_pitch : int
-            The shifted pitch.
-        Raises
-        ------
-        ValueError
-            If `modifier` contains a character other than 'b' or '#'.
-        """
-        semitone_shift = {'b': -1, '#': 1}
-        for symbol in modifier:
-            if symbol not in semitone_shift:
-                raise ValueError('Unknown modifier: {}'.format(symbol))
-            base_pitch += semitone_shift[symbol]
-        return base_pitch
-    def pitch(self, pitch_str):
-        """
-        Convert a pitch name (note letter plus accidentals) to a pitch class.
-        The first character selects the natural note relative to 'C'; the
-        remaining characters are applied as accidentals via `modify`.
-        Parameters
-        ----------
-        pitch_str : str
-            Pitch name such as 'C', 'F#' or 'Bb'.
-        Returns
-        -------
-        pitch : int
-            Pitch class in the range 0-11 (C = 0).
-        """
-        note_letter, accidentals = pitch_str[0], pitch_str[1:]
-        scale_degree = (ord(note_letter) - ord('C')) % 7
-        natural_pitch = self._chroma_id[scale_degree]
-        return self.modify(natural_pitch, accidentals) % 12
-    def interval(self, interval_str):
-        """
-        Convert a string representation of a musical interval into a pitch class
-        (e.g. a minor seventh 'b7' into 10, because it is 10 semitones above its
-        base note).
-        Parameters
-        ----------
-        interval_str : str
-            Musical interval: optional accidentals ('b' / '#') followed by a
-            scale-degree number, e.g. '5', 'b3', '#11'.
-        Returns
-        -------
-        pitch_class : int
-            Number of semitones to base note of interval, in the range 0-11.
-        Raises
-        ------
-        ValueError
-            If `interval_str` contains no degree number, if the text from the
-            first digit onwards is not an integer, or if the prefix contains
-            characters other than 'b' and '#'.
-        """
-        for i, c in enumerate(interval_str):
-            if c.isdigit():
-                # everything from the first digit on is the scale degree,
-                # everything before it is the accidental prefix
-                degree = int(interval_str[i:])
-                return self.modify(self._chroma_id[degree - 1],
-                                   interval_str[:i]) % 12
-        # Falling through used to return None implicitly; callers use the result
-        # as a NumPy index, where None selects (and overwrites) the whole array.
-        raise ValueError(
-            'Interval has no degree number: {!r}'.format(interval_str))
-    def interval_list(self, intervals_str, given_pitch_classes=None):
-        """
-        Convert a parenthesised list of intervals given as string to a binary
-        pitch class representation. For example, '(b3, 5)' would become
-        [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0].
-        Parameters
-        ----------
-        intervals_str : str
-            Comma-separated intervals enclosed in one leading and one trailing
-            character (normally parentheses), e.g. '(b3, 5)'. An entry
-            prefixed with '*' clears that interval instead of setting it.
-        given_pitch_classes : None or numpy array
-            If None, start with empty pitch class array, if numpy array of length
-            12, this array will be modified in place.
-        Returns
-        -------
-        pitch_classes : numpy array
-            Binary pitch class representation of intervals.
-        """
-        if given_pitch_classes is None:
-            given_pitch_classes = np.zeros(12, dtype=np.int_)
-        # the first and last characters are the enclosing parentheses
-        for int_def in intervals_str[1:-1].split(','):
-            int_def = int_def.strip()
-            if not int_def:
-                # an empty list '()' or a stray comma yields an empty entry;
-                # skip it instead of failing on int_def[0]
-                continue
-            if int_def[0] == '*':
-                given_pitch_classes[self.interval(int_def[1:])] = 0
-            else:
-                given_pitch_classes[self.interval(int_def)] = 1
-        return given_pitch_classes
-    # mapping of shorthand interval notations to the actual interval representation
-    def chord_intervals(self, quality_str):
-        """
-        Convert a chord quality string to a pitch class representation. For
-        example, 'maj' becomes [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0].
-        A quality is either a bare shorthand ('maj'), a shorthand followed by
-        an interval list ('maj(*3,b7)'), or an interval list alone ('(1,5)').
-        Parameters
-        ----------
-        quality_str : str
-            String defining the chord quality.
-        Returns
-        -------
-        pitch_classes : numpy array
-            Binary pitch class representation of chord quality (a fresh array;
-            the stored shorthand arrays are copied before being modified).
-        """
-        shorthand, paren, _ = quality_str.partition('(')
-        if not paren:
-            # no interval list at all: the whole string is a shorthand
-            return self._shorthands[quality_str].copy()
-        if shorthand:
-            base = self._shorthands[shorthand].copy()
-        else:
-            base = np.zeros(12, dtype=np.int_)
-        return self.interval_list(quality_str[len(shorthand):], base)
-    def load_chords(self, filename):
-        """
-        Load chord segments from a text file.
-        Only lines that split into exactly three whitespace-separated fields
-        (start, end, chord label) are used; all other lines are ignored.
-        The chord labels must follow the syntax defined in [1]_.
-        Parameters
-        ----------
-        filename : str
-            File containing chord segments.
-        Returns
-        -------
-        crds : numpy structured array
-            Structured array with columns "start", "end", and "chord",
-            containing the beginning, end, and chord definition of chord
-            segments.
-        References
-        ----------
-        .. [1] Christopher Harte, "Towards Automatic Extraction of Harmony
-               Information from Music Signals." Dissertation,
-               Department for Electronic Engineering, Queen Mary University of
-               London, 2010.
-        """
-        start, end, chord_labels = [], [], []
-        with open(filename, 'r') as lab_file:
-            for raw_line in lab_file:
-                fields = raw_line.split()
-                if len(fields) != 3:
-                    continue
-                onset, offset, label = fields
-                start.append(float(onset))
-                end.append(float(offset))
-                chord_labels.append(label)
-        crds = np.zeros(len(start), dtype=CHORD_ANN_DTYPE)
-        crds['start'] = start
-        crds['end'] = end
-        crds['chord'] = self.chords(chord_labels)
-        return crds
-    def reduce_to_triads(self, chords, keep_bass=False):
-        """
-        Reduce chords to triads.
-        The function follows the reduction rules implemented in [1]_. If a
-        chord does not contain a third, major second or fourth, it is reduced to
-        a power chord. If it contains neither a third nor a fifth, it is
-        reduced to a single note "chord".
-        Parameters
-        ----------
-        chords : numpy structured array
-            Chords to be reduced.
-        keep_bass : bool
-            Indicates whether to keep the bass note or set it to 0.
-        Returns
-        -------
-        reduced_chords : numpy structured array
-            Chords reduced to triads (a copy; the input is not modified).
-        References
-        ----------
-        .. [1] Johan Pauwels and Geoffroy Peeters.
-               "Evaluating Automatically Estimated Chord Sequences."
-               In Proceedings of ICASSP 2013, Vancouver, Canada, 2013.
-        """
-        # one boolean mask per interval of interest (column = semitones above root)
-        unison = chords['intervals'][:, 0].astype(bool)
-        maj_sec = chords['intervals'][:, 2].astype(bool)
-        min_third = chords['intervals'][:, 3].astype(bool)
-        maj_third = chords['intervals'][:, 4].astype(bool)
-        perf_fourth = chords['intervals'][:, 5].astype(bool)
-        dim_fifth = chords['intervals'][:, 6].astype(bool)
-        perf_fifth = chords['intervals'][:, 7].astype(bool)
-        aug_fifth = chords['intervals'][:, 8].astype(bool)
-        # NO_CHORD is (root, bass, intervals, is_major): compare against the
-        # interval vector at index 2. Index -1 is the is_major flag (False) and
-        # only gave the same result because `== False` matches `== 0`.
-        no_chord = (chords['intervals'] == NO_CHORD[2]).all(axis=1)
-        reduced_chords = chords.copy()
-        ivs = reduced_chords['intervals']
-        # The assignments below are ordered from least to most specific:
-        # a later line overwrites rows already set by an earlier one.
-        ivs[~no_chord] = self.interval_list('(1)')
-        ivs[unison & perf_fifth] = self.interval_list('(1,5)')
-        ivs[~perf_fourth & maj_sec] = self._shorthands['sus2']
-        ivs[perf_fourth & ~maj_sec] = self._shorthands['sus4']
-        ivs[min_third] = self._shorthands['min']
-        ivs[min_third & aug_fifth & ~perf_fifth] = self.interval_list('(1,b3,#5)')
-        ivs[min_third & dim_fifth & ~perf_fifth] = self._shorthands['dim']
-        ivs[maj_third] = self._shorthands['maj']
-        ivs[maj_third & dim_fifth & ~perf_fifth] = self.interval_list('(1,3,b5)')
-        ivs[maj_third & aug_fifth & ~perf_fifth] = self._shorthands['aug']
-        if not keep_bass:
-            reduced_chords['bass'] = 0
-        else:
-            # remove bass notes if they are not part of the intervals anymore
-            # (multiplying by the 0/1 interval entry at the bass position)
-            reduced_chords['bass'] *= ivs[range(len(reduced_chords)),
-                                          reduced_chords['bass']]
-        # keep -1 in bass for no chords
-        reduced_chords['bass'][no_chord] = -1
-        return reduced_chords
-    def convert_to_id(self, root, is_major):
-        """
-        Map a (root, is_major) pair to a chord id.
-        A root of -1 maps to 24; otherwise the id is ``root * 2`` for a major
-        chord and ``root * 2 + 1`` for any other chord.
-        """
-        if root == -1:
-            return 24
-        chord_id = root * 2
-        if not is_major:
-            chord_id = chord_id + 1
-        return chord_id
-    def get_converted_chord(self, filename):
-        """
-        Load a chord file, reduce its chords to triads and return a DataFrame
-        holding the columns produced by `assign_chord_id` plus the 'start' and
-        'end' values of each segment.
-        """
-        annotations = self.load_chords(filename)
-        reduced = self.reduce_to_triads(annotations['chord'])
-        frame = self.assign_chord_id(reduced)
-        frame['start'] = annotations['start']
-        frame['end'] = annotations['end']
-        return frame
-    def assign_chord_id(self, entry):
-        """
-        Build a DataFrame with the 'root' and 'is_major' fields of `entry` and
-        add a 'chord_id' column computed row by row with `convert_to_id`.
-        """
-        # maj, min chord only
-        # if you want to add other chord, change this part and get_converted_chord(reduce_to_triads)
-        def _row_to_id(row):
-            return self.convert_to_id(row['root'], row['is_major'])
-        frame = pd.DataFrame(data=entry[['root', 'is_major']])
-        frame['chord_id'] = frame.apply(_row_to_id, axis=1)
-        return frame
-    def convert_to_id_voca(self, root, quality):
-        """
-        Map a (root, quality) pair to a chord id in the large vocabulary.
-        A root of -1 maps to 169. A known quality maps to ``root * 14`` plus
-        the position of the quality in the table below; any other quality
-        maps to 168.
-        """
-        if root == -1:
-            return 169
-        # position in this tuple is the per-root offset of the quality
-        known_qualities = (
-            'min', 'maj', 'dim', 'aug', 'min6', 'maj6', 'min7',
-            'minmaj7', 'maj7', '7', 'dim7', 'hdim7', 'sus2', 'sus4',
-        )
-        for offset, name in enumerate(known_qualities):
-            if quality == name:
-                return root * 14 + offset
-        return 168
-    def lab_file_error_modify(self, ref_labels):
-        """
-        Repair known malformed chord labels in place and return the same list.
-        Labels ending in ':4', ':6' or ':6/2' are rewritten to the 'sus4' /
-        'maj6' shorthands, a few specific labels missing the ':' separator are
-        replaced, and any other label without ':' that contains 'min' gets a
-        ':' inserted before 'min'.
-        """
-        missing_colon = {
-            'Emin/4': 'E:min/4',
-            'A7/3': 'A:7/3',
-            'Bb7/3': 'Bb:7/3',
-            'Bb7/5': 'Bb:7/5',
-        }
-        for idx, label in enumerate(ref_labels):
-            if label.endswith(':4'):
-                fixed = label.replace(':4', ':sus4')
-            elif label.endswith(':6'):
-                fixed = label.replace(':6', ':maj6')
-            elif label.endswith(':6/2'):
-                fixed = label.replace(':6/2', ':maj6/2')
-            elif label in missing_colon:
-                fixed = missing_colon[label]
-            elif label.find(':') == -1 and label.find('min') != -1:
-                min_pos = label.find('min')
-                fixed = label[:min_pos] + ':' + label[min_pos:]
-            else:
-                continue
-            ref_labels[idx] = fixed
-        return ref_labels

audiocraft/audiocraft/data/info_audio_dataset.py DELETED Viewed

@@ -1,110 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""Base classes for the datasets that also provide non-audio metadata,
-e.g. description, text transcription etc.
-"""
-from dataclasses import dataclass
-import logging
-import math
-import re
-import typing as tp
-import torch
-from .audio_dataset import AudioDataset, AudioMeta
-from ..environment import AudioCraftEnvironment
-from ..modules.conditioners import SegmentWithAttributes, ConditioningAttributes
-logger = logging.getLogger(__name__)
-def _clusterify_meta(meta: AudioMeta) -> AudioMeta:
-    """Rewrite the paths stored in `meta` through the environment's dataset mappers.
-    The audio path is always remapped; the zip path of `info_path` is remapped
-    only when `info_path` is set. `meta` is modified in place and returned.
-    """
-    remap = AudioCraftEnvironment.apply_dataset_mappers
-    meta.path = remap(meta.path)
-    info_path = meta.info_path
-    if info_path is not None:
-        info_path.zip_path = remap(info_path.zip_path)
-    return meta
-def clusterify_all_meta(meta: tp.List[AudioMeta]) -> tp.List[AudioMeta]:
-    """Apply `_clusterify_meta` to every entry and return them as a new list."""
-    return list(map(_clusterify_meta, meta))
-@dataclass
-class AudioInfo(SegmentWithAttributes):
-    """Dummy SegmentInfo with empty attributes.
-    The InfoAudioDataset is expected to return metadata that inherits
-    from SegmentWithAttributes class and can return conditioning attributes.
-    This basically guarantees all datasets will be compatible with current
-    solver that contain conditioners requiring this.
-    """
-    # Optional token tensor; None unless a cached batch supplies it.
-    audio_tokens: tp.Optional[torch.Tensor] = None  # populated when using cached batch for training a LM.
-    def to_condition_attributes(self) -> ConditioningAttributes:
-        """Return a ConditioningAttributes instance built with no arguments."""
-        return ConditioningAttributes()
-class InfoAudioDataset(AudioDataset):
-    """AudioDataset whose items carry their metadata as an `AudioInfo` when
-    `return_info` is set, and are plain waveforms otherwise.
-    See `audiocraft.data.audio_dataset.AudioDataset` for initialization arguments.
-    """
-    def __init__(self, meta: tp.List[AudioMeta], **kwargs):
-        # remap every path for the current cluster before the base class sees it
-        remapped_meta = clusterify_all_meta(meta)
-        super().__init__(remapped_meta, **kwargs)
-    def __getitem__(self, index: int) -> tp.Union[torch.Tensor, tp.Tuple[torch.Tensor, SegmentWithAttributes]]:
-        item = super().__getitem__(index)
-        if self.return_info:
-            wav, meta = item
-            return wav, AudioInfo(**meta.to_dict())
-        assert isinstance(item, torch.Tensor)
-        return item
-def get_keyword_or_keyword_list(value: tp.Optional[str]) -> tp.Union[tp.Optional[str], tp.Optional[tp.List[str]]]:
-    """Preprocess `value` with `get_keyword_list` when it is a list, otherwise with `get_keyword`."""
-    preprocess = get_keyword_list if isinstance(value, list) else get_keyword
-    return preprocess(value)
-def get_string(value: tp.Optional[str]) -> tp.Optional[str]:
-    """Preprocess a single free-text string.
-    Returns the value stripped of surrounding whitespace (case is kept), or
-    None when the value is not a string, is empty, or is the literal 'None'.
-    """
-    if isinstance(value, str) and len(value) != 0 and value != 'None':
-        return value.strip()
-    return None
-def get_keyword(value: tp.Optional[str]) -> tp.Optional[str]:
-    """Preprocess a single keyword.
-    Returns the value stripped of surrounding whitespace and lower-cased, or
-    None when the value is not a string, is empty, or is the literal 'None'.
-    """
-    if isinstance(value, str) and len(value) != 0 and value != 'None':
-        return value.strip().lower()
-    return None
-def get_keyword_list(values: tp.Union[str, tp.List[str]]) -> tp.Optional[tp.List[str]]:
-    """Preprocess a list of keywords.
-    A string is split on commas and whitespace, a NaN float is treated as an
-    empty list, and any other non-list value is logged and wrapped as a
-    one-element list of its string form. Each entry then goes through
-    `get_keyword`; entries that come back as None are dropped. Returns None
-    when nothing is left.
-    """
-    if isinstance(values, str):
-        values = [part.strip() for part in re.split(r'[,\s]', values)]
-    elif isinstance(values, float) and math.isnan(values):
-        values = []
-    if not isinstance(values, list):
-        logger.debug(f"Unexpected keyword list {values}")
-        values = [str(values)]
-    cleaned = (get_keyword(entry) for entry in values)
-    kw_list = [keyword for keyword in cleaned if keyword is not None]
-    return kw_list if len(kw_list) != 0 else None

audiocraft/audiocraft/data/music_dataset.py DELETED Viewed

@@ -1,349 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""Dataset of music tracks with rich metadata.
-"""
-from dataclasses import dataclass, field, fields, replace
-import gzip
-import json
-import logging
-from pathlib import Path
-import random
-import typing as tp
-import pretty_midi
-import numpy as np
-import torch
-import torch.nn.functional as F
-from .btc_chords import Chords
-from .info_audio_dataset import (
-    InfoAudioDataset,
-    AudioInfo,
-    get_keyword_list,
-    get_keyword,
-    get_string
-)
-from ..modules.conditioners import (
-    ConditioningAttributes,
-    JointEmbedCondition,
-    WavCondition,
-    ChordCondition,
-    BeatCondition
-)
-from ..utils.utils import warn_once
-# Module-level logger, named after this module.
-logger = logging.getLogger(__name__)
-# Shared Chords instance; MusicDataset.__getitem__ calls CHORDS.chord(tag) on each label read from a chord file.
-CHORDS = Chords()
-@dataclass
-class MusicInfo(AudioInfo):
-    """Segment info augmented with music metadata.
-    """
-    # music-specific metadata
-    title: tp.Optional[str] = None
-    artist: tp.Optional[str] = None  # anonymized artist id, used to ensure no overlap between splits
-    key: tp.Optional[str] = None
-    bpm: tp.Optional[float] = None
-    genre: tp.Optional[str] = None
-    moods: tp.Optional[list] = None
-    keywords: tp.Optional[list] = None
-    description: tp.Optional[str] = None
-    name: tp.Optional[str] = None
-    instrument: tp.Optional[str] = None
-    chord: tp.Optional[ChordCondition] = None
-    beat: tp.Optional[BeatCondition] = None
-    # original wav accompanying the metadata
-    self_wav: tp.Optional[WavCondition] = None
-    # dict mapping attributes names to tuple of wav, text and metadata
-    joint_embed: tp.Dict[str, JointEmbedCondition] = field(default_factory=dict)
-    @property
-    def has_music_meta(self) -> bool:
-        return self.name is not None
-    def to_condition_attributes(self) -> ConditioningAttributes:
-        out = ConditioningAttributes()
-        for _field in fields(self):
-            key, value = _field.name, getattr(self, _field.name)
-            if key == 'self_wav':
-                out.wav[key] = value
-            elif key == 'chord':
-                out.chord[key] = value
-            elif key == 'beat':
-                out.beat[key] = value
-            elif key == 'joint_embed':
-                for embed_attribute, embed_cond in value.items():
-                    out.joint_embed[embed_attribute] = embed_cond
-            else:
-                if isinstance(value, list):
-                    value = ' '.join(value)
-                out.text[key] = value
-        return out
-    @staticmethod
-    def attribute_getter(attribute):
-        if attribute == 'bpm':
-            preprocess_func = get_bpm
-        elif attribute == 'key':
-            preprocess_func = get_musical_key
-        elif attribute in ['moods', 'keywords']:
-            preprocess_func = get_keyword_list
-        elif attribute in ['genre', 'name', 'instrument']:
-            preprocess_func = get_keyword
-        elif attribute in ['title', 'artist', 'description']:
-            preprocess_func = get_string
-        else:
-            preprocess_func = None
-        return preprocess_func
-    @classmethod
-    def from_dict(cls, dictionary: dict, fields_required: bool = False):
-        _dictionary: tp.Dict[str, tp.Any] = {}
-        # allow a subset of attributes to not be loaded from the dictionary
-        # these attributes may be populated later
-        post_init_attributes = ['self_wav', 'chord', 'beat', 'joint_embed']
-        optional_fields = ['keywords']
-        for _field in fields(cls):
-            if _field.name in post_init_attributes:
-                continue
-            elif _field.name not in dictionary:
-                if fields_required and _field.name not in optional_fields:
-                    raise KeyError(f"Unexpected missing key: {_field.name}")
-            else:
-                preprocess_func: tp.Optional[tp.Callable] = cls.attribute_getter(_field.name)
-                value = dictionary[_field.name]
-                if preprocess_func:
-                    value = preprocess_func(value)
-                _dictionary[_field.name] = value
-        return cls(**_dictionary)
-def augment_music_info_description(music_info: MusicInfo, merge_text_p: float = 0.,
-                                   drop_desc_p: float = 0., drop_other_p: float = 0.) -> MusicInfo:
-    """Augment MusicInfo description with additional metadata fields and potential dropout.
-    Additional textual attributes are added given probability 'merge_text_conditions_p' and
-    the original textual description is dropped from the augmented description given probability drop_desc_p.
-    Args:
-        music_info (MusicInfo): The music metadata to augment.
-        merge_text_p (float): Probability of merging additional metadata to the description.
-            If provided value is 0, then no merging is performed.
-        drop_desc_p (float): Probability of dropping the original description on text merge.
-            if provided value is 0, then no drop out is performed.
-        drop_other_p (float): Probability of dropping the other fields used for text augmentation.
-    Returns:
-        MusicInfo: The MusicInfo with augmented textual description.
-    """
-    def is_valid_field(field_name: str, field_value: tp.Any) -> bool:
-        valid_field_name = field_name in ['key', 'bpm', 'genre', 'moods', 'instrument', 'keywords']
-        valid_field_value = field_value is not None and isinstance(field_value, (int, float, str, list))
-        keep_field = random.uniform(0, 1) < drop_other_p
-        return valid_field_name and valid_field_value and keep_field
-    def process_value(v: tp.Any) -> str:
-        if isinstance(v, (int, float, str)):
-            return str(v)
-        if isinstance(v, list):
-            return ", ".join(v)
-        else:
-            raise ValueError(f"Unknown type for text value! ({type(v), v})")
-    description = music_info.description
-    metadata_text = ""
-    # metadata_text = "rock style music, consistent rhythm, catchy song."
-    if random.uniform(0, 1) < merge_text_p:
-        meta_pairs = [f'{_field.name}: {process_value(getattr(music_info, _field.name))}'
-                      for _field in fields(music_info) if is_valid_field(_field.name, getattr(music_info, _field.name))]
-        random.shuffle(meta_pairs)
-        metadata_text = ". ".join(meta_pairs)
-        description = description if not random.uniform(0, 1) < drop_desc_p else None
-        logger.debug(f"Applying text augmentation on MMI info. description: {description}, metadata: {metadata_text}")
-    if description is None:
-        description = metadata_text if len(metadata_text) > 1 else None
-    else:
-        description = ". ".join([description.rstrip('.'), metadata_text])
-    description = description.strip() if description else None
-    music_info = replace(music_info)
-    music_info.description = description
-    return music_info
-class Paraphraser:
-    def __init__(self, paraphrase_source: tp.Union[str, Path], paraphrase_p: float = 0.):
-        self.paraphrase_p = paraphrase_p
-        open_fn = gzip.open if str(paraphrase_source).lower().endswith('.gz') else open
-        with open_fn(paraphrase_source, 'rb') as f:  # type: ignore
-            self.paraphrase_source = json.loads(f.read())
-        logger.info(f"loaded paraphrasing source from: {paraphrase_source}")
-    def sample_paraphrase(self, audio_path: str, description: str):
-        if random.random() >= self.paraphrase_p:
-            return description
-        info_path = Path(audio_path).with_suffix('.json')
-        if info_path not in self.paraphrase_source:
-            warn_once(logger, f"{info_path} not in paraphrase source!")
-            return description
-        new_desc = random.choice(self.paraphrase_source[info_path])
-        logger.debug(f"{description} -> {new_desc}")
-        return new_desc
-class MusicDataset(InfoAudioDataset):
-    """Music dataset is an AudioDataset with music-related metadata.
-    Args:
-        info_fields_required (bool): Whether to enforce having required fields.
-        merge_text_p (float): Probability of merging additional metadata to the description.
-        drop_desc_p (float): Probability of dropping the original description on text merge.
-        drop_other_p (float): Probability of dropping the other fields used for text augmentation.
-        joint_embed_attributes (list[str]): A list of attributes for which joint embedding metadata is returned.
-        paraphrase_source (str, optional): Path to the .json or .json.gz file containing the
-            paraphrases for the description. The json should be a dict with keys are the
-            original info path (e.g. track_path.json) and each value is a list of possible
-            paraphrased.
-        paraphrase_p (float): probability of taking a paraphrase.
-    See `audiocraft.data.info_audio_dataset.InfoAudioDataset` for full initialization arguments.
-    """
-    def __init__(self, *args, info_fields_required: bool = True,
-                 merge_text_p: float = 0., drop_desc_p: float = 0., drop_other_p: float = 0.,
-                 joint_embed_attributes: tp.List[str] = [],
-                 paraphrase_source: tp.Optional[str] = None, paraphrase_p: float = 0,
-                 **kwargs):
-        kwargs['return_info'] = True  # We require the info for each song of the dataset.
-        super().__init__(*args, **kwargs)
-        self.info_fields_required = info_fields_required
-        self.merge_text_p = merge_text_p
-        self.drop_desc_p = drop_desc_p
-        self.drop_other_p = drop_other_p
-        self.joint_embed_attributes = joint_embed_attributes
-        self.paraphraser = None
-        self.downsample_rate = 640
-        self.sr = 32000
-        if paraphrase_source is not None:
-            self.paraphraser = Paraphraser(paraphrase_source, paraphrase_p)
-    def __getitem__(self, index):
-        wav, info = super().__getitem__(index) # wav_seg and seg_info
-        info_data = info.to_dict()
-        # unpack info
-        target_sr = self.sr
-        n_frames_wave = info.n_frames
-        n_frames_feat = int(info.n_frames // self.downsample_rate)
-        music_info_path = str(info.meta.path).replace('no_vocal.wav', 'tags.json')
-        chord_path = str(info.meta.path).replace('no_vocal.wav', 'chord.lab')
-        beats_path = str(info.meta.path).replace('no_vocal.wav', 'beats.npy')
-        if all([
-            not Path(music_info_path).exists(),
-            not Path(beats_path).exists(),
-            not Path(chord_path).exists(),
-        ]):
-            raise FileNotFoundError
-        ### music info
-        with open(music_info_path, 'r') as json_file:
-            music_data = json.load(json_file)
-            music_data.update(info_data)
-            music_info = MusicInfo.from_dict(music_data, fields_required=self.info_fields_required)
-        if self.paraphraser is not None:
-                music_info.description = self.paraphraser.sample(music_info.meta.path, music_info.description)
-        if self.merge_text_p:
-            music_info = augment_music_info_description(
-                music_info, self.merge_text_p, self.drop_desc_p, self.drop_other_p)
-        ### load features to tensors ###
-        feat_hz = target_sr/self.downsample_rate
-        ## beat&bar: 2 x T
-        feat_beats = np.zeros((2, n_frames_feat))
-        beats_np = np.load(beats_path)
-        beat_time = beats_np[:, 0]
-        bar_time = beats_np[np.where(beats_np[:, 1] == 1)[0], 0]
-        beat_frame = [
-            int((t-info.seek_time)*feat_hz) for t in beat_time
-                if (t >= info.seek_time and t < info.seek_time + self.segment_duration)]
-        bar_frame =[
-            int((t-info.seek_time)*feat_hz) for t in bar_time
-                if (t >= info.seek_time and t < info.seek_time + self.segment_duration)]
-        feat_beats[0, beat_frame] = 1
-        feat_beats[1, bar_frame] = 1
-        kernel = np.array([0.05, 0.1, 0.3, 0.9, 0.3, 0.1, 0.05])
-        feat_beats[0] = np.convolve(feat_beats[0] , kernel, 'same') # apply soft kernel
-        beat_events = feat_beats[0] + feat_beats[1]
-        beat_events = torch.tensor(beat_events).unsqueeze(0) # [T] -> [1, T]
-        music_info.beat = BeatCondition(beat=beat_events[None], length=torch.tensor([n_frames_feat]),
-                                        bpm=[music_data["bpm"]], path=[music_info_path], seek_frame=[info.seek_time*target_sr//self.downsample_rate])
-        ## chord: 12 x T
-        feat_chord = np.zeros((12, n_frames_feat)) # root| ivs
-        with open(chord_path, 'r') as f:
-            for line in f.readlines():
-                splits = line.split()
-                if len(splits) == 3:
-                    st_sec, ed_sec, ctag = splits
-                    st_sec = float(st_sec) - info.seek_time
-                    ed_sec = float(ed_sec) - info.seek_time
-                    st_frame = int(st_sec*feat_hz)
-                    ed_frame = int(ed_sec*feat_hz)
-                    # 12 chorma
-                    mhot = CHORDS.chord(ctag)
-                    final_vec = np.roll(mhot[2], mhot[0])
-                    final_vec = final_vec[..., None]
-                    feat_chord[:, st_frame:ed_frame] = final_vec
-        feat_chord = torch.from_numpy(feat_chord)
-        music_info.chord = ChordCondition(
-                chord=feat_chord[None], length=torch.tensor([n_frames_feat]),
-                bpm=[music_data["bpm"]], path=[chord_path], seek_frame=[info.seek_time*self.sr//self.downsample_rate])
-        music_info.self_wav = WavCondition(
-            wav=wav[None], length=torch.tensor([info.n_frames]),
-            sample_rate=[info.sample_rate], path=[info.meta.path], seek_time=[info.seek_time])
-        for att in self.joint_embed_attributes:
-            att_value = getattr(music_info, att)
-            joint_embed_cond = JointEmbedCondition(
-                wav[None], [att_value], torch.tensor([info.n_frames]),
-                sample_rate=[info.sample_rate], path=[info.meta.path], seek_time=[info.seek_time])
-            music_info.joint_embed[att] = joint_embed_cond
-        return wav, music_info
-def get_musical_key(value: tp.Optional[str]) -> tp.Optional[str]:
-    """Preprocess key keywords, discarding them if there are multiple key defined."""
-    if value is None or (not isinstance(value, str)) or len(value) == 0 or value == 'None':
-        return None
-    elif ',' in value:
-        # For now, we discard when multiple keys are defined separated with comas
-        return None
-    else:
-        return value.strip().lower()
-def get_bpm(value: tp.Optional[str]) -> tp.Optional[float]:
-    """Preprocess to a float."""
-    if value is None:
-        return None
-    try:
-        return float(value)
-    except ValueError:
-        return None

audiocraft/audiocraft/data/sound_dataset.py DELETED Viewed

@@ -1,330 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""Dataset of audio with a simple description.
-"""
-from dataclasses import dataclass, fields, replace
-import json
-from pathlib import Path
-import random
-import typing as tp
-import numpy as np
-import torch
-from .info_audio_dataset import (
-    InfoAudioDataset,
-    get_keyword_or_keyword_list
-)
-from ..modules.conditioners import (
-    ConditioningAttributes,
-    SegmentWithAttributes,
-    WavCondition,
-)
-# float32 machine epsilon, added to denominators in this module to avoid dividing by zero.
-EPS = torch.finfo(torch.float32).eps
-# Bounds passed to np.random.randint in snr_mixer when drawing the RMS level of the mixed signal.
-TARGET_LEVEL_LOWER = -35
-TARGET_LEVEL_UPPER = -15
-@dataclass
-class SoundInfo(SegmentWithAttributes):
-    """Segment info augmented with Sound metadata.
-    """
-    description: tp.Optional[str] = None
-    self_wav: tp.Optional[torch.Tensor] = None
-    @property
-    def has_sound_meta(self) -> bool:
-        return self.description is not None
-    def to_condition_attributes(self) -> ConditioningAttributes:
-        out = ConditioningAttributes()
-        for _field in fields(self):
-            key, value = _field.name, getattr(self, _field.name)
-            if key == 'self_wav':
-                out.wav[key] = value
-            else:
-                out.text[key] = value
-        return out
-    @staticmethod
-    def attribute_getter(attribute):
-        if attribute == 'description':
-            preprocess_func = get_keyword_or_keyword_list
-        else:
-            preprocess_func = None
-        return preprocess_func
-    @classmethod
-    def from_dict(cls, dictionary: dict, fields_required: bool = False):
-        _dictionary: tp.Dict[str, tp.Any] = {}
-        # allow a subset of attributes to not be loaded from the dictionary
-        # these attributes may be populated later
-        post_init_attributes = ['self_wav']
-        for _field in fields(cls):
-            if _field.name in post_init_attributes:
-                continue
-            elif _field.name not in dictionary:
-                if fields_required:
-                    raise KeyError(f"Unexpected missing key: {_field.name}")
-            else:
-                preprocess_func: tp.Optional[tp.Callable] = cls.attribute_getter(_field.name)
-                value = dictionary[_field.name]
-                if preprocess_func:
-                    value = preprocess_func(value)
-                _dictionary[_field.name] = value
-        return cls(**_dictionary)
-class SoundDataset(InfoAudioDataset):
-    """Sound audio dataset: Audio dataset with environmental sound-specific metadata.
-    Args:
-        info_fields_required (bool): Whether all the mandatory metadata fields should be in the loaded metadata.
-        external_metadata_source (tp.Optional[str]): Folder containing JSON metadata for the corresponding dataset.
-            The metadata files contained in this folder are expected to match the stem of the audio file with
-            a json extension.
-        aug_p (float): Probability of performing audio mixing augmentation on the batch.
-        mix_p (float): Proportion of batch items that are mixed together when applying audio mixing augmentation.
-        mix_snr_low (int): Lowerbound for SNR value sampled for mixing augmentation.
-        mix_snr_high (int): Upperbound for SNR value sampled for mixing augmentation.
-        mix_min_overlap (float): Minimum overlap between audio files when performing mixing augmentation.
-        kwargs: Additional arguments for AudioDataset.
-    See `audiocraft.data.info_audio_dataset.InfoAudioDataset` for full initialization arguments.
-    """
-    def __init__(
-        self,
-        *args,
-        info_fields_required: bool = True,
-        external_metadata_source: tp.Optional[str] = None,
-        aug_p: float = 0.,
-        mix_p: float = 0.,
-        mix_snr_low: int = -5,
-        mix_snr_high: int = 5,
-        mix_min_overlap: float = 0.5,
-        **kwargs
-    ):
-        kwargs['return_info'] = True  # We require the info for each song of the dataset.
-        super().__init__(*args, **kwargs)
-        self.info_fields_required = info_fields_required
-        self.external_metadata_source = external_metadata_source
-        self.aug_p = aug_p
-        self.mix_p = mix_p
-        if self.aug_p > 0:
-            assert self.mix_p > 0, "Expecting some mixing proportion mix_p if aug_p > 0"
-            assert self.channels == 1, "SoundDataset with audio mixing considers only monophonic audio"
-        self.mix_snr_low = mix_snr_low
-        self.mix_snr_high = mix_snr_high
-        self.mix_min_overlap = mix_min_overlap
-    def _get_info_path(self, path: tp.Union[str, Path]) -> Path:
-        """Get path of JSON with metadata (description, etc.).
-        If there exists a JSON with the same name as 'path.name', then it will be used.
-        Else, such JSON will be searched for in an external json source folder if it exists.
-        """
-        info_path = Path(path).with_suffix('.json')
-        if Path(info_path).exists():
-            return info_path
-        elif self.external_metadata_source and (Path(self.external_metadata_source) / info_path.name).exists():
-            return Path(self.external_metadata_source) / info_path.name
-        else:
-            raise Exception(f"Unable to find a metadata JSON for path: {path}")
-    def __getitem__(self, index):
-        wav, info = super().__getitem__(index)
-        info_data = info.to_dict()
-        info_path = self._get_info_path(info.meta.path)
-        if Path(info_path).exists():
-            with open(info_path, 'r') as json_file:
-                sound_data = json.load(json_file)
-                sound_data.update(info_data)
-                sound_info = SoundInfo.from_dict(sound_data, fields_required=self.info_fields_required)
-                # if there are multiple descriptions, sample one randomly
-                if isinstance(sound_info.description, list):
-                    sound_info.description = random.choice(sound_info.description)
-        else:
-            sound_info = SoundInfo.from_dict(info_data, fields_required=False)
-        sound_info.self_wav = WavCondition(
-            wav=wav[None], length=torch.tensor([info.n_frames]),
-            sample_rate=[sound_info.sample_rate], path=[info.meta.path], seek_time=[info.seek_time])
-        return wav, sound_info
-    def collater(self, samples):
-        # when training, audio mixing is performed in the collate function
-        wav, sound_info = super().collater(samples)  # SoundDataset always returns infos
-        if self.aug_p > 0:
-            wav, sound_info = mix_samples(wav, sound_info, self.aug_p, self.mix_p,
-                                          snr_low=self.mix_snr_low, snr_high=self.mix_snr_high,
-                                          min_overlap=self.mix_min_overlap)
-        return wav, sound_info
-def rms_f(x: torch.Tensor) -> torch.Tensor:
-    return (x ** 2).mean(1).pow(0.5)
-def normalize(audio: torch.Tensor, target_level: int = -25) -> torch.Tensor:
-    """Normalize the signal to the target level."""
-    rms = rms_f(audio)
-    scalar = 10 ** (target_level / 20) / (rms + EPS)
-    audio = audio * scalar.unsqueeze(1)
-    return audio
-def is_clipped(audio: torch.Tensor, clipping_threshold: float = 0.99) -> torch.Tensor:
-    return (abs(audio) > clipping_threshold).any(1)
-def mix_pair(src: torch.Tensor, dst: torch.Tensor, min_overlap: float) -> torch.Tensor:
-    start = random.randint(0, int(src.shape[1] * (1 - min_overlap)))
-    remainder = src.shape[1] - start
-    if dst.shape[1] > remainder:
-        src[:, start:] = src[:, start:] + dst[:, :remainder]
-    else:
-        src[:, start:start+dst.shape[1]] = src[:, start:start+dst.shape[1]] + dst
-    return src
-def snr_mixer(clean: torch.Tensor, noise: torch.Tensor, snr: int, min_overlap: float,
-              target_level: int = -25, clipping_threshold: float = 0.99) -> torch.Tensor:
-    """Function to mix clean speech and noise at various SNR levels.
-    Args:
-        clean (torch.Tensor): Clean audio source to mix, of shape [B, T].
-        noise (torch.Tensor): Noise audio source to mix, of shape [B, T].
-        snr (int): SNR level when mixing.
-        min_overlap (float): Minimum overlap between the two mixed sources.
-        target_level (int): Gain level in dB.
-        clipping_threshold (float): Threshold for clipping the audio.
-    Returns:
-        torch.Tensor: The mixed audio, of shape [B, T].
-    """
-    if clean.shape[1] > noise.shape[1]:
-        noise = torch.nn.functional.pad(noise, (0, clean.shape[1] - noise.shape[1]))
-    else:
-        noise = noise[:, :clean.shape[1]]
-    # normalizing to -25 dB FS
-    clean = clean / (clean.max(1)[0].abs().unsqueeze(1) + EPS)
-    clean = normalize(clean, target_level)
-    rmsclean = rms_f(clean)
-    noise = noise / (noise.max(1)[0].abs().unsqueeze(1) + EPS)
-    noise = normalize(noise, target_level)
-    rmsnoise = rms_f(noise)
-    # set the noise level for a given SNR
-    noisescalar = (rmsclean / (10 ** (snr / 20)) / (rmsnoise + EPS)).unsqueeze(1)
-    noisenewlevel = noise * noisescalar
-    # mix noise and clean speech
-    noisyspeech = mix_pair(clean, noisenewlevel, min_overlap)
-    # randomly select RMS value between -15 dBFS and -35 dBFS and normalize noisyspeech with that value
-    # there is a chance of clipping that might happen with very less probability, which is not a major issue.
-    noisy_rms_level = np.random.randint(TARGET_LEVEL_LOWER, TARGET_LEVEL_UPPER)
-    rmsnoisy = rms_f(noisyspeech)
-    scalarnoisy = (10 ** (noisy_rms_level / 20) / (rmsnoisy + EPS)).unsqueeze(1)
-    noisyspeech = noisyspeech * scalarnoisy
-    clean = clean * scalarnoisy
-    noisenewlevel = noisenewlevel * scalarnoisy
-    # final check to see if there are any amplitudes exceeding +/- 1. If so, normalize all the signals accordingly
-    clipped = is_clipped(noisyspeech)
-    if clipped.any():
-        noisyspeech_maxamplevel = noisyspeech[clipped].max(1)[0].abs().unsqueeze(1) / (clipping_threshold - EPS)
-        noisyspeech[clipped] = noisyspeech[clipped] / noisyspeech_maxamplevel
-    return noisyspeech
-def snr_mix(src: torch.Tensor, dst: torch.Tensor, snr_low: int, snr_high: int, min_overlap: float):
-    if snr_low == snr_high:
-        snr = snr_low
-    else:
-        snr = np.random.randint(snr_low, snr_high)
-    mix = snr_mixer(src, dst, snr, min_overlap)
-    return mix
-def mix_text(src_text: str, dst_text: str):
-    """Mix text from different sources by concatenating them."""
-    if src_text == dst_text:
-        return src_text
-    return src_text + " " + dst_text
-def mix_samples(wavs: torch.Tensor, infos: tp.List[SoundInfo], aug_p: float, mix_p: float,
-                snr_low: int, snr_high: int, min_overlap: float):
-    """Mix samples within a batch, summing the waveforms and concatenating the text infos.
-    Args:
-        wavs (torch.Tensor): Audio tensors of shape [B, C, T].
-        infos (list[SoundInfo]): List of SoundInfo items corresponding to the audio.
-        aug_p (float): Augmentation probability.
-        mix_p (float): Proportion of items in the batch to mix (and merge) together.
-        snr_low (int): Lowerbound for sampling SNR.
-        snr_high (int): Upperbound for sampling SNR.
-        min_overlap (float): Minimum overlap between mixed samples.
-    Returns:
-        tuple[torch.Tensor, list[SoundInfo]]: A tuple containing the mixed wavs
-            and mixed SoundInfo for the given batch.
-    """
-    # no mixing to perform within the batch
-    if mix_p == 0:
-        return wavs, infos
-    if random.uniform(0, 1) < aug_p:
-        # perform all augmentations on waveforms as [B, T]
-        # randomly picking pairs of audio to mix
-        assert wavs.size(1) == 1, f"Mix samples requires monophonic audio but C={wavs.size(1)}"
-        wavs = wavs.mean(dim=1, keepdim=False)
-        B, T = wavs.shape
-        k = int(mix_p * B)
-        mixed_sources_idx = torch.randperm(B)[:k]
-        mixed_targets_idx = torch.randperm(B)[:k]
-        aug_wavs = snr_mix(
-            wavs[mixed_sources_idx],
-            wavs[mixed_targets_idx],
-            snr_low,
-            snr_high,
-            min_overlap,
-        )
-        # mixing textual descriptions in metadata
-        descriptions = [info.description for info in infos]
-        aug_infos = []
-        for i, j in zip(mixed_sources_idx, mixed_targets_idx):
-            text = mix_text(descriptions[i], descriptions[j])
-            m = replace(infos[i])
-            m.description = text
-            aug_infos.append(m)
-        # back to [B, C, T]
-        aug_wavs = aug_wavs.unsqueeze(1)
-        assert aug_wavs.shape[0] > 0, "Samples mixing returned empty batch."
-        assert aug_wavs.dim() == 3, f"Returned wav should be [B, C, T] but dim = {aug_wavs.dim()}"
-        assert aug_wavs.shape[0] == len(aug_infos), "Mismatch between number of wavs and infos in the batch"
-        return aug_wavs, aug_infos  # [B, C, T]
-    else:
-        # randomly pick samples in the batch to match
-        # the batch size when performing audio mixing
-        B, C, T = wavs.shape
-        k = int(mix_p * B)
-        wav_idx = torch.randperm(B)[:k]
-        wavs = wavs[wav_idx]
-        infos = [infos[i] for i in wav_idx]
-        assert wavs.shape[0] == len(infos), "Mismatch between number of wavs and infos in the batch"
-        return wavs, infos  # [B, C, T]

audiocraft/audiocraft/data/zip.py DELETED Viewed

@@ -1,76 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""Utility for reading some info from inside a zip file.
-"""
-import typing
-import zipfile
-from dataclasses import dataclass
-from functools import lru_cache
-from typing_extensions import Literal
-# Default size of the LRU cache wrapped around _open_zip.
-DEFAULT_SIZE = 32
-# Mode strings accepted by _open_zip.
-MODE = Literal['r', 'w', 'x', 'a']
-@dataclass(order=True)
-class PathInZip:
-    """Hold a path of file within a zip file.
-    Args:
-        path (str): The convention is <path_to_zip>:<relative_path_inside_zip>.
-            Let's assume there is a zip file /some/location/foo.zip
-            and inside of it is a json file located at /data/file1.json,
-            Then we expect path = "/some/location/foo.zip:/data/file1.json".
-    """
-    INFO_PATH_SEP = ':'
-    zip_path: str
-    file_path: str
-    def __init__(self, path: str) -> None:
-        split_path = path.split(self.INFO_PATH_SEP)
-        assert len(split_path) == 2
-        self.zip_path, self.file_path = split_path
-    @classmethod
-    def from_paths(cls, zip_path: str, file_path: str):
-        return cls(zip_path + cls.INFO_PATH_SEP + file_path)
-    def __str__(self) -> str:
-        return self.zip_path + self.INFO_PATH_SEP + self.file_path
-def _open_zip(path: str, mode: MODE = 'r'):
-    return zipfile.ZipFile(path, mode)
-# LRU-cached variant of _open_zip; set_zip_cache_size rebinds it with another cache size.
-_cached_open_zip = lru_cache(DEFAULT_SIZE)(_open_zip)
-def set_zip_cache_size(max_size: int):
-    """Sets the maximal LRU caching for zip file opening.
-    Args:
-        max_size (int): the maximal LRU cache.
-    """
-    global _cached_open_zip
-    _cached_open_zip = lru_cache(max_size)(_open_zip)
-def open_file_in_zip(path_in_zip: PathInZip, mode: str = 'r') -> typing.IO:
-    """Opens a file stored inside a zip and returns a file-like object.
-    Args:
-        path_in_zip (PathInZip): A PathInZip object representing the file to return a file-like object of.
-        mode (str): The mode in which to open the file with.
-    Returns:
-        A file-like object for PathInZip.
-    """
-    zf = _cached_open_zip(path_in_zip.zip_path)
-    return zf.open(path_in_zip.file_path)

audiocraft/audiocraft/environment.py DELETED Viewed

@@ -1,176 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Provides cluster and tools configuration across clusters (slurm, dora, utilities).
-"""
-import logging
-import os
-from pathlib import Path
-import re
-import typing as tp
-import omegaconf
-from .utils.cluster import _guess_cluster_type
-logger = logging.getLogger(__name__)
-class AudioCraftEnvironment:
-    """Environment configuration for teams and clusters.
-    AudioCraftEnvironment picks compute cluster settings (slurm, dora) from the current running environment
-    or declared variable and the loaded team configuration. Additionally, the AudioCraftEnvironment
-    provides pointers to a reference folder resolved automatically across clusters that is shared across team members,
-    allowing to share sigs or other files to run jobs. Finally, it provides dataset mappers to automatically
-    map dataset file paths to new locations across clusters, allowing to use the same manifest of files across clusters.
-    The cluster type is identified automatically and the base configuration file is read from
-    config/teams/<team>.yaml.
-    Use the following environment variables to specify the cluster, team or configuration:
-        AUDIOCRAFT_CLUSTER (optional): Cluster type to enforce. Useful if the cluster type
-            cannot be inferred automatically.
-        AUDIOCRAFT_CONFIG (optional): Path to yaml config holding the teams configuration.
-            If not set, configuration is read from config/teams/<team>.yaml.
-        AUDIOCRAFT_TEAM (optional): Name of the team. Recommended to set to your own team.
-            Cluster configuration are shared across teams to match compute allocation,
-            specify your cluster configuration in the configuration file under a key mapping
-            your team name.
-    """
-    # Lazily created singleton, see `instance()` and `reset()`.
-    _instance = None
-    DEFAULT_TEAM = "default"
-    def __init__(self) -> None:
-        """Loads configuration."""
-        self.team: str = os.getenv("AUDIOCRAFT_TEAM", self.DEFAULT_TEAM)
-        cluster_type = _guess_cluster_type()
-        # The environment variable takes precedence over the guessed cluster type.
-        cluster = os.getenv(
-            "AUDIOCRAFT_CLUSTER", cluster_type.value
-        )
-        logger.info("Detecting cluster type %s", cluster_type)
-        self.cluster: str = cluster
-        config_path = os.getenv(
-            "AUDIOCRAFT_CONFIG",
-            Path(__file__)
-            .parent.parent.joinpath("config/teams", self.team)
-            .with_suffix(".yaml"),
-        )
-        self.config = omegaconf.OmegaConf.load(config_path)
-        # List of (compiled regex, replacement) pairs applied by `apply_dataset_mappers`.
-        self._dataset_mappers = []
-        cluster_config = self._get_cluster_config()
-        if "dataset_mappers" in cluster_config:
-            for pattern, repl in cluster_config["dataset_mappers"].items():
-                regex = re.compile(pattern)
-                self._dataset_mappers.append((regex, repl))
-    def _get_cluster_config(self) -> omegaconf.DictConfig:
-        """Returns the section of the loaded configuration keyed by the current cluster."""
-        assert isinstance(self.config, omegaconf.DictConfig)
-        return self.config[self.cluster]
-    @classmethod
-    def instance(cls):
-        """Returns the shared environment, creating it on first use."""
-        if cls._instance is None:
-            cls._instance = cls()
-        return cls._instance
-    @classmethod
-    def reset(cls):
-        """Clears the environment and forces a reload on next invocation."""
-        cls._instance = None
-    @classmethod
-    def get_team(cls) -> str:
-        """Gets the selected team as dictated by the AUDIOCRAFT_TEAM env var.
-        If not defined, defaults to DEFAULT_TEAM ("default").
-        """
-        return cls.instance().team
-    @classmethod
-    def get_cluster(cls) -> str:
-        """Gets the detected cluster.
-        This value can be overridden by the AUDIOCRAFT_CLUSTER env var.
-        """
-        return cls.instance().cluster
-    @classmethod
-    def get_dora_dir(cls) -> Path:
-        """Gets the path to the dora directory for the current team and cluster.
-        Value is overridden by the AUDIOCRAFT_DORA_DIR env var.
-        """
-        cluster_config = cls.instance()._get_cluster_config()
-        dora_dir = os.getenv("AUDIOCRAFT_DORA_DIR", cluster_config["dora_dir"])
-        logger.warning(f"Dora directory: {dora_dir}")
-        return Path(dora_dir)
-    @classmethod
-    def get_reference_dir(cls) -> Path:
-        """Gets the path to the reference directory for the current team and cluster.
-        Value is overridden by the AUDIOCRAFT_REFERENCE_DIR env var.
-        """
-        cluster_config = cls.instance()._get_cluster_config()
-        return Path(os.getenv("AUDIOCRAFT_REFERENCE_DIR", cluster_config["reference_dir"]))
-    @classmethod
-    def get_slurm_exclude(cls) -> tp.Optional[str]:
-        """Get the list of nodes to exclude for that cluster."""
-        cluster_config = cls.instance()._get_cluster_config()
-        return cluster_config.get("slurm_exclude")
-    @classmethod
-    def get_slurm_partitions(cls, partition_types: tp.Optional[tp.List[str]] = None) -> str:
-        """Gets the requested partitions for the current team and cluster as a comma-separated string.
-        Args:
-            partition_types (list[str], optional): partition types to retrieve. Values must be
-                from ['global', 'team']. If not provided, the global partition is returned.
-        """
-        if not partition_types:
-            partition_types = ["global"]
-        cluster_config = cls.instance()._get_cluster_config()
-        partitions = [
-            cluster_config["partitions"][partition_type]
-            for partition_type in partition_types
-        ]
-        return ",".join(partitions)
-    @classmethod
-    def resolve_reference_path(cls, path: tp.Union[str, Path]) -> Path:
-        """Converts reference placeholder in path with configured reference dir to resolve paths.
-        Args:
-            path (str or Path): Path to resolve.
-        Returns:
-            Path: Resolved path.
-        Raises:
-            AssertionError: If the path uses the placeholder and the reference
-                directory does not exist or is not a directory.
-        """
-        path = str(path)
-        placeholder = "//reference"
-        if path.startswith(placeholder):
-            reference_dir = cls.get_reference_dir()
-            logger.warning(f"Reference directory: {reference_dir}")
-            assert (
-                reference_dir.exists() and reference_dir.is_dir()
-            ), f"Reference directory does not exist: {reference_dir}."
-            # Plain concatenation: with re.sub the directory would be read as a replacement
-            # template, so backslashes or group references in it would be misinterpreted.
-            path = str(reference_dir) + path[len(placeholder):]
-        return Path(path)
-    @classmethod
-    def apply_dataset_mappers(cls, path: str) -> str:
-        """Applies dataset mapping regex rules as defined in the configuration.
-        If no rules are defined, the path is returned as-is.
-        """
-        instance = cls.instance()
-        for pattern, repl in instance._dataset_mappers:
-            path = pattern.sub(repl, path)
-        return path

audiocraft/audiocraft/grids/__init__.py DELETED Viewed

@@ -1,6 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""Dora Grids."""

audiocraft/audiocraft/grids/_base_explorers.py DELETED Viewed

@@ -1,80 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-from abc import ABC, abstractmethod
-import time
-import typing as tp
-from dora import Explorer
-import treetable as tt
-def get_sheep_ping(sheep) -> tp.Optional[str]:
-    """Describe how long ago the Sheep's log file was last modified.
-    The elapsed time is formatted with one decimal in the largest fitting unit among
-    days (d), hours (h), minutes (m) and seconds (s).
-    Returns:
-        The formatted elapsed time, or None when the Sheep has no log or the log
-        file does not exist.
-    """
-    if sheep.log is None or not sheep.log.exists():
-        return None
-    elapsed = time.time() - sheep.log.stat().st_mtime
-    if elapsed > 3600 * 24:
-        return f'{elapsed / (3600 * 24):.1f}d'
-    if elapsed > 3600:
-        return f'{elapsed / (3600):.1f}h'
-    if elapsed > 60:
-        return f'{elapsed / 60:.1f}m'
-    return f'{elapsed:.1f}s'
-class BaseExplorer(ABC, Explorer):
-    """Base explorer for AudioCraft grids.
-    All task specific solvers are expected to implement the `get_grid_metrics`
-    method to specify logic about metrics to display for a given task.
-    If additional stages are used, the child explorer must define how to handle
-    these new stages in the `process_history` and `process_sheep` methods.
-    """
-    def stages(self):
-        """Returns the names of the stages that receive the ping in `process_sheep`."""
-        return ["train", "valid", "evaluate"]
-    def get_grid_meta(self):
-        """Returns the list of Meta information to display for each XP/job.
-        """
-        return [
-            tt.leaf("index", align=">"),
-            tt.leaf("name", wrap=140),
-            tt.leaf("state"),
-            tt.leaf("sig", align=">"),
-            tt.leaf("sid", align="<"),
-        ]
-    @abstractmethod
-    def get_grid_metrics(self):
-        """Return the metrics that should be displayed in the tracking table.
-        """
-        ...
-    def process_sheep(self, sheep, history):
-        """Merges the metrics of `history` into one dict per stage.
-        Entries of `history` are applied in order, so for a given stage a later
-        entry overrides the values of an earlier one. Any 'duration' value is
-        converted from seconds to minutes, and the log ping of the sheep, when
-        available, is added to every stage listed by `stages()`.
-        Args:
-            sheep: Sheep whose log is used to compute the ping.
-            history: Sequence of mappings from stage name to a mapping of metrics.
-                It is left unmodified.
-        Returns:
-            dict: Mapping from stage name to the merged metrics of that stage.
-        """
-        train = {
-            "epoch": len(history),
-        }
-        parts = {"train": train}
-        for metrics in history:
-            for key, sub in metrics.items():
-                # Copy first: rescaling in place would alter the caller's history, and
-                # a second call on the same history would divide the duration again.
-                sub = dict(sub)
-                if 'duration' in sub:
-                    # Convert to minutes for readability.
-                    sub['duration'] = sub['duration'] / 60.
-                parts.setdefault(key, {}).update(sub)
-        ping = get_sheep_ping(sheep)
-        if ping is not None:
-            for name in self.stages():
-                # Add the ping to each part for convenience.
-                parts.setdefault(name, {})['ping'] = ping
-        return parts

audiocraft/audiocraft/grids/audiogen/__init__.py DELETED Viewed

@@ -1,6 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""AudioGen grids."""

audiocraft/audiocraft/grids/audiogen/audiogen_base_16khz.py DELETED Viewed

@@ -1,23 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-from ..musicgen._explorers import LMExplorer
-from ...environment import AudioCraftEnvironment
-@LMExplorer
-def explorer(launcher):
-    """Schedule a single medium-scale XP for the audiogen_base_16khz solver on 64 GPUs."""
-    slurm_partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
-    launcher.slurm_(gpus=64, partition=slurm_partitions)
-    launcher.bind_(solver='audiogen/audiogen_base_16khz')
-    # replace this by the desired environmental sound dataset
-    launcher.bind_(dset='internal/sounds_16khz')
-    # FSDP options are bound on the launcher itself, the model scale is given per XP.
-    launcher.bind_({'autocast': False, 'fsdp.use': True})
-    launcher({'model/lm/model_scale': 'medium'})

audiocraft/audiocraft/grids/audiogen/audiogen_pretrained_16khz_eval.py DELETED Viewed

@@ -1,68 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Evaluation with objective metrics for the pretrained AudioGen models.
-This grid takes signature from the training grid and runs evaluation-only stage.
-When running the grid for the first time, please use:
-REGEN=1 dora grid audiogen.audiogen_pretrained_16khz_eval
-and re-use the REGEN=1 option when the grid is changed to force regenerating it.
-Note that you need the proper metrics external libraries setup to use all
-the objective metrics activated in this grid. Refer to the README for more information.
-"""
-import os
-from ..musicgen._explorers import GenerationEvalExplorer
-from ...environment import AudioCraftEnvironment
-from ... import train
-def eval(launcher, batch_size: int = 32):
-    """Schedule one evaluation-only XP with objective metrics.
-    Args:
-        launcher: Launcher on which the evaluation options are bound through `bind`.
-        batch_size (int): Value used for the '+dataset.evaluate.batch_size' option.
-    """
-    # binary for FAD computation: replace this path with your own path
-    fad_opts = {
-        'metrics.fad.tf.bin': '/data/home/jadecopet/local/usr/opt/google-research'
-    }
-    sampling_opts = {'generate.lm.use_sampling': True, 'generate.lm.top_k': 250, 'generate.lm.top_p': 0.}
-    cfg_opts = {'transformer_lm.two_step_cfg': True}
-    eval_opts = {
-        'dset': 'audio/audiocaps_16khz',
-        'solver/audiogen/evaluation': 'objective_eval',
-        'execute_only': 'evaluate',
-        '+dataset.evaluate.batch_size': batch_size,
-        '+metrics.fad.tf.batch_size': 32,
-    }
-    eval_launcher = launcher.bind(eval_opts)
-    eval_launcher.bind_(fad_opts)
-    # base objective metrics
-    eval_launcher(sampling_opts, cfg_opts)
-@GenerationEvalExplorer
-def explorer(launcher):
-    """Schedule the evaluation XPs of the pretrained AudioGen model.
-    When the REGEN environment variable is set, the grid is defined from scratch.
-    Otherwise the XPs are re-launched from the signatures found as symlinks in
-    the grid folder of the dora directory.
-    """
-    slurm_partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
-    launcher.slurm_(gpus=4, partition=slurm_partitions)
-    if 'REGEN' in os.environ:
-        base_launcher = launcher.bind(solver="audiogen/audiogen_base_16khz")
-        base_launcher.bind_({'autocast': False, 'fsdp.use': True})
-        medium_launcher = base_launcher.bind({'continue_from': '//pretrained/facebook/audiogen-medium'})
-        medium_launcher.bind_({'model/lm/model_scale': 'medium'})
-        eval(medium_launcher, batch_size=128)
-        return
-    grid_folder = train.main.dora.dir / 'grids' / __name__.split('.', 2)[-1]
-    with launcher.job_array():
-        for entry in grid_folder.iterdir():
-            # Only symlinked entries are treated as XP signatures.
-            if entry.is_symlink():
-                xp = train.main.get_xp_from_sig(entry.name)
-                launcher(xp.argv)

audiocraft/audiocraft/grids/compression/__init__.py DELETED Viewed

@@ -1,6 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""EnCodec grids."""

audiocraft/audiocraft/grids/compression/_explorers.py DELETED Viewed

@@ -1,55 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-import treetable as tt
-from .._base_explorers import BaseExplorer
-class CompressionExplorer(BaseExplorer):
-    """Explorer defining the meta information and metrics shown for compression grids."""
-    eval_metrics = ["sisnr", "visqol"]
-    def stages(self):
-        """Returns the stage names handled by this explorer."""
-        stage_names = ["train", "valid", "evaluate"]
-        return stage_names
-    def get_grid_meta(self):
-        """Returns the list of Meta information to display for each XP/job.
-        """
-        index_leaf = tt.leaf("index", align=">")
-        name_leaf = tt.leaf("name", wrap=140)
-        state_leaf = tt.leaf("state")
-        sig_leaf = tt.leaf("sig", align=">")
-        return [index_leaf, name_leaf, state_leaf, sig_leaf]
-    def get_grid_metrics(self):
-        """Return the metrics that should be displayed in the tracking table.
-        """
-        train_leaves = [
-            tt.leaf("epoch"),
-            tt.leaf("bandwidth", ".2f"),
-            tt.leaf("adv", ".4f"),
-            tt.leaf("d_loss", ".4f"),
-        ]
-        valid_leaves = [
-            tt.leaf("bandwidth", ".2f"),
-            tt.leaf("adv", ".4f"),
-            tt.leaf("msspec", ".4f"),
-            tt.leaf("sisnr", ".2f"),
-        ]
-        # One leaf per name listed in the `eval_metrics` class attribute.
-        evaluate_leaves = [tt.leaf(name, ".3f") for name in self.eval_metrics]
-        return [
-            tt.group("train", train_leaves, align=">"),
-            tt.group("valid", valid_leaves, align=">"),
-            tt.group("evaluate", evaluate_leaves, align=">"),
-        ]

audiocraft/audiocraft/grids/compression/debug.py DELETED Viewed

@@ -1,31 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Grid search file, simply list all the exp you want in `explorer`.
-Any new exp added there will be scheduled.
-You can cancel an experiment by commenting out its line.
-This grid is a minimal example for debugging compression task
-and how to override parameters directly in a grid.
-Learn more about dora grids: https://github.com/facebookresearch/dora
-"""
-from ._explorers import CompressionExplorer
-from ...environment import AudioCraftEnvironment
-@CompressionExplorer
-def explorer(launcher):
-    """Schedule the compression debug XPs on 2 GPUs: the base config plus one RVQ override."""
-    slurm_partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
-    launcher.slurm_(gpus=2, partition=slurm_partitions)
-    launcher.bind_(solver='compression/debug')
-    rvq_overrides = {'rvq.bins': 2048, 'rvq.n_q': 4}
-    with launcher.job_array():
-        # base debug task using config from solver=compression/debug
-        launcher()
-        # we can override parameters in the grid to launch additional xps
-        launcher(rvq_overrides)

audiocraft/audiocraft/grids/compression/encodec_audiogen_16khz.py DELETED Viewed

@@ -1,29 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Grid search file, simply list all the exp you want in `explorer`.
-Any new exp added there will be scheduled.
-You can cancel an experiment by commenting out its line.
-This grid shows how to train the new AudioGen EnCodec model at 16 kHz.
-"""
-from ._explorers import CompressionExplorer
-from ...environment import AudioCraftEnvironment
-@CompressionExplorer
-def explorer(launcher):
-    """Schedule one XP for the encodec_audiogen_16khz compression solver on 8 GPUs."""
-    slurm_partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
-    launcher.slurm_(gpus=8, partition=slurm_partitions)
-    # use configuration for AudioGen's EnCodec model trained on monophonic audio sampled at 16 kHz
-    # AudioGen's EnCodec is trained with a total stride of 320 leading to a frame rate of 50 hz
-    solver_name = 'compression/encodec_audiogen_16khz'
-    launcher.bind_(solver=solver_name)
-    # replace this by the desired sound dataset
-    dataset_name = 'internal/sounds_16khz'
-    launcher.bind_(dset=dataset_name)
-    # launch xp
-    launcher()

audiocraft/audiocraft/grids/compression/encodec_base_24khz.py DELETED Viewed

@@ -1,28 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Grid search file, simply list all the exp you want in `explorer`.
-Any new exp added there will be scheduled.
-You can cancel an experiment by commenting out its line.
-This grid shows how to train a base causal EnCodec model at 24 kHz.
-"""
-from ._explorers import CompressionExplorer
-from ...environment import AudioCraftEnvironment
-@CompressionExplorer
-def explorer(launcher):
-    """Schedule one XP for the encodec_base_24khz compression solver on 8 GPUs."""
-    slurm_partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
-    launcher.slurm_(gpus=8, partition=slurm_partitions)
-    # base causal EnCodec trained on monophonic audio sampled at 24 kHz
-    solver_name = 'compression/encodec_base_24khz'
-    launcher.bind_(solver=solver_name)
-    # replace this by the desired dataset
-    dataset_name = 'audio/example'
-    launcher.bind_(dset=dataset_name)
-    # launch xp
-    launcher()

audiocraft/audiocraft/grids/compression/encodec_musicgen_32khz.py DELETED Viewed

@@ -1,34 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""
-Grid search file, simply list all the exp you want in `explorer`.
-Any new exp added there will be scheduled.
-You can cancel an experiment by commenting out its line.
-This grid shows how to train a MusicGen EnCodec model at 32 kHz.
-"""
-from ._explorers import CompressionExplorer
-from ...environment import AudioCraftEnvironment
-@CompressionExplorer
-def explorer(launcher):
-    """Schedule two XPs for the encodec_musicgen_32khz solver: the base one and a ViSQOL variant."""
-    slurm_partitions = AudioCraftEnvironment.get_slurm_partitions(['team', 'global'])
-    launcher.slurm_(gpus=8, partition=slurm_partitions)
-    # use configuration for MusicGen's EnCodec model trained on monophonic audio sampled at 32 kHz
-    # MusicGen's EnCodec is trained with a total stride of 640 leading to a frame rate of 50 hz
-    launcher.bind_(solver='compression/encodec_musicgen_32khz')
-    # replace this by the desired music dataset
-    launcher.bind_(dset='internal/music_400k_32khz')
-    visqol_overrides = {
-        'metrics.visqol.bin': '/data/home/jadecopet/local/usr/opt/visqol',
-        'label': 'visqol',
-        'evaluate.metrics.visqol': True
-    }
-    # launch xp
-    launcher()
-    launcher(visqol_overrides)