DeepLearning101 committed on
Commit 109bb65
1 Parent(s): 2917403

Upload 17 files

denoiser/__init__.py ADDED
@@ -0,0 +1,5 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
denoiser/audio.py ADDED
@@ -0,0 +1,89 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ # author: adefossez
+
+ import json
+ from pathlib import Path
+ import math
+ import os
+ import sys
+
+ # NOTE: this file targets the legacy (pre-0.8) torchaudio API, where
+ # `info` returns a signal-info tuple and `load` takes `offset`/`num_frames`.
+ import torchaudio
+ from torch.nn import functional as F
+
+
+ def find_audio_files(path, exts=[".wav"], progress=True):
+     audio_files = []
+     for root, folders, files in os.walk(path, followlinks=True):
+         for file in files:
+             file = Path(root) / file
+             if file.suffix.lower() in exts:
+                 audio_files.append(str(file.resolve()))
+     meta = []
+     for idx, file in enumerate(audio_files):
+         siginfo, _ = torchaudio.info(file)
+         length = siginfo.length // siginfo.channels
+         meta.append((file, length))
+         if progress:
+             print(format((1 + idx) / len(audio_files), " 3.1%"), end='\r', file=sys.stderr)
+     meta.sort()
+     return meta
+
+
+ class Audioset:
+     def __init__(self, files=None, length=None, stride=None,
+                  pad=True, with_path=False, sample_rate=None):
+         """
+         files should be a list [(file, length)]
+         """
+         self.files = files
+         self.num_examples = []
+         self.length = length
+         self.stride = stride or length
+         self.with_path = with_path
+         self.sample_rate = sample_rate
+         for file, file_length in self.files:
+             if length is None:
+                 examples = 1
+             elif file_length < length:
+                 examples = 1 if pad else 0
+             elif pad:
+                 examples = int(math.ceil((file_length - self.length) / self.stride) + 1)
+             else:
+                 examples = (file_length - self.length) // self.stride + 1
+             self.num_examples.append(examples)
+
+     def __len__(self):
+         return sum(self.num_examples)
+
+     def __getitem__(self, index):
+         for (file, _), examples in zip(self.files, self.num_examples):
+             if index >= examples:
+                 index -= examples
+                 continue
+             num_frames = 0
+             offset = 0
+             if self.length is not None:
+                 offset = self.stride * index
+                 num_frames = self.length
+             out, sr = torchaudio.load(str(file), offset=offset, num_frames=num_frames)
+             if self.sample_rate is not None:
+                 if sr != self.sample_rate:
+                     raise RuntimeError(f"Expected {file} to have sample rate of "
+                                        f"{self.sample_rate}, but got {sr}")
+             if num_frames:
+                 out = F.pad(out, (0, num_frames - out.shape[-1]))
+             if self.with_path:
+                 return out, file
+             else:
+                 return out
+
+
+ if __name__ == "__main__":
+     meta = []
+     for path in sys.argv[1:]:
+         meta += find_audio_files(path)
+     json.dump(meta, sys.stdout, indent=4)
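
A minimal usage sketch (not part of the commit; the directory path and the 16 kHz rate are hypothetical): build the `[(file, length)]` metadata with `find_audio_files`, then serve fixed 4-second windows through `Audioset`.

from denoiser.audio import find_audio_files, Audioset

meta = find_audio_files("data/train/noisy")   # hypothetical folder of .wav files
dataset = Audioset(meta, length=4 * 16_000, stride=16_000, sample_rate=16_000)
print(len(dataset))       # total number of 4 s windows across all files
example = dataset[0]      # tensor [channels, 64000], zero-padded at file ends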
denoiser/augment.py ADDED
@@ -0,0 +1,191 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ # author: adefossez
+
+ import random
+
+ import torch as th
+ from torch import nn
+ from torch.nn import functional as F
+
+ from . import dsp
+
+
+ class Remix(nn.Module):
+     """Remix.
+     Mixes different noises with clean speech within a given batch.
+     """
+
+     def forward(self, sources):
+         noise, clean = sources
+         bs, *other = noise.shape
+         device = noise.device
+         perm = th.argsort(th.rand(bs, device=device), dim=0)
+         return th.stack([noise[perm], clean])
+
+
+ class RevEcho(nn.Module):
+     """
+     Hacky reverb that runs on GPU without slowing down training.
+     This reverb adds a succession of attenuated echoes of the input
+     signal to itself. Intuitively, the delay of the first echo will happen
+     after roughly 2x the radius of the room and is controlled by `first_delay`.
+     Then RevEcho keeps adding echoes with the same delay and further attenuation
+     until the amplitude ratio between the last and first echo is 1e-3.
+     The attenuation factor and the number of echoes to add are controlled
+     by RT60 (measured in seconds). RT60 is the average time to get to -60dB
+     (remember volume is measured over the squared amplitude so this matches
+     the 1e-3 ratio).
+
+     At each call to RevEcho, `first_delay`, `initial` and `RT60` are
+     sampled from their range. Then, to prevent this reverb from being too regular,
+     the delay time is resampled uniformly within `first_delay +- 10%`,
+     as controlled by the `jitter` parameter. Finally, for a denser reverb,
+     multiple trains of echoes are added with different jitter noises.
+
+     Args:
+         - proba: probability of applying the reverb to a given batch.
+         - initial: amplitude of the first echo as a fraction
+             of the input signal. For each sample, actually sampled from
+             `[0, initial]`. Larger values mean louder reverb. Physically,
+             this would depend on the absorption of the room walls.
+         - rt60: range of values to sample the RT60 in seconds, i.e.
+             after RT60 seconds, the echo amplitude is 1e-3 of the first echo.
+             The default values follow the recommendations of
+             https://arxiv.org/ftp/arxiv/papers/2001/2001.08662.pdf, Section 2.4.
+             Physically this would also be related to the absorption of the
+             room walls and there is likely a relation between `RT60` and
+             `initial`, which we ignore here.
+         - first_delay: range of values to sample the first echo delay in seconds.
+             The default values are equivalent to sampling a room of 3 to 10 meters.
+         - repeat: how many trains of echoes with different jitters to add.
+             Higher values mean a denser reverb.
+         - jitter: jitter used to make each repetition of the reverb echo train
+             slightly different. For instance a jitter of 0.1 means
+             the delay between two echoes will be in the range `first_delay +- 10%`,
+             with the jittering noise being resampled after each single echo.
+         - keep_clean: fraction of the reverb of the clean speech to add back
+             to the ground truth. 0 = dereverberation, 1 = no dereverberation.
+         - sample_rate: sample rate of the input signals.
+     """
+
+     def __init__(self, proba=0.5, initial=0.3, rt60=(0.3, 1.3), first_delay=(0.01, 0.03),
+                  repeat=3, jitter=0.1, keep_clean=0.1, sample_rate=16000):
+         super().__init__()
+         self.proba = proba
+         self.initial = initial
+         self.rt60 = rt60
+         self.first_delay = first_delay
+         self.repeat = repeat
+         self.jitter = jitter
+         self.keep_clean = keep_clean
+         self.sample_rate = sample_rate
+
+     def _reverb(self, source, initial, first_delay, rt60):
+         """
+         Return the reverb for a single source.
+         """
+         length = source.shape[-1]
+         reverb = th.zeros_like(source)
+         for _ in range(self.repeat):
+             frac = 1  # what fraction of the first echo amplitude is still here
+             echo = initial * source
+             while frac > 1e-3:
+                 # First jitter noise for the delay
+                 jitter = 1 + self.jitter * random.uniform(-1, 1)
+                 delay = min(
+                     1 + int(jitter * first_delay * self.sample_rate),
+                     length)
+                 # Delay the echo in time by padding with zero on the left
+                 echo = F.pad(echo[:, :, :-delay], (delay, 0))
+                 reverb += echo
+
+                 # Second jitter noise for the attenuation
+                 jitter = 1 + self.jitter * random.uniform(-1, 1)
+                 # We want the per-echo attenuation `d` to satisfy
+                 # d ** (rt60 / first_delay) = 1e-3, i.e. log10(d) = -3 * first_delay / rt60
+                 attenuation = 10**(-3 * jitter * first_delay / rt60)
+                 echo *= attenuation
+                 frac *= attenuation
+         return reverb
+
+     def forward(self, wav):
+         if random.random() >= self.proba:
+             return wav
+         noise, clean = wav
+         # Sample characteristics for the reverb
+         initial = random.random() * self.initial
+         first_delay = random.uniform(*self.first_delay)
+         rt60 = random.uniform(*self.rt60)
+
+         reverb_noise = self._reverb(noise, initial, first_delay, rt60)
+         # Reverb for the noise is always added back to the noise
+         noise += reverb_noise
+         reverb_clean = self._reverb(clean, initial, first_delay, rt60)
+         # Split the clean reverb between the clean speech and the noise
+         clean += self.keep_clean * reverb_clean
+         noise += (1 - self.keep_clean) * reverb_clean
+
+         return th.stack([noise, clean])
+
+
+ class BandMask(nn.Module):
+     """BandMask.
+     Masks bands of frequencies. Similar to Park, Daniel S., et al.
+     "SpecAugment: A simple data augmentation method for automatic speech recognition."
+     (https://arxiv.org/pdf/1904.08779.pdf) but over the waveform.
+     """
+
+     def __init__(self, maxwidth=0.2, bands=120, sample_rate=16_000):
+         """__init__.
+
+         :param maxwidth: the maximum width to remove, as a fraction of the number of bands
+         :param bands: number of mel-spaced bands
+         :param sample_rate: signal sample rate
+         """
+         super().__init__()
+         self.maxwidth = maxwidth
+         self.bands = bands
+         self.sample_rate = sample_rate
+
+     def forward(self, wav):
+         bands = self.bands
+         bandwidth = int(abs(self.maxwidth) * bands)
+         mels = dsp.mel_frequencies(bands, 40, self.sample_rate/2) / self.sample_rate
+         low = random.randrange(bands)
+         high = random.randrange(low, min(bands, low + bandwidth))
+         filters = dsp.LowPassFilters([mels[low], mels[high]]).to(wav.device)
+         low, midlow = filters(wav)
+         # band-pass filtering: subtract the band between the two cutoffs
+         out = wav - midlow + low
+         return out
+
+
+ class Shift(nn.Module):
+     """Shift."""
+
+     def __init__(self, shift=8192, same=False):
+         """__init__.
+
+         :param shift: randomly shift the signals by up to this many samples
+         :param same: shift both clean and noisy signals by the same amount
+         """
+         super().__init__()
+         self.shift = shift
+         self.same = same
+
+     def forward(self, wav):
+         sources, batch, channels, length = wav.shape
+         length = length - self.shift
+         if self.shift > 0:
+             if not self.training:
+                 wav = wav[..., :length]
+             else:
+                 offsets = th.randint(
+                     self.shift,
+                     [1 if self.same else sources, batch, 1, 1], device=wav.device)
+                 offsets = offsets.expand(sources, -1, channels, -1)
+                 indexes = th.arange(length, device=wav.device)
+                 wav = wav.gather(3, indexes + offsets)
+         return wav
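
A hedged sketch of how these modules chain at training time (the ordering, batch shapes and the stacked `[noise, clean]` layout are assumptions based on what the classes expect, not taken from this commit):

import torch as th
from denoiser.augment import Remix, BandMask, Shift, RevEcho

batch, channels, length = 4, 1, 4 * 16_000
noise = th.randn(batch, channels, length)
clean = th.randn(batch, channels, length)

augment = th.nn.Sequential(Remix(), BandMask(), Shift(shift=8192), RevEcho())
sources = augment(th.stack([noise, clean]))   # [2, batch, channels, length - 8192]
noise, clean = sources
noisy = noise + clean                         # reassemble the augmented mixture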
denoiser/conv_demucs.py ADDED
@@ -0,0 +1,661 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ # author: adefossez
+
+ import math
+ import time
+
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+
+ from .resample import downsample2, upsample2
+ from .utils import capture_init
+
+
+ # The BLSTM from the original Demucs is kept below for reference; this
+ # variant replaces it with a TemporalConvNet (Conv-TasNet style) separator.
+ # class BLSTM(nn.Module):
+ #     def __init__(self, dim, layers=2, bi=True):
+ #         super().__init__()
+ #         klass = nn.LSTM
+ #         self.lstm = klass(bidirectional=bi, num_layers=layers, hidden_size=dim, input_size=dim)
+ #         self.linear = None
+ #         if bi:
+ #             self.linear = nn.Linear(2 * dim, dim)
+ #
+ #     def forward(self, x, hidden=None):
+ #         x, hidden = self.lstm(x, hidden)
+ #         if self.linear:
+ #             x = self.linear(x)
+ #         return x, hidden
+
+
+ EPS = 1e-8
+
+
+ class Chomp1d(nn.Module):
+     """Chops off trailing padding to ensure the output length matches the input."""
+     def __init__(self, chomp_size):
+         super(Chomp1d, self).__init__()
+         self.chomp_size = chomp_size
+
+     def forward(self, x):
+         """
+         Args:
+             x: [M, H, Kpad]
+         Returns:
+             [M, H, K]
+         """
+         return x[:, :, :-self.chomp_size].contiguous()
+
+
+ def chose_norm(norm_type, channel_size):
+     """The input of normalization will be (M, C, K), where M is batch size,
+     C is channel size and K is sequence length.
+     """
+     if norm_type == "gLN":
+         return GlobalLayerNorm(channel_size)
+     elif norm_type == "cLN":
+         return ChannelwiseLayerNorm(channel_size)
+     else:  # norm_type == "BN"
+         # Given input (M, C, K), nn.BatchNorm1d(C) will accumulate statistics
+         # along M and K, so this BN usage is right.
+         return nn.BatchNorm1d(channel_size)
+
+
+ class ChannelwiseLayerNorm(nn.Module):
+     """Channel-wise Layer Normalization (cLN)"""
+     def __init__(self, channel_size):
+         super(ChannelwiseLayerNorm, self).__init__()
+         self.gamma = nn.Parameter(torch.Tensor(1, channel_size, 1))  # [1, N, 1]
+         self.beta = nn.Parameter(torch.Tensor(1, channel_size, 1))  # [1, N, 1]
+         self.reset_parameters()
+
+     def reset_parameters(self):
+         self.gamma.data.fill_(1)
+         self.beta.data.zero_()
+
+     def forward(self, y):
+         """
+         Args:
+             y: [M, N, K], M is batch size, N is channel size, K is length
+         Returns:
+             cLN_y: [M, N, K]
+         """
+         mean = torch.mean(y, dim=1, keepdim=True)  # [M, 1, K]
+         var = torch.var(y, dim=1, keepdim=True, unbiased=False)  # [M, 1, K]
+         cLN_y = self.gamma * (y - mean) / torch.pow(var + EPS, 0.5) + self.beta
+         return cLN_y
+
+
+ class DepthwiseSeparableConv(nn.Module):
+     def __init__(self, in_channels, out_channels, kernel_size,
+                  stride, padding, dilation, norm_type="gLN", causal=False):
+         super(DepthwiseSeparableConv, self).__init__()
+         # Use the `groups` option to implement depthwise convolution
+         # [M, H, K] -> [M, H, K]
+         depthwise_conv = nn.Conv1d(in_channels, in_channels, kernel_size,
+                                    stride=stride, padding=padding,
+                                    dilation=dilation, groups=in_channels,
+                                    bias=False)
+         if causal:
+             chomp = Chomp1d(padding)
+         prelu = nn.PReLU()
+         norm = chose_norm(norm_type, in_channels)
+         # [M, H, K] -> [M, B, K]
+         pointwise_conv = nn.Conv1d(in_channels, out_channels, 1, bias=False)
+         # Put together
+         if causal:
+             self.net = nn.Sequential(depthwise_conv, chomp, prelu, norm,
+                                      pointwise_conv)
+         else:
+             self.net = nn.Sequential(depthwise_conv, prelu, norm,
+                                      pointwise_conv)
+
+     def forward(self, x):
+         """
+         Args:
+             x: [M, H, K]
+         Returns:
+             result: [M, B, K]
+         """
+         return self.net(x)
+
+
+ class TemporalBlock(nn.Module):
+     def __init__(self, in_channels, out_channels, kernel_size,
+                  stride, padding, dilation, norm_type="gLN", causal=False):
+         super(TemporalBlock, self).__init__()
+         # [M, B, K] -> [M, H, K]
+         conv1x1 = nn.Conv1d(in_channels, out_channels, 1, bias=False)
+         prelu = nn.PReLU()
+         norm = chose_norm(norm_type, out_channels)
+         # [M, H, K] -> [M, B, K]
+         dsconv = DepthwiseSeparableConv(out_channels, in_channels, kernel_size,
+                                         stride, padding, dilation, norm_type,
+                                         causal)
+         # Put together
+         self.net = nn.Sequential(conv1x1, prelu, norm, dsconv)
+
+     def forward(self, x):
+         """
+         Args:
+             x: [M, B, K]
+         Returns:
+             [M, B, K]
+         """
+         residual = x
+         out = self.net(x)
+         # TODO: P = 3 works fine here, but P = 2 may need extra padding?
+         return out + residual  # looks like w/o F.relu is better than w/ F.relu
+         # return F.relu(out + residual)
+
+
+ class GlobalLayerNorm(nn.Module):
+     """Global Layer Normalization (gLN)"""
+     def __init__(self, channel_size):
+         super(GlobalLayerNorm, self).__init__()
+         self.gamma = nn.Parameter(torch.Tensor(1, channel_size, 1))  # [1, N, 1]
+         self.beta = nn.Parameter(torch.Tensor(1, channel_size, 1))  # [1, N, 1]
+         self.reset_parameters()
+
+     def reset_parameters(self):
+         self.gamma.data.fill_(1)
+         self.beta.data.zero_()
+
+     def forward(self, y):
+         """
+         Args:
+             y: [M, N, K], M is batch size, N is channel size, K is length
+         Returns:
+             gLN_y: [M, N, K]
+         """
+         # TODO: since torch 1.0, torch.mean() supports a dim list
+         mean = y.mean(dim=1, keepdim=True).mean(dim=2, keepdim=True)  # [M, 1, 1]
+         var = (torch.pow(y - mean, 2)).mean(dim=1, keepdim=True).mean(dim=2, keepdim=True)
+         gLN_y = self.gamma * (y - mean) / torch.pow(var + EPS, 0.5) + self.beta
+         return gLN_y
+
+
+ class TemporalConvNet(nn.Module):
+     def __init__(self, N=768, B=256, H=512, P=3, X=8, R=4, C=1, norm_type="gLN", causal=1,
+                  mask_nonlinear='relu'):
+         """
+         Args:
+             N: Number of filters in autoencoder
+             B: Number of channels in bottleneck 1x1-conv block
+             H: Number of channels in convolutional blocks
+             P: Kernel size in convolutional blocks
+             X: Number of convolutional blocks in each repeat
+             R: Number of repeats
+             C: Number of speakers
+             norm_type: BN, gLN or cLN
+             causal: causal or non-causal
+             mask_nonlinear: non-linearity used to generate the mask
+         """
+         super(TemporalConvNet, self).__init__()
+         # Hyper-parameters
+         self.C = C
+         self.mask_nonlinear = mask_nonlinear
+         # Components
+         # [M, N, K] -> [M, N, K]
+         layer_norm = ChannelwiseLayerNorm(N)
+         # [M, N, K] -> [M, B, K]
+         bottleneck_conv1x1 = nn.Conv1d(N, B, 1, bias=False)
+         # [M, B, K] -> [M, B, K]
+         repeats = []
+         for r in range(R):
+             blocks = []
+             for x in range(X):
+                 dilation = 2**x
+                 padding = (P - 1) * dilation if causal else (P - 1) * dilation // 2
+                 blocks += [TemporalBlock(B, H, P, stride=1,
+                                          padding=padding,
+                                          dilation=dilation,
+                                          norm_type=norm_type,
+                                          causal=causal)]
+             repeats += [nn.Sequential(*blocks)]
+         temporal_conv_net = nn.Sequential(*repeats)
+         # [M, B, K] -> [M, C*N, K]
+         mask_conv1x1 = nn.Conv1d(B, C * N, 1, bias=False)
+         # Put together
+         self.network = nn.Sequential(layer_norm,
+                                      bottleneck_conv1x1,
+                                      temporal_conv_net,
+                                      mask_conv1x1)
+
+     def forward(self, mixture_w):
+         """
+         Keep this API the same as TasNet.
+         Args:
+             mixture_w: [M, N, K], M is batch size
+         Returns:
+             est_mask: [M, C, N, K]
+         """
+         M, N, K = mixture_w.size()
+         score = self.network(mixture_w)  # [M, N, K] -> [M, C*N, K]
+         score = score.view(M, self.C, N, K)  # [M, C*N, K] -> [M, C, N, K]
+         if self.mask_nonlinear == 'softmax':
+             est_mask = F.softmax(score, dim=1)
+             est_mask = est_mask.squeeze(1)
+         elif self.mask_nonlinear == 'relu':
+             est_mask = F.relu(score)
+             est_mask = est_mask.squeeze(1)
+         else:
+             raise ValueError("Unsupported mask non-linear function")
+         return est_mask
+
+
+ def rescale_conv(conv, reference):
+     std = conv.weight.std().detach()
+     scale = (std / reference)**0.5
+     conv.weight.data /= scale
+     if conv.bias is not None:
+         conv.bias.data /= scale
+
+
+ def rescale_module(module, reference):
+     for sub in module.modules():
+         if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d)):
+             rescale_conv(sub, reference)
+
+
+ class Demucs(nn.Module):
+     """
+     Demucs speech enhancement model.
+     Args:
+         - chin (int): number of input channels.
+         - chout (int): number of output channels.
+         - hidden (int): number of initial hidden channels.
+         - depth (int): number of layers.
+         - kernel_size (int): kernel size for each layer.
+         - stride (int): stride for each layer.
+         - causal (bool): if false, uses BiLSTM instead of LSTM.
+         - resample (int): amount of resampling to apply to the input/output.
+             Can be one of 1, 2 or 4.
+         - growth (float): number of channels is multiplied by this for every layer.
+         - max_hidden (int): maximum number of channels. Can be useful to
+             control the size/speed of the model.
+         - normalize (bool): if true, normalize the input.
+         - glu (bool): if true uses GLU instead of ReLU in 1x1 convolutions.
+         - rescale (float): controls custom weight initialization.
+             See https://arxiv.org/abs/1911.13254.
+         - floor (float): stability flooring when normalizing.
+     """
+     @capture_init
+     def __init__(self,
+                  chin=1,
+                  chout=1,
+                  hidden=48,
+                  depth=5,
+                  kernel_size=8,
+                  stride=4,
+                  causal=True,
+                  resample=4,
+                  growth=2,
+                  max_hidden=10_000,
+                  normalize=True,
+                  glu=True,
+                  rescale=0.1,
+                  floor=1e-3):
+
+         super().__init__()
+         if resample not in [1, 2, 4]:
+             raise ValueError("Resample should be 1, 2 or 4.")
+
+         self.chin = chin
+         self.chout = chout
+         self.hidden = hidden
+         self.depth = depth
+         self.kernel_size = kernel_size
+         self.stride = stride
+         self.causal = causal
+         self.floor = floor
+         self.resample = resample
+         self.normalize = normalize
+
+         self.encoder = nn.ModuleList()
+         self.decoder = nn.ModuleList()
+         activation = nn.GLU(1) if glu else nn.ReLU()
+         ch_scale = 2 if glu else 1
+
+         for index in range(depth):
+             encode = []
+             encode += [
+                 nn.Conv1d(chin, hidden, kernel_size, stride),
+                 nn.ReLU(),
+                 nn.Conv1d(hidden, hidden * ch_scale, 1), activation,
+             ]
+             self.encoder.append(nn.Sequential(*encode))
+
+             decode = []
+             decode += [
+                 nn.Conv1d(hidden, ch_scale * hidden, 1), activation,
+                 nn.ConvTranspose1d(hidden, chout, kernel_size, stride),
+             ]
+             if index > 0:
+                 decode.append(nn.ReLU())
+             self.decoder.insert(0, nn.Sequential(*decode))
+             chout = hidden
+             chin = hidden
+             hidden = min(int(growth * hidden), max_hidden)
+         self.separator = TemporalConvNet(N=chout)
+         # self.lstm = BLSTM(chin, bi=not causal)
+         if rescale:
+             rescale_module(self, reference=rescale)
+
+     def valid_length(self, length):
+         """
+         Return the nearest valid length to use with the model so that
+         there are no time steps left over in the convolutions, i.e. for all
+         layers, (input length - kernel_size) % stride = 0.
+
+         If the mixture has a valid length, the estimated sources
+         will have exactly the same length.
+         """
+         length = math.ceil(length * self.resample)
+         for idx in range(self.depth):
+             length = math.ceil((length - self.kernel_size) / self.stride) + 1
+             length = max(length, 1)
+         for idx in range(self.depth):
+             length = (length - 1) * self.stride + self.kernel_size
+         length = int(math.ceil(length / self.resample))
+         return int(length)
+
+     @property
+     def total_stride(self):
+         return self.stride ** self.depth // self.resample
+
+     def forward(self, mix):
+         if mix.dim() == 2:
+             mix = mix.unsqueeze(1)
+
+         if self.normalize:
+             mono = mix.mean(dim=1, keepdim=True)
+             std = mono.std(dim=-1, keepdim=True)
+             mix = mix / (self.floor + std)
+         else:
+             std = 1
+         length = mix.shape[-1]
+         x = mix
+         x = F.pad(x, (0, self.valid_length(length) - length))
+         if self.resample == 2:
+             x = upsample2(x)
+         elif self.resample == 4:
+             x = upsample2(x)
+             x = upsample2(x)
+         skips = []
+         for encode in self.encoder:
+             x = encode(x)
+             skips.append(x)
+         x = self.separator(x)
+         # The LSTM variant instead did:
+         # x = x.permute(2, 0, 1)
+         # x, _ = self.lstm(x)
+         # x = x.permute(1, 2, 0)
+         for decode in self.decoder:
+             skip = skips.pop(-1)
+             x = x + skip[..., :x.shape[-1]]
+             x = decode(x)
+         if self.resample == 2:
+             x = downsample2(x)
+         elif self.resample == 4:
+             x = downsample2(x)
+             x = downsample2(x)
+
+         x = x[..., :length]
+         return std * x
+
+
+ def fast_conv(conv, x):
+     """
+     Faster convolution evaluation if either kernel size is 1
+     or length of sequence is 1.
+     """
+     batch, chin, length = x.shape
+     chout, chin, kernel = conv.weight.shape
+     assert batch == 1
+     if kernel == 1:
+         x = x.view(chin, length)
+         out = torch.addmm(conv.bias.view(-1, 1),
+                           conv.weight.view(chout, chin), x)
+     elif length == kernel:
+         x = x.view(chin * kernel, 1)
+         out = torch.addmm(conv.bias.view(-1, 1),
+                           conv.weight.view(chout, chin * kernel), x)
+     else:
+         out = conv(x)
+     return out.view(batch, chout, -1)
+
+
+ class DemucsStreamer:
+     """
+     Streaming implementation for Demucs. It supports being fed with any amount
+     of audio at a time. You will get back as much audio as possible at that
+     point.
+
+     Args:
+         - demucs (Demucs): Demucs model.
+         - dry (float): amount of dry (i.e. input) signal to keep. 0 is maximum
+             noise removal, 1 just returns the input signal. Small values > 0
+             help limit distortions.
+         - num_frames (int): number of frames to process at once. Higher values
+             will increase overall latency but improve the real time factor.
+         - resample_lookahead (int): extra lookahead used for the resampling.
+         - resample_buffer (int): size of the buffer of previous inputs/outputs
+             kept for resampling.
+     """
+     def __init__(self, demucs,
+                  dry=0,
+                  num_frames=1,
+                  resample_lookahead=64,
+                  resample_buffer=256):
+         device = next(iter(demucs.parameters())).device
+         self.demucs = demucs
+         self.lstm_state = None
+         self.conv_state = None
+         self.dry = dry
+         self.resample_lookahead = resample_lookahead
+         self.resample_buffer = resample_buffer
+         self.frame_length = demucs.valid_length(1) + demucs.total_stride * (num_frames - 1)
+         self.total_length = self.frame_length + self.resample_lookahead
+         self.stride = demucs.total_stride * num_frames
+         self.resample_in = torch.zeros(demucs.chin, resample_buffer, device=device)
+         self.resample_out = torch.zeros(demucs.chin, resample_buffer, device=device)
+
+         self.frames = 0
+         self.total_time = 0
+         self.variance = 0
+         self.pending = torch.zeros(demucs.chin, 0, device=device)
+
+         bias = demucs.decoder[0][2].bias
+         weight = demucs.decoder[0][2].weight
+         chin, chout, kernel = weight.shape
+         self._bias = bias.view(-1, 1).repeat(1, kernel).view(-1, 1)
+         self._weight = weight.permute(1, 2, 0).contiguous()
+
+     def reset_time_per_frame(self):
+         self.total_time = 0
+         self.frames = 0
+
+     @property
+     def time_per_frame(self):
+         return self.total_time / self.frames
+
+     def flush(self):
+         """
+         Flush remaining audio by padding it with zero. Call this
+         when you have no more input and want to get back the last chunk of audio.
+         """
+         pending_length = self.pending.shape[1]
+         padding = torch.zeros(self.demucs.chin, self.total_length, device=self.pending.device)
+         out = self.feed(padding)
+         return out[:, :pending_length]
+
+     def feed(self, wav):
+         """
+         Apply the model to `wav` using true real-time evaluation.
+         Normalization and resampling are done online.
+         """
+         begin = time.time()
+         demucs = self.demucs
+         resample_buffer = self.resample_buffer
+         stride = self.stride
+         resample = demucs.resample
+
+         if wav.dim() != 2:
+             raise ValueError("input wav should be two dimensional.")
+         chin, _ = wav.shape
+         if chin != demucs.chin:
+             raise ValueError(f"Expected {demucs.chin} channels, got {chin}")
+
+         self.pending = torch.cat([self.pending, wav], dim=1)
+         outs = []
+         while self.pending.shape[1] >= self.total_length:
+             self.frames += 1
+             frame = self.pending[:, :self.total_length]
+             dry_signal = frame[:, :stride]
+             if demucs.normalize:
+                 mono = frame.mean(0)
+                 variance = (mono**2).mean()
+                 self.variance = variance / self.frames + (1 - 1 / self.frames) * self.variance
+                 frame = frame / (demucs.floor + math.sqrt(self.variance))
+             frame = torch.cat([self.resample_in, frame], dim=-1)
+             self.resample_in[:] = frame[:, stride - resample_buffer:stride]
+
+             if resample == 4:
+                 frame = upsample2(upsample2(frame))
+             elif resample == 2:
+                 frame = upsample2(frame)
+             frame = frame[:, resample * resample_buffer:]  # remove pre-sampling buffer
+             frame = frame[:, :resample * self.frame_length]  # remove extra samples after window
+
+             out, extra = self._separate_frame(frame)
+             padded_out = torch.cat([self.resample_out, out, extra], 1)
+             self.resample_out[:] = out[:, -resample_buffer:]
+             if resample == 4:
+                 out = downsample2(downsample2(padded_out))
+             elif resample == 2:
+                 out = downsample2(padded_out)
+             else:
+                 out = padded_out
+
+             out = out[:, resample_buffer // resample:]
+             out = out[:, :stride]
+
+             if demucs.normalize:
+                 out *= math.sqrt(self.variance)
+             out = self.dry * dry_signal + (1 - self.dry) * out
+             outs.append(out)
+             self.pending = self.pending[:, stride:]
+
+         self.total_time += time.time() - begin
+         if outs:
+             out = torch.cat(outs, 1)
+         else:
+             out = torch.zeros(chin, 0, device=wav.device)
+         return out
+
+     def _separate_frame(self, frame):
+         demucs = self.demucs
+         skips = []
+         next_state = []
+         first = self.conv_state is None
+         stride = self.stride * demucs.resample
+         x = frame[None]
+         for idx, encode in enumerate(demucs.encoder):
+             stride //= demucs.stride
+             length = x.shape[2]
+             if idx == demucs.depth - 1:
+                 # This is slightly faster for the last conv
+                 x = fast_conv(encode[0], x)
+                 x = encode[1](x)
+                 x = fast_conv(encode[2], x)
+                 x = encode[3](x)
+             else:
+                 if not first:
+                     prev = self.conv_state.pop(0)
+                     prev = prev[..., stride:]
+                     tgt = (length - demucs.kernel_size) // demucs.stride + 1
+                     missing = tgt - prev.shape[-1]
+                     offset = length - demucs.kernel_size - demucs.stride * (missing - 1)
+                     x = x[..., offset:]
+                 x = encode[1](encode[0](x))
+                 x = fast_conv(encode[2], x)
+                 x = encode[3](x)
+                 if not first:
+                     x = torch.cat([prev, x], -1)
+                 next_state.append(x)
+             skips.append(x)
+
+         x = x.permute(2, 0, 1)
+         # NOTE: this streaming path still references `demucs.lstm`, but this
+         # variant replaced the LSTM with `self.separator`; as committed, the
+         # line below raises AttributeError, so only the offline forward works.
+         x, self.lstm_state = demucs.lstm(x, self.lstm_state)
+         x = x.permute(1, 2, 0)
+         # In the following, x contains only correct samples, i.e. the ones
+         # for which each time position is covered by two windows of the upper layer.
+         # extra contains extra samples to the right, and is used only as
+         # better padding for the online resampling.
+         extra = None
+         for idx, decode in enumerate(demucs.decoder):
+             skip = skips.pop(-1)
+             x += skip[..., :x.shape[-1]]
+             x = fast_conv(decode[0], x)
+             x = decode[1](x)
+
+             if extra is not None:
+                 skip = skip[..., x.shape[-1]:]
+                 extra += skip[..., :extra.shape[-1]]
+                 extra = decode[2](decode[1](decode[0](extra)))
+             x = decode[2](x)
+             next_state.append(x[..., -demucs.stride:] - decode[2].bias.view(-1, 1))
+             if extra is None:
+                 extra = x[..., -demucs.stride:]
+             else:
+                 extra[..., :demucs.stride] += next_state[-1]
+             x = x[..., :-demucs.stride]
+
+             if not first:
+                 prev = self.conv_state.pop(0)
+                 x[..., :demucs.stride] += prev
+             if idx != demucs.depth - 1:
+                 x = decode[3](x)
+                 extra = decode[3](extra)
+         self.conv_state = next_state
+         return x[0], extra[0]
+
+
+ def test():
+     import argparse
+     parser = argparse.ArgumentParser(
+         "denoiser.demucs",
+         description="Benchmark the streaming Demucs implementation, "
+                     "as well as checking the delta with the offline implementation.")
+     parser.add_argument("--resample", default=4, type=int)
+     parser.add_argument("--hidden", default=48, type=int)
+     parser.add_argument("--device", default="cpu")
+     parser.add_argument("-t", "--num_threads", type=int)
+     parser.add_argument("-f", "--num_frames", type=int, default=1)
+     args = parser.parse_args()
+     if args.num_threads:
+         torch.set_num_threads(args.num_threads)
+     sr = 16_000
+     sr_ms = sr / 1000
+     demucs = Demucs(hidden=args.hidden, resample=args.resample).to(args.device)
+     x = torch.randn(1, sr * 4).to(args.device)
+     out = demucs(x[None])[0]
+     streamer = DemucsStreamer(demucs, num_frames=args.num_frames)
+     out_rt = []
+     frame_size = streamer.total_length
+     with torch.no_grad():
+         while x.shape[1] > 0:
+             out_rt.append(streamer.feed(x[:, :frame_size]))
+             x = x[:, frame_size:]
+             frame_size = streamer.demucs.total_stride
+     out_rt.append(streamer.flush())
+     out_rt = torch.cat(out_rt, 1)
+     print(f"total lag: {streamer.total_length / sr_ms:.1f}ms, ", end='')
+     print(f"stride: {streamer.stride / sr_ms:.1f}ms, ", end='')
+     print(f"time per frame: {1000 * streamer.time_per_frame:.1f}ms, ", end='')
+     print(f"delta: {torch.norm(out - out_rt) / torch.norm(out):.2%}, ", end='')
+     print(f"RTF: {((1000 * streamer.time_per_frame) / (streamer.stride / sr_ms)):.1f}")
+
+
+ if __name__ == "__main__":
+     test()
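
A quick smoke-test sketch for this variant (shapes are hypothetical): the offline `forward` path pads to `valid_length` internally and returns audio of the input length; only the streaming path is affected by the missing LSTM noted above.

import torch
from denoiser.conv_demucs import Demucs

model = Demucs(hidden=48).eval()
mix = torch.randn(1, 1, 16_000)       # [batch, channels, time]
with torch.no_grad():
    out = model(mix)
print(out.shape)                      # torch.Size([1, 1, 16000])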
denoiser/data.py ADDED
@@ -0,0 +1,99 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ # author: adefossez and adiyoss
+
+ import json
+ import logging
+ import os
+ import re
+
+ from .audio import Audioset
+
+ logger = logging.getLogger(__name__)
+
+
+ def match_dns(noisy, clean):
+     """match_dns.
+     Match noisy and clean DNS dataset filenames.
+
+     :param noisy: list of the noisy filenames
+     :param clean: list of the clean filenames
+     """
+     logger.debug("Matching noisy and clean for dns dataset")
+     noisydict = {}
+     extra_noisy = []
+     for path, size in noisy:
+         match = re.search(r'fileid_(\d+)\.wav$', path)
+         if match is None:
+             # maybe we are mixing some other dataset in
+             extra_noisy.append((path, size))
+         else:
+             noisydict[match.group(1)] = (path, size)
+     noisy[:] = []
+     extra_clean = []
+     copied = list(clean)
+     clean[:] = []
+     for path, size in copied:
+         match = re.search(r'fileid_(\d+)\.wav$', path)
+         if match is None:
+             extra_clean.append((path, size))
+         else:
+             noisy.append(noisydict[match.group(1)])
+             clean.append((path, size))
+     extra_noisy.sort()
+     extra_clean.sort()
+     clean += extra_clean
+     noisy += extra_noisy
+
+
+ def match_files(noisy, clean, matching="sort"):
+     """match_files.
+     Sort files to match noisy and clean filenames.
+     :param noisy: list of the noisy filenames
+     :param clean: list of the clean filenames
+     :param matching: the matching function; "sort" and "dns" are supported
+     """
+     if matching == "dns":
+         # dns dataset filenames don't match when sorted, we have to manually match them
+         match_dns(noisy, clean)
+     elif matching == "sort":
+         noisy.sort()
+         clean.sort()
+     else:
+         raise ValueError(f"Invalid value for matching {matching}")
+
+
+ class NoisyCleanSet:
+     def __init__(self, json_dir, matching="sort", length=None, stride=None,
+                  pad=True, sample_rate=None):
+         """__init__.
+
+         :param json_dir: directory containing both clean.json and noisy.json
+         :param matching: matching function for the files
+         :param length: maximum sequence length
+         :param stride: the stride used for splitting audio sequences
+         :param pad: pad the end of the sequence with zeros
+         :param sample_rate: the signals' sampling rate
+         """
+         noisy_json = os.path.join(json_dir, 'noisy.json')
+         clean_json = os.path.join(json_dir, 'clean.json')
+         with open(noisy_json, 'r') as f:
+             noisy = json.load(f)
+         with open(clean_json, 'r') as f:
+             clean = json.load(f)
+
+         match_files(noisy, clean, matching)
+         kw = {'length': length, 'stride': stride, 'pad': pad, 'sample_rate': sample_rate}
+         self.clean_set = Audioset(clean, **kw)
+         self.noisy_set = Audioset(noisy, **kw)
+
+         assert len(self.clean_set) == len(self.noisy_set)
+
+     def __getitem__(self, index):
+         return self.noisy_set[index], self.clean_set[index]
+
+     def __len__(self):
+         return len(self.noisy_set)
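
A hedged usage sketch (the directory is hypothetical): `json_dir` must contain `noisy.json` and `clean.json`, e.g. dumped with `python -m denoiser.audio <dir> > noisy.json` as shown in audio.py above.

from denoiser.data import NoisyCleanSet

dataset = NoisyCleanSet("egs/valentini/tr", length=4 * 16_000,
                        stride=16_000, sample_rate=16_000)
noisy, clean = dataset[0]             # paired tensors of shape [channels, 64000]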
denoiser/demucs.py ADDED
@@ -0,0 +1,449 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ # author: adefossez
+
+ import math
+ import time
+
+ import torch as th
+ from torch import nn
+ from torch.nn import functional as F
+
+ from .resample import downsample2, upsample2
+ from .utils import capture_init
+
+
+ class BLSTM(nn.Module):
+     def __init__(self, dim, layers=2, bi=True):
+         super().__init__()
+         klass = nn.LSTM
+         self.lstm = klass(bidirectional=bi, num_layers=layers, hidden_size=dim, input_size=dim)
+         self.linear = None
+         if bi:
+             self.linear = nn.Linear(2 * dim, dim)
+
+     def forward(self, x, hidden=None):
+         x, hidden = self.lstm(x, hidden)
+         if self.linear:
+             x = self.linear(x)
+         return x, hidden
+
+
+ def rescale_conv(conv, reference):
+     std = conv.weight.std().detach()
+     scale = (std / reference)**0.5
+     conv.weight.data /= scale
+     if conv.bias is not None:
+         conv.bias.data /= scale
+
+
+ def rescale_module(module, reference):
+     for sub in module.modules():
+         if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d)):
+             rescale_conv(sub, reference)
+
+
+ class Demucs(nn.Module):
+     """
+     Demucs speech enhancement model.
+     Args:
+         - chin (int): number of input channels.
+         - chout (int): number of output channels.
+         - hidden (int): number of initial hidden channels.
+         - depth (int): number of layers.
+         - kernel_size (int): kernel size for each layer.
+         - stride (int): stride for each layer.
+         - causal (bool): if false, uses BiLSTM instead of LSTM.
+         - resample (int): amount of resampling to apply to the input/output.
+             Can be one of 1, 2 or 4.
+         - growth (float): number of channels is multiplied by this for every layer.
+         - max_hidden (int): maximum number of channels. Can be useful to
+             control the size/speed of the model.
+         - normalize (bool): if true, normalize the input.
+         - glu (bool): if true uses GLU instead of ReLU in 1x1 convolutions.
+         - rescale (float): controls custom weight initialization.
+             See https://arxiv.org/abs/1911.13254.
+         - floor (float): stability flooring when normalizing.
+     """
+     @capture_init
+     def __init__(self,
+                  chin=1,
+                  chout=1,
+                  hidden=48,
+                  depth=5,
+                  kernel_size=8,
+                  stride=4,
+                  causal=True,
+                  resample=4,
+                  growth=2,
+                  max_hidden=10_000,
+                  normalize=True,
+                  glu=True,
+                  rescale=0.1,
+                  floor=1e-3):
+
+         super().__init__()
+         if resample not in [1, 2, 4]:
+             raise ValueError("Resample should be 1, 2 or 4.")
+
+         self.chin = chin
+         self.chout = chout
+         self.hidden = hidden
+         self.depth = depth
+         self.kernel_size = kernel_size
+         self.stride = stride
+         self.causal = causal
+         self.floor = floor
+         self.resample = resample
+         self.normalize = normalize
+
+         self.encoder = nn.ModuleList()
+         self.decoder = nn.ModuleList()
+         activation = nn.GLU(1) if glu else nn.ReLU()
+         ch_scale = 2 if glu else 1
+
+         for index in range(depth):
+             encode = []
+             encode += [
+                 nn.Conv1d(chin, hidden, kernel_size, stride),
+                 nn.ReLU(),
+                 nn.Conv1d(hidden, hidden * ch_scale, 1), activation,
+             ]
+             self.encoder.append(nn.Sequential(*encode))
+
+             decode = []
+             decode += [
+                 nn.Conv1d(hidden, ch_scale * hidden, 1), activation,
+                 nn.ConvTranspose1d(hidden, chout, kernel_size, stride),
+             ]
+             if index > 0:
+                 decode.append(nn.ReLU())
+             self.decoder.insert(0, nn.Sequential(*decode))
+             chout = hidden
+             chin = hidden
+             hidden = min(int(growth * hidden), max_hidden)
+
+         self.lstm = BLSTM(chin, bi=not causal)
+         if rescale:
+             rescale_module(self, reference=rescale)
+
+     def valid_length(self, length):
+         """
+         Return the nearest valid length to use with the model so that
+         there are no time steps left over in the convolutions, i.e. for all
+         layers, (input length - kernel_size) % stride = 0.
+
+         If the mixture has a valid length, the estimated sources
+         will have exactly the same length.
+         """
+         length = math.ceil(length * self.resample)
+         for idx in range(self.depth):
+             length = math.ceil((length - self.kernel_size) / self.stride) + 1
+             length = max(length, 1)
+         for idx in range(self.depth):
+             length = (length - 1) * self.stride + self.kernel_size
+         length = int(math.ceil(length / self.resample))
+         return int(length)
+
+     @property
+     def total_stride(self):
+         return self.stride ** self.depth // self.resample
+
+     def forward(self, mix):
+         if mix.dim() == 2:
+             mix = mix.unsqueeze(1)
+
+         if self.normalize:
+             mono = mix.mean(dim=1, keepdim=True)
+             std = mono.std(dim=-1, keepdim=True)
+             mix = mix / (self.floor + std)
+         else:
+             std = 1
+         length = mix.shape[-1]
+         x = mix
+         x = F.pad(x, (0, self.valid_length(length) - length))
+         if self.resample == 2:
+             x = upsample2(x)
+         elif self.resample == 4:
+             x = upsample2(x)
+             x = upsample2(x)
+         skips = []
+         for encode in self.encoder:
+             x = encode(x)
+             skips.append(x)
+         x = x.permute(2, 0, 1)
+         x, _ = self.lstm(x)
+         x = x.permute(1, 2, 0)
+         for decode in self.decoder:
+             skip = skips.pop(-1)
+             x = x + skip[..., :x.shape[-1]]
+             x = decode(x)
+         if self.resample == 2:
+             x = downsample2(x)
+         elif self.resample == 4:
+             x = downsample2(x)
+             x = downsample2(x)
+
+         x = x[..., :length]
+         return std * x
+
+
+ def fast_conv(conv, x):
+     """
+     Faster convolution evaluation if either kernel size is 1
+     or length of sequence is 1.
+     """
+     batch, chin, length = x.shape
+     chout, chin, kernel = conv.weight.shape
+     assert batch == 1
+     if kernel == 1:
+         x = x.view(chin, length)
+         out = th.addmm(conv.bias.view(-1, 1),
+                        conv.weight.view(chout, chin), x)
+     elif length == kernel:
+         x = x.view(chin * kernel, 1)
+         out = th.addmm(conv.bias.view(-1, 1),
+                        conv.weight.view(chout, chin * kernel), x)
+     else:
+         out = conv(x)
+     return out.view(batch, chout, -1)
+
+
+ class DemucsStreamer:
+     """
+     Streaming implementation for Demucs. It supports being fed with any amount
+     of audio at a time. You will get back as much audio as possible at that
+     point.
+
+     Args:
+         - demucs (Demucs): Demucs model.
+         - dry (float): amount of dry (i.e. input) signal to keep. 0 is maximum
+             noise removal, 1 just returns the input signal. Small values > 0
+             help limit distortions.
+         - num_frames (int): number of frames to process at once. Higher values
+             will increase overall latency but improve the real time factor.
+         - resample_lookahead (int): extra lookahead used for the resampling.
+         - resample_buffer (int): size of the buffer of previous inputs/outputs
+             kept for resampling.
+     """
+     def __init__(self, demucs,
+                  dry=0,
+                  num_frames=1,
+                  resample_lookahead=64,
+                  resample_buffer=256):
+         device = next(iter(demucs.parameters())).device
+         self.demucs = demucs
+         self.lstm_state = None
+         self.conv_state = None
+         self.dry = dry
+         self.resample_lookahead = resample_lookahead
+         self.resample_buffer = resample_buffer
+         self.frame_length = demucs.valid_length(1) + demucs.total_stride * (num_frames - 1)
+         self.total_length = self.frame_length + self.resample_lookahead
+         self.stride = demucs.total_stride * num_frames
+         self.resample_in = th.zeros(demucs.chin, resample_buffer, device=device)
+         self.resample_out = th.zeros(demucs.chin, resample_buffer, device=device)
+
+         self.frames = 0
+         self.total_time = 0
+         self.variance = 0
+         self.pending = th.zeros(demucs.chin, 0, device=device)
+
+         bias = demucs.decoder[0][2].bias
+         weight = demucs.decoder[0][2].weight
+         chin, chout, kernel = weight.shape
+         self._bias = bias.view(-1, 1).repeat(1, kernel).view(-1, 1)
+         self._weight = weight.permute(1, 2, 0).contiguous()
+
+     def reset_time_per_frame(self):
+         self.total_time = 0
+         self.frames = 0
+
+     @property
+     def time_per_frame(self):
+         return self.total_time / self.frames
+
+     def flush(self):
+         """
+         Flush remaining audio by padding it with zero. Call this
+         when you have no more input and want to get back the last chunk of audio.
+         """
+         pending_length = self.pending.shape[1]
+         padding = th.zeros(self.demucs.chin, self.total_length, device=self.pending.device)
+         out = self.feed(padding)
+         return out[:, :pending_length]
+
+     def feed(self, wav):
+         """
+         Apply the model to `wav` using true real-time evaluation.
+         Normalization and resampling are done online.
+         """
+         begin = time.time()
+         demucs = self.demucs
+         resample_buffer = self.resample_buffer
+         stride = self.stride
+         resample = demucs.resample
+
+         if wav.dim() != 2:
+             raise ValueError("input wav should be two dimensional.")
+         chin, _ = wav.shape
+         if chin != demucs.chin:
+             raise ValueError(f"Expected {demucs.chin} channels, got {chin}")
+
+         self.pending = th.cat([self.pending, wav], dim=1)
+         outs = []
+         while self.pending.shape[1] >= self.total_length:
+             self.frames += 1
+             frame = self.pending[:, :self.total_length]
+             dry_signal = frame[:, :stride]
+             if demucs.normalize:
+                 mono = frame.mean(0)
+                 variance = (mono**2).mean()
+                 self.variance = variance / self.frames + (1 - 1 / self.frames) * self.variance
+                 frame = frame / (demucs.floor + math.sqrt(self.variance))
+             frame = th.cat([self.resample_in, frame], dim=-1)
+             self.resample_in[:] = frame[:, stride - resample_buffer:stride]
+
+             if resample == 4:
+                 frame = upsample2(upsample2(frame))
+             elif resample == 2:
+                 frame = upsample2(frame)
+             frame = frame[:, resample * resample_buffer:]  # remove pre-sampling buffer
+             frame = frame[:, :resample * self.frame_length]  # remove extra samples after window
+
+             out, extra = self._separate_frame(frame)
+             padded_out = th.cat([self.resample_out, out, extra], 1)
+             self.resample_out[:] = out[:, -resample_buffer:]
+             if resample == 4:
+                 out = downsample2(downsample2(padded_out))
+             elif resample == 2:
+                 out = downsample2(padded_out)
+             else:
+                 out = padded_out
+
+             out = out[:, resample_buffer // resample:]
+             out = out[:, :stride]
+
+             if demucs.normalize:
+                 out *= math.sqrt(self.variance)
+             out = self.dry * dry_signal + (1 - self.dry) * out
+             outs.append(out)
+             self.pending = self.pending[:, stride:]
+
+         self.total_time += time.time() - begin
+         if outs:
+             out = th.cat(outs, 1)
+         else:
+             out = th.zeros(chin, 0, device=wav.device)
+         return out
+
+     def _separate_frame(self, frame):
+         demucs = self.demucs
+         skips = []
+         next_state = []
+         first = self.conv_state is None
+         stride = self.stride * demucs.resample
+         x = frame[None]
+         for idx, encode in enumerate(demucs.encoder):
+             stride //= demucs.stride
+             length = x.shape[2]
+             if idx == demucs.depth - 1:
+                 # This is slightly faster for the last conv
+                 x = fast_conv(encode[0], x)
+                 x = encode[1](x)
+                 x = fast_conv(encode[2], x)
+                 x = encode[3](x)
+             else:
+                 if not first:
+                     prev = self.conv_state.pop(0)
+                     prev = prev[..., stride:]
+                     tgt = (length - demucs.kernel_size) // demucs.stride + 1
+                     missing = tgt - prev.shape[-1]
+                     offset = length - demucs.kernel_size - demucs.stride * (missing - 1)
+                     x = x[..., offset:]
+                 x = encode[1](encode[0](x))
+                 x = fast_conv(encode[2], x)
+                 x = encode[3](x)
+                 if not first:
+                     x = th.cat([prev, x], -1)
+                 next_state.append(x)
+             skips.append(x)
+
+         x = x.permute(2, 0, 1)
+         x, self.lstm_state = demucs.lstm(x, self.lstm_state)
+         x = x.permute(1, 2, 0)
+         # In the following, x contains only correct samples, i.e. the ones
+         # for which each time position is covered by two windows of the upper layer.
+         # extra contains extra samples to the right, and is used only as
+         # better padding for the online resampling.
+         extra = None
+         for idx, decode in enumerate(demucs.decoder):
+             skip = skips.pop(-1)
+             x += skip[..., :x.shape[-1]]
+             x = fast_conv(decode[0], x)
+             x = decode[1](x)
+
+             if extra is not None:
+                 skip = skip[..., x.shape[-1]:]
+                 extra += skip[..., :extra.shape[-1]]
+                 extra = decode[2](decode[1](decode[0](extra)))
+             x = decode[2](x)
+             next_state.append(x[..., -demucs.stride:] - decode[2].bias.view(-1, 1))
+             if extra is None:
+                 extra = x[..., -demucs.stride:]
+             else:
+                 extra[..., :demucs.stride] += next_state[-1]
+             x = x[..., :-demucs.stride]
+
+             if not first:
+                 prev = self.conv_state.pop(0)
+                 x[..., :demucs.stride] += prev
+             if idx != demucs.depth - 1:
+                 x = decode[3](x)
+                 extra = decode[3](extra)
+         self.conv_state = next_state
+         return x[0], extra[0]
+
+
+ def test():
+     import argparse
+     parser = argparse.ArgumentParser(
+         "denoiser.demucs",
+         description="Benchmark the streaming Demucs implementation, "
+                     "as well as checking the delta with the offline implementation.")
+     parser.add_argument("--resample", default=4, type=int)
+     parser.add_argument("--hidden", default=48, type=int)
+     parser.add_argument("--device", default="cpu")
+     parser.add_argument("-t", "--num_threads", type=int)
+     parser.add_argument("-f", "--num_frames", type=int, default=1)
+     args = parser.parse_args()
+     if args.num_threads:
+         th.set_num_threads(args.num_threads)
+     sr = 16_000
+     sr_ms = sr / 1000
+     demucs = Demucs(hidden=args.hidden, resample=args.resample).to(args.device)
+     x = th.randn(1, sr * 4).to(args.device)
+     out = demucs(x[None])[0]
+     streamer = DemucsStreamer(demucs, num_frames=args.num_frames)
+     out_rt = []
+     frame_size = streamer.total_length
+     with th.no_grad():
+         while x.shape[1] > 0:
+             out_rt.append(streamer.feed(x[:, :frame_size]))
+             x = x[:, frame_size:]
+             frame_size = streamer.demucs.total_stride
+     out_rt.append(streamer.flush())
+     out_rt = th.cat(out_rt, 1)
+     print(f"total lag: {streamer.total_length / sr_ms:.1f}ms, ", end='')
+     print(f"stride: {streamer.stride / sr_ms:.1f}ms, ", end='')
+     print(f"time per frame: {1000 * streamer.time_per_frame:.1f}ms, ", end='')
+     print(f"delta: {th.norm(out - out_rt) / th.norm(out):.2%}, ", end='')
+     print(f"RTF: {((1000 * streamer.time_per_frame) / (streamer.stride / sr_ms)):.1f}")
+
+
+ if __name__ == "__main__":
+     test()
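
Complementing `test()` above, a minimal sketch of the streaming API (the chunk size and `dry` value are arbitrary): feed chunks of any size, then flush once to recover the pending tail.

import torch as th
from denoiser.demucs import Demucs, DemucsStreamer

model = Demucs(hidden=48).eval()
streamer = DemucsStreamer(model, dry=0.05)
with th.no_grad():
    outs = [streamer.feed(th.randn(1, 4096)) for _ in range(10)]
    outs.append(streamer.flush())     # zero-pads and returns the remaining audio
denoised = th.cat(outs, dim=1)        # [1, 40960]: every input sample accounted for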
denoiser/distrib.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ # author: adefossez
+
+ import logging
+ import os
+
+ import torch
+ from torch.utils.data.distributed import DistributedSampler
+ from torch.utils.data import DataLoader, Subset
+ from torch.nn.parallel.distributed import DistributedDataParallel
+
+ logger = logging.getLogger(__name__)
+ rank = 0
+ world_size = 1
+
+
+ def init(args):
+     """init.
+
+     Initialize DDP using the given rendezvous file.
+     """
+     global rank, world_size
+     if args.ddp:
+         assert args.rank is not None and args.world_size is not None
+         rank = args.rank
+         world_size = args.world_size
+     if world_size == 1:
+         return
+     torch.cuda.set_device(rank)
+     torch.distributed.init_process_group(
+         backend=args.ddp_backend,
+         init_method='file://' + os.path.abspath(args.rendezvous_file),
+         world_size=world_size,
+         rank=rank)
+     logger.debug("Distributed rendezvous went well, rank %d/%d", rank, world_size)
+
+
+ def average(metrics, count=1.):
+     """average.
+
+     Average all the relevant metrics across processes.
+     `metrics` should be a 1D float32 vector. Returns the average of `metrics`
+     over all hosts. You can use `count` to control the weight of each worker.
+     """
+     if world_size == 1:
+         return metrics
+     tensor = torch.tensor(list(metrics) + [1], device='cuda', dtype=torch.float32)
+     tensor *= count
+     torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM)
+     return (tensor[:-1] / tensor[-1]).cpu().numpy().tolist()
+
+
+ def wrap(model):
+     """wrap.
+
+     Wrap a model with DDP if distributed training is enabled.
+     """
+     if world_size == 1:
+         return model
+     else:
+         return DistributedDataParallel(
+             model,
+             device_ids=[torch.cuda.current_device()],
+             output_device=torch.cuda.current_device())
+
+
+ def barrier():
+     if world_size > 1:
+         torch.distributed.barrier()
+
+
+ def loader(dataset, *args, shuffle=False, klass=DataLoader, **kwargs):
+     """loader.
+
+     Create a dataloader properly in case of distributed training.
+     If a gradient is going to be computed you must set `shuffle=True`.
+
+     :param dataset: the dataset to be parallelized
+     :param args: relevant args for the loader
+     :param shuffle: shuffle examples
+     :param klass: loader class
+     :param kwargs: relevant args
+     """
+
+     if world_size == 1:
+         return klass(dataset, *args, shuffle=shuffle, **kwargs)
+
+     if shuffle:
+         # train means we will compute backward, we use DistributedSampler
+         sampler = DistributedSampler(dataset)
+         # We ignore shuffle, DistributedSampler already shuffles
+         return klass(dataset, *args, **kwargs, sampler=sampler)
+     else:
+         # We make a manual shard, as DistributedSampler otherwise replicates some examples
+         dataset = Subset(dataset, list(range(rank, len(dataset), world_size)))
+         # Forward `kwargs` here too, so options like `num_workers` are not silently dropped.
+         return klass(dataset, *args, shuffle=shuffle, **kwargs)
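
These helpers degrade gracefully when `world_size == 1`, so the same training code runs unmodified on a single process. A hedged sketch of non-distributed usage (the dataset and model below are placeholders):

# Single-process usage sketch: every helper is a no-op passthrough here.
import torch
from denoiser import distrib

dataset = torch.utils.data.TensorDataset(torch.randn(8, 4))
dl = distrib.loader(dataset, batch_size=2, shuffle=True)  # plain DataLoader
model = distrib.wrap(torch.nn.Linear(4, 2))               # model returned as-is
distrib.barrier()                                         # does nothing
print(distrib.average([0.5, 1.5]))                        # metrics unchanged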
denoiser/dsp.py ADDED
@@ -0,0 +1,64 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ # author: adefossez
+
+ import numpy as np
+ import torch
+ from torch.nn import functional as F
+
+
+ def hz_to_mel(f):
+     return 2595 * np.log10(1 + f / 700)
+
+
+ def mel_to_hz(m):
+     return 700 * (10**(m / 2595) - 1)
+
+
+ def mel_frequencies(n_mels, fmin, fmax):
+     low = hz_to_mel(fmin)
+     high = hz_to_mel(fmax)
+     mels = np.linspace(low, high, n_mels)
+     return mel_to_hz(mels)
+
+
+ class LowPassFilters(torch.nn.Module):
+     """
+     Bank of low pass filters.
+
+     Args:
+         cutoffs (list[float]): list of cutoff frequencies, in [0, 1] expressed as `f/f_s` where
+             f_s is the samplerate.
+         width (int): width of the filters (i.e. kernel_size=2 * width + 1).
+             Defaults to `2 / min(cutoffs)`. Longer filters will have better attenuation
+             but more side effects.
+     Shape:
+         - Input: `(*, T)`
+         - Output: `(F, *, T)` with `F` the len of `cutoffs`.
+     """
+
+     def __init__(self, cutoffs: list, width: int = None):
+         super().__init__()
+         self.cutoffs = cutoffs
+         if width is None:
+             width = int(2 / min(cutoffs))
+         self.width = width
+         window = torch.hamming_window(2 * width + 1, periodic=False)
+         t = np.arange(-width, width + 1, dtype=np.float32)
+         filters = []
+         for cutoff in cutoffs:
+             sinc = torch.from_numpy(np.sinc(2 * cutoff * t))
+             filters.append(2 * cutoff * sinc * window)
+         self.register_buffer("filters", torch.stack(filters).unsqueeze(1))
+
+     def forward(self, input):
+         *others, t = input.shape
+         input = input.view(-1, 1, t)
+         out = F.conv1d(input, self.filters, padding=self.width)
+         return out.permute(1, 0, 2).reshape(-1, *others, t)
+
+     def __repr__(self):
+         return "LowPassFilters(width={},cutoffs={})".format(self.width, self.cutoffs)
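
Following the docstring's shape convention, a minimal usage sketch of the filter bank and the mel helpers:

# Sketch: one low-pass filter with cutoff 1 kHz at a 16 kHz sample rate.
import torch
from denoiser.dsp import LowPassFilters, mel_frequencies

lp = LowPassFilters([1000 / 16000])   # cutoff expressed as f / f_s
x = torch.randn(2, 16000)             # input shape (*, T)
y = lp(x)                             # output shape (F, *, T), here (1, 2, 16000)
print(y.shape, mel_frequencies(5, 20, 8000))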
denoiser/enhance.py ADDED
@@ -0,0 +1,138 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ # author: adiyoss
+
+ import argparse
+ import json
+ import logging
+ import os
+ import sys
+
+ import torch
+ import torchaudio
+
+ from .audio import Audioset, find_audio_files
+ from . import distrib, pretrained
+ from .demucs import DemucsStreamer
+
+ from .utils import LogProgress
+
+ logger = logging.getLogger(__name__)
+
+
+ def add_flags(parser):
+     """
+     Add the flags for the argument parser that are related to model loading and evaluation.
+     """
+     pretrained.add_model_flags(parser)
+     parser.add_argument('--device', default="cpu")
+     parser.add_argument('--dry', type=float, default=0,
+                         help='dry/wet knob coefficient. 0 is only denoised, 1 is only the input signal.')
+     parser.add_argument('--sample_rate', default=16_000, type=int, help='sample rate')
+     parser.add_argument('--num_workers', type=int, default=10)
+     parser.add_argument('--streaming', action="store_true",
+                         help="true streaming evaluation for Demucs")
+
+
+ parser = argparse.ArgumentParser(
+     'denoiser.enhance',
+     description="Speech enhancement using Demucs - Generate enhanced files")
+ add_flags(parser)
+ parser.add_argument("--out_dir", type=str, default="enhanced",
+                     help="directory where enhanced wav files are written")
+ parser.add_argument("--batch_size", default=1, type=int, help="batch size")
+ parser.add_argument('-v', '--verbose', action='store_const', const=logging.DEBUG,
+                     default=logging.INFO, help="more logging")
+
+ group = parser.add_mutually_exclusive_group()
+ group.add_argument("--noisy_dir", type=str, default=None,
+                    help="directory including noisy wav files")
+ group.add_argument("--noisy_json", type=str, default=None,
+                    help="json file including noisy wav files")
+
+
+ def get_estimate(model, noisy, args):
+     torch.set_num_threads(1)
+     if args.streaming:
+         streamer = DemucsStreamer(model, dry=args.dry)
+         with torch.no_grad():
+             estimate = torch.cat([
+                 streamer.feed(noisy[0]),
+                 streamer.flush()], dim=1)[None]
+     else:
+         with torch.no_grad():
+             estimate = model(noisy)
+             estimate = (1 - args.dry) * estimate + args.dry * noisy
+     return estimate
+
+
+ def save_wavs(estimates, noisy_sigs, filenames, out_dir, sr=16_000):
+     # Write result
+     for estimate, noisy, filename in zip(estimates, noisy_sigs, filenames):
+         filename = os.path.join(out_dir, os.path.basename(filename).rsplit(".", 1)[0])
+         write(noisy, filename + "_noisy.wav", sr=sr)
+         write(estimate, filename + "_enhanced.wav", sr=sr)
+
+
+ def write(wav, filename, sr=16_000):
+     # Normalize audio if it prevents clipping
+     wav = wav / max(wav.abs().max().item(), 1)
+     torchaudio.save(filename, wav.cpu(), sr)
+
+
+ def get_dataset(args):
+     if hasattr(args, 'dset'):
+         paths = args.dset
+     else:
+         paths = args
+     if paths.noisy_json:
+         with open(paths.noisy_json) as f:
+             files = json.load(f)
+     elif paths.noisy_dir:
+         files = find_audio_files(paths.noisy_dir)
+     else:
+         logger.warning(
+             "Small sample set was not provided by either noisy_dir or noisy_json. "
+             "Skipping enhancement.")
+         return None
+     return Audioset(files, with_path=True, sample_rate=args.sample_rate)
+
+
+ def enhance(args, model=None, local_out_dir=None):
+     # Load model
+     if not model:
+         model = pretrained.get_model(args).to(args.device)
+     model.eval()
+     if local_out_dir:
+         out_dir = local_out_dir
+     else:
+         out_dir = args.out_dir
+
+     dset = get_dataset(args)
+     if dset is None:
+         return
+     loader = distrib.loader(dset, batch_size=1)
+
+     if distrib.rank == 0:
+         os.makedirs(out_dir, exist_ok=True)
+     distrib.barrier()
+
+     with torch.no_grad():
+         iterator = LogProgress(logger, loader, name="Generate enhanced files")
+         for data in iterator:
+             # Get batch data
+             noisy_signals, filenames = data
+             noisy_signals = noisy_signals.to(args.device)
+             # Forward
+             estimate = get_estimate(model, noisy_signals, args)
+             save_wavs(estimate, noisy_signals, filenames, out_dir, sr=args.sample_rate)
+
+
+ if __name__ == "__main__":
+     args = parser.parse_args()
+     logging.basicConfig(stream=sys.stderr, level=args.verbose)
+     logger.debug(args)
+     enhance(args, local_out_dir=args.out_dir)
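
The module is primarily driven from the command line (`python -m denoiser.enhance --dns48 --noisy_dir ... --out_dir ...`), but `get_estimate` can also be called directly. A hedged sketch, with a hypothetical `argparse.Namespace` standing in for parsed flags and weights downloaded on first use:

# Denoise an in-memory batch with a pretrained model, reusing get_estimate.
import argparse
import torch
from denoiser import pretrained
from denoiser.enhance import get_estimate

args = argparse.Namespace(dry=0.04, streaming=False, device="cpu")
model = pretrained.dns48()
model.eval()
noisy = torch.randn(1, 1, 16_000)  # (batch, channels, time) at 16 kHz
denoised = get_estimate(model, noisy, args)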
denoiser/evaluate.py ADDED
@@ -0,0 +1,136 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ # author: adiyoss
+
+ import argparse
+ from concurrent.futures import ProcessPoolExecutor
+ import json
+ import logging
+ import sys
+
+ from pesq import pesq
+ from pystoi import stoi
+ import torch
+
+ from .data import NoisyCleanSet
+ from .enhance import add_flags, get_estimate
+ from . import distrib, pretrained
+ from .utils import bold, LogProgress
+
+ logger = logging.getLogger(__name__)
+
+ parser = argparse.ArgumentParser(
+     'denoiser.evaluate',
+     description='Speech enhancement using Demucs - Evaluate model performance')
+ add_flags(parser)
+ parser.add_argument('--data_dir', help='directory including noisy.json and clean.json files')
+ parser.add_argument('--matching', default="sort", help='set this to dns for the dns dataset.')
+ parser.add_argument('--no_pesq', action="store_false", dest="pesq", default=True,
+                     help="Don't compute PESQ.")
+ parser.add_argument('-v', '--verbose', action='store_const', const=logging.DEBUG,
+                     default=logging.INFO, help="More logging")
+
+
+ def evaluate(args, model=None, data_loader=None):
+     total_pesq = 0
+     total_stoi = 0
+     total_cnt = 0
+     updates = 5
+
+     # Load model
+     if not model:
+         model = pretrained.get_model(args).to(args.device)
+     model.eval()
+
+     # Load data
+     if data_loader is None:
+         dataset = NoisyCleanSet(args.data_dir, matching=args.matching, sample_rate=args.sample_rate)
+         data_loader = distrib.loader(dataset, batch_size=1, num_workers=2)
+     pendings = []
+     with ProcessPoolExecutor(args.num_workers) as pool:
+         with torch.no_grad():
+             iterator = LogProgress(logger, data_loader, name="Eval estimates")
+             for i, data in enumerate(iterator):
+                 # Get batch data
+                 noisy, clean = [x.to(args.device) for x in data]
+                 # If device is CPU, we do parallel evaluation in each CPU worker.
+                 if args.device == 'cpu':
+                     pendings.append(
+                         pool.submit(_estimate_and_run_metrics, clean, model, noisy, args))
+                 else:
+                     estimate = get_estimate(model, noisy, args)
+                     estimate = estimate.cpu()
+                     clean = clean.cpu()
+                     pendings.append(
+                         pool.submit(_run_metrics, clean, estimate, args))
+                 total_cnt += clean.shape[0]
+
+         for pending in LogProgress(logger, pendings, updates, name="Eval metrics"):
+             pesq_i, stoi_i = pending.result()
+             total_pesq += pesq_i
+             total_stoi += stoi_i
+
+     metrics = [total_pesq, total_stoi]
+     pesq, stoi = distrib.average([m / total_cnt for m in metrics], total_cnt)
+     logger.info(bold(f'Test set performance: PESQ={pesq}, STOI={stoi}.'))
+     return pesq, stoi
+
+
+ def _estimate_and_run_metrics(clean, model, noisy, args):
+     estimate = get_estimate(model, noisy, args)
+     return _run_metrics(clean, estimate, args)
+
+
+ def _run_metrics(clean, estimate, args):
+     estimate = estimate.numpy()[:, 0]
+     clean = clean.numpy()[:, 0]
+     if args.pesq:
+         pesq_i = get_pesq(clean, estimate, sr=args.sample_rate)
+     else:
+         pesq_i = 0
+     stoi_i = get_stoi(clean, estimate, sr=args.sample_rate)
+     return pesq_i, stoi_i
+
+
+ def get_pesq(ref_sig, out_sig, sr):
+     """Calculate PESQ.
+     Args:
+         ref_sig: numpy.ndarray, [B, T]
+         out_sig: numpy.ndarray, [B, T]
+     Returns:
+         PESQ
+     """
+     pesq_val = 0
+     for i in range(len(ref_sig)):
+         pesq_val += pesq(sr, ref_sig[i], out_sig[i], 'wb')
+     return pesq_val
+
+
+ def get_stoi(ref_sig, out_sig, sr):
+     """Calculate STOI.
+     Args:
+         ref_sig: numpy.ndarray, [B, T]
+         out_sig: numpy.ndarray, [B, T]
+     Returns:
+         STOI
+     """
+     stoi_val = 0
+     for i in range(len(ref_sig)):
+         stoi_val += stoi(ref_sig[i], out_sig[i], sr, extended=False)
+     return stoi_val
+
+
+ def main():
+     args = parser.parse_args()
+     logging.basicConfig(stream=sys.stderr, level=args.verbose)
+     logger.debug(args)
+     pesq, stoi = evaluate(args)
+     json.dump({'pesq': pesq, 'stoi': stoi}, sys.stdout)
+     sys.stdout.write('\n')
+
+
+ if __name__ == '__main__':
+     main()
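
Note that `get_pesq` and `get_stoi` sum per-utterance scores over the batch; the division by the utterance count happens later in `evaluate`. A quick sanity sketch (assuming the `pesq` and `pystoi` packages are installed):

# Identical reference and degraded signals should score near the metric ceilings.
import numpy as np
from denoiser.evaluate import get_pesq, get_stoi

sr = 16000
sig = np.random.randn(1, sr).astype(np.float32)
print(get_pesq(sig, sig, sr=sr))  # near the top of the PESQ scale
print(get_stoi(sig, sig, sr=sr))  # 1.0 for identical signals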
denoiser/executor.py ADDED
@@ -0,0 +1,79 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ # author: adefossez
+ """
+ Start multiple processes locally for DDP.
+ """
+
+ import logging
+ import subprocess as sp
+ import sys
+
+ from hydra import utils
+
+ logger = logging.getLogger(__name__)
+
+
+ class ChildrenManager:
+     def __init__(self):
+         self.children = []
+         self.failed = False
+
+     def add(self, child):
+         child.rank = len(self.children)
+         self.children.append(child)
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_value, traceback):
+         if exc_value is not None:
+             logger.error("An exception happened while starting workers %r", exc_value)
+             self.failed = True
+         try:
+             while self.children and not self.failed:
+                 for child in list(self.children):
+                     try:
+                         exitcode = child.wait(0.1)
+                     except sp.TimeoutExpired:
+                         continue
+                     else:
+                         self.children.remove(child)
+                         if exitcode:
+                             logger.error(f"Worker {child.rank} died, killing all workers")
+                             self.failed = True
+         except KeyboardInterrupt:
+             logger.error("Received keyboard interrupt, trying to kill all workers.")
+             self.failed = True
+         for child in self.children:
+             child.terminate()
+         if not self.failed:
+             logger.info("All workers completed successfully")
+
+
+ def start_ddp_workers():
+     import torch as th
+
+     world_size = th.cuda.device_count()
+     if not world_size:
+         logger.error(
+             "DDP is only available on GPU. Make sure GPUs are properly configured with cuda.")
+         sys.exit(1)
+     logger.info(f"Starting {world_size} worker processes for DDP.")
+     with ChildrenManager() as manager:
+         for rank in range(world_size):
+             kwargs = {}
+             argv = list(sys.argv)
+             argv += [f"world_size={world_size}", f"rank={rank}"]
+             if rank > 0:
+                 kwargs['stdin'] = sp.DEVNULL
+                 kwargs['stdout'] = sp.DEVNULL
+                 kwargs['stderr'] = sp.DEVNULL
+                 log = utils.HydraConfig().hydra.job_logging.handlers.file.filename
+                 log += f".{rank}"
+                 argv.append("hydra.job_logging.handlers.file.filename=" + log)
+             manager.add(sp.Popen([sys.executable] + argv, cwd=utils.get_original_cwd(), **kwargs))
+     sys.exit(int(manager.failed))
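
`ChildrenManager` itself is not tied to Hydra and can be exercised on its own. A hypothetical standalone sketch with trivial workers (hydra must still be installed, since importing the module pulls it in):

# Spawn two short-lived workers and wait for them through the context manager.
import subprocess as sp
import sys
from denoiser.executor import ChildrenManager

with ChildrenManager() as manager:
    for _ in range(2):
        manager.add(sp.Popen([sys.executable, "-c", "print('worker done')"]))
print("failed:", manager.failed)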
denoiser/live.py ADDED
@@ -0,0 +1,161 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ # author: adefossez
+
+ import argparse
+ import sys
+
+ import sounddevice as sd
+ import torch
+
+ from .demucs import DemucsStreamer
+ from .pretrained import add_model_flags, get_model
+ from .utils import bold
+
+
+ def get_parser():
+     parser = argparse.ArgumentParser(
+         "denoiser.live",
+         description="Performs live speech enhancement, reading audio from "
+                     "the default mic (or interface specified by --in) and "
+                     "writing the enhanced version to 'Soundflower (2ch)' "
+                     "(or the interface specified by --out)."
+     )
+     parser.add_argument(
+         "-i", "--in", dest="in_",
+         help="name or index of input interface.")
+     parser.add_argument(
+         "-o", "--out", default="Soundflower (2ch)",
+         help="name or index of output interface.")
+     add_model_flags(parser)
+     parser.add_argument(
+         "--sample_rate", type=int, default=16_000,
+         help="Sample rate")
+     parser.add_argument(
+         "--no_compressor", action="store_false", dest="compressor",
+         help="Deactivate compressor on output, might lead to clipping.")
+     parser.add_argument(
+         "--device", default="cpu")
+     parser.add_argument(
+         "--dry", type=float, default=0.04,
+         help="Dry/wet knob, between 0 and 1. 0=maximum noise removal "
+              "but it might cause distortions. Default is 0.04")
+     parser.add_argument(
+         "-t", "--num_threads", type=int,
+         help="Number of threads. If you have DDR3 RAM, setting -t 1 can "
+              "improve performance.")
+     parser.add_argument(
+         "-f", "--num_frames", type=int, default=1,
+         help="Number of frames to process at once. Larger values increase "
+              "the overall lag, but will improve speed.")
+     return parser
+
+
+ def parse_audio_device(device):
+     if device is None:
+         return device
+     try:
+         return int(device)
+     except ValueError:
+         return device
+
+
+ def query_devices(device, kind):
+     try:
+         caps = sd.query_devices(device, kind=kind)
+     except ValueError:
+         message = bold(f"Invalid {kind} audio interface {device}.\n")
+         message += (
+             "If you are on Mac OS X, try installing Soundflower "
+             "(https://github.com/mattingalls/Soundflower).\n"
+             "You can list available interfaces with `python3 -m sounddevice` on Linux and OS X, "
+             "and `python.exe -m sounddevice` on Windows. You must have at least one loopback "
+             "audio interface to use this.")
+         print(message, file=sys.stderr)
+         sys.exit(1)
+     return caps
+
+
+ def main():
+     args = get_parser().parse_args()
+     if args.num_threads:
+         torch.set_num_threads(args.num_threads)
+
+     model = get_model(args).to(args.device)
+     model.eval()
+     print("Model loaded.")
+     streamer = DemucsStreamer(model, dry=args.dry, num_frames=args.num_frames)
+
+     device_in = parse_audio_device(args.in_)
+     caps = query_devices(device_in, "input")
+     channels_in = min(caps['max_input_channels'], 2)
+     stream_in = sd.InputStream(
+         device=device_in,
+         samplerate=args.sample_rate,
+         channels=channels_in)
+
+     device_out = parse_audio_device(args.out)
+     caps = query_devices(device_out, "output")
+     channels_out = min(caps['max_output_channels'], 2)
+     stream_out = sd.OutputStream(
+         device=device_out,
+         samplerate=args.sample_rate,
+         channels=channels_out)
+
+     stream_in.start()
+     stream_out.start()
+     first = True
+     current_time = 0
+     last_log_time = 0
+     last_error_time = 0
+     cooldown_time = 2
+     log_delta = 10
+     sr_ms = args.sample_rate / 1000
+     stride_ms = streamer.stride / sr_ms
+     print(f"Ready to process audio, total lag: {streamer.total_length / sr_ms:.1f}ms.")
+     while True:
+         try:
+             if current_time > last_log_time + log_delta:
+                 last_log_time = current_time
+                 tpf = streamer.time_per_frame * 1000
+                 rtf = tpf / stride_ms
+                 print(f"time per frame: {tpf:.1f}ms, ", end='')
+                 print(f"RTF: {rtf:.1f}")
+                 streamer.reset_time_per_frame()
+
+             length = streamer.total_length if first else streamer.stride
+             first = False
+             current_time += length / args.sample_rate
+             frame, overflow = stream_in.read(length)
+             frame = torch.from_numpy(frame).mean(dim=1).to(args.device)
+             with torch.no_grad():
+                 out = streamer.feed(frame[None])[0]
+             if not out.numel():
+                 continue
+             if args.compressor:
+                 out = 0.99 * torch.tanh(out)
+             out = out[:, None].repeat(1, channels_out)
+             mx = out.abs().max().item()
+             if mx > 1:
+                 print("Clipping!!")
+             out.clamp_(-1, 1)
+             out = out.cpu().numpy()
+             underflow = stream_out.write(out)
+             if overflow or underflow:
+                 if current_time >= last_error_time + cooldown_time:
+                     last_error_time = current_time
+                     tpf = 1000 * streamer.time_per_frame
+                     print(f"Not processing audio fast enough, time per frame is {tpf:.1f}ms "
+                           f"(should be less than {stride_ms:.1f}ms).")
+         except KeyboardInterrupt:
+             print("Stopping")
+             break
+     stream_out.stop()
+     stream_in.stop()
+
+
+ if __name__ == "__main__":
+     main()
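
Before launching `python -m denoiser.live`, it helps to confirm which audio interfaces `sounddevice` can see; the indices or names printed below are what `--in`/`--out` expect:

# List the audio interfaces available to sounddevice.
import sounddevice as sd
print(sd.query_devices())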
denoiser/pretrained.py ADDED
@@ -0,0 +1,72 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ # author: adefossez
+
+ import logging
+
+ import torch.hub
+
+ from .demucs import Demucs
+ from .utils import deserialize_model
+
+ logger = logging.getLogger(__name__)
+ ROOT = "https://dl.fbaipublicfiles.com/adiyoss/denoiser/"
+ DNS_48_URL = ROOT + "dns48-11decc9d8e3f0998.th"
+ DNS_64_URL = ROOT + "dns64-a7761ff99a7d5bb6.th"
+ MASTER_64_URL = ROOT + "master64-8a5dfb4bb92753dd.th"
+
+
+ def _demucs(pretrained, url, **kwargs):
+     model = Demucs(**kwargs)
+     if pretrained:
+         state_dict = torch.hub.load_state_dict_from_url(url, map_location='cpu')
+         model.load_state_dict(state_dict)
+     return model
+
+
+ def dns48(pretrained=True):
+     return _demucs(pretrained, DNS_48_URL, hidden=48)
+
+
+ def dns64(pretrained=True):
+     return _demucs(pretrained, DNS_64_URL, hidden=64)
+
+
+ def master64(pretrained=True):
+     return _demucs(pretrained, MASTER_64_URL, hidden=64)
+
+
+ def add_model_flags(parser):
+     group = parser.add_mutually_exclusive_group(required=False)
+     group.add_argument("-m", "--model_path", help="Path to local trained model.")
+     group.add_argument("--dns48", action="store_true",
+                        help="Use pre-trained real time H=48 model trained on DNS.")
+     group.add_argument("--dns64", action="store_true",
+                        help="Use pre-trained real time H=64 model trained on DNS.")
+     group.add_argument("--master64", action="store_true",
+                        help="Use pre-trained real time H=64 model trained on DNS and Valentini.")
+
+
+ def get_model(args):
+     """
+     Load local model package or torchhub pre-trained model.
+     """
+     if args.model_path:
+         logger.info("Loading model from %s", args.model_path)
+         model = Demucs(hidden=64)
+         pkg = torch.load(args.model_path, map_location='cpu')
+         model.load_state_dict(pkg)
+     elif args.dns64:
+         logger.info("Loading pre-trained real time H=64 model trained on DNS.")
+         model = dns64()
+     elif args.master64:
+         logger.info("Loading pre-trained real time H=64 model trained on DNS and Valentini.")
+         model = master64()
+     else:
+         logger.info("Loading pre-trained real time H=48 model trained on DNS.")
+         model = dns48()
+     logger.debug(model)
+     return model
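
A minimal sketch of programmatic use (the first call downloads the weights from the URLs above via `torch.hub`):

# Load the real-time H=48 DNS model and run it on a dummy signal.
import torch
from denoiser.pretrained import dns48

model = dns48()
model.eval()
with torch.no_grad():
    out = model(torch.randn(1, 1, 16_000))
print(out.shape)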
denoiser/resample.py ADDED
@@ -0,0 +1,75 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ # author: adefossez
+
+ import math
+
+ import torch as th
+ from torch.nn import functional as F
+
+
+ def sinc(t):
+     """sinc.
+
+     :param t: the input tensor
+     """
+     return th.where(t == 0, th.tensor(1., device=t.device, dtype=t.dtype), th.sin(t) / t)
+
+
+ def kernel_upsample2(zeros=56):
+     """kernel_upsample2.
+
+     Compute the windowed sinc kernel used to upsample by a factor of 2.
+     """
+     win = th.hann_window(4 * zeros + 1, periodic=False)
+     winodd = win[1::2]
+     t = th.linspace(-zeros + 0.5, zeros - 0.5, 2 * zeros)
+     t *= math.pi
+     kernel = (sinc(t) * winodd).view(1, 1, -1)
+     return kernel
+
+
+ def upsample2(x, zeros=56):
+     """
+     Upsampling the input by 2 using sinc interpolation.
+     Smith, Julius, and Phil Gossett. "A flexible sampling-rate conversion method."
+     ICASSP'84. IEEE International Conference on Acoustics, Speech, and Signal Processing.
+     Vol. 9. IEEE, 1984.
+     """
+     *other, time = x.shape
+     kernel = kernel_upsample2(zeros).to(x)
+     out = F.conv1d(x.view(-1, 1, time), kernel, padding=zeros)[..., 1:].view(*other, time)
+     y = th.stack([x, out], dim=-1)
+     return y.view(*other, -1)
+
+
+ def kernel_downsample2(zeros=56):
+     """kernel_downsample2.
+
+     Compute the windowed sinc kernel used to downsample by a factor of 2.
+     """
+     win = th.hann_window(4 * zeros + 1, periodic=False)
+     winodd = win[1::2]
+     t = th.linspace(-zeros + 0.5, zeros - 0.5, 2 * zeros)
+     t.mul_(math.pi)
+     kernel = (sinc(t) * winodd).view(1, 1, -1)
+     return kernel
+
+
+ def downsample2(x, zeros=56):
+     """
+     Downsampling the input by 2 using sinc interpolation.
+     Smith, Julius, and Phil Gossett. "A flexible sampling-rate conversion method."
+     ICASSP'84. IEEE International Conference on Acoustics, Speech, and Signal Processing.
+     Vol. 9. IEEE, 1984.
+     """
+     if x.shape[-1] % 2 != 0:
+         x = F.pad(x, (0, 1))
+     xeven = x[..., ::2]
+     xodd = x[..., 1::2]
+     *other, time = xodd.shape
+     kernel = kernel_downsample2(zeros).to(x)
+     out = xeven + F.conv1d(xodd.view(-1, 1, time), kernel, padding=zeros)[..., :-1].view(
+         *other, time)
+     return out.view(*other, -1).mul(0.5)
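
Since both directions use the same finite windowed-sinc kernel, upsampling followed by downsampling recovers the input only approximately. A quick round-trip sketch:

# Up- then downsample by 2 and measure the reconstruction error.
import torch as th
from denoiser.resample import upsample2, downsample2

x = th.randn(1, 1, 4096)
y = upsample2(x)             # (..., 8192)
z = downsample2(y)           # (..., 4096)
print((x - z).abs().max())   # small, but not exactly zero (finite sinc kernel)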
denoiser/solver.py ADDED
@@ -0,0 +1,233 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ # author: adiyoss
+
+ import json
+ import logging
+ from pathlib import Path
+ import os
+ import time
+
+ import torch
+ import torch.nn.functional as F
+
+ from . import augment, distrib, pretrained
+ from .enhance import enhance
+ from .evaluate import evaluate
+ from .stft_loss import MultiResolutionSTFTLoss
+ from .utils import bold, copy_state, pull_metric, serialize_model, swap_state, LogProgress
+
+ logger = logging.getLogger(__name__)
+
+
+ class Solver(object):
+     def __init__(self, data, model, optimizer, args):
+         self.tr_loader = data['tr_loader']
+         self.cv_loader = data['cv_loader']
+         self.tt_loader = data['tt_loader']
+         self.model = model
+         self.dmodel = distrib.wrap(model)
+         self.optimizer = optimizer
+
+         # data augment
+         augments = []
+         if args.remix:
+             augments.append(augment.Remix())
+         if args.bandmask:
+             augments.append(augment.BandMask(args.bandmask, sample_rate=args.sample_rate))
+         if args.shift:
+             augments.append(augment.Shift(args.shift, args.shift_same))
+         if args.revecho:
+             augments.append(
+                 augment.RevEcho(args.revecho))
+         self.augment = torch.nn.Sequential(*augments)
+
+         # Training config
+         self.device = args.device
+         self.epochs = args.epochs
+
+         # Checkpoints
+         self.continue_from = args.continue_from
+         self.eval_every = args.eval_every
+         self.checkpoint = args.checkpoint
+         if self.checkpoint:
+             self.checkpoint_file = Path(args.checkpoint_file)
+             self.best_file = Path(args.best_file)
+             logger.debug("Checkpoint will be saved to %s", self.checkpoint_file.resolve())
+         self.history_file = args.history_file
+
+         self.best_state = None
+         self.restart = args.restart
+         self.history = []  # Keep track of loss
+         self.samples_dir = args.samples_dir  # Where to save samples
+         self.num_prints = args.num_prints  # Number of times to log per epoch
+         self.args = args
+         self.mrstftloss = MultiResolutionSTFTLoss(factor_sc=args.stft_sc_factor,
+                                                   factor_mag=args.stft_mag_factor)
+         self._reset()
+
+     def _serialize(self):
+         package = {}
+         package['model'] = serialize_model(self.model)
+         package['optimizer'] = self.optimizer.state_dict()
+         package['history'] = self.history
+         package['best_state'] = self.best_state
+         package['args'] = self.args
+         tmp_path = str(self.checkpoint_file) + ".tmp"
+         torch.save(package, tmp_path)
+         # Renaming is sort of atomic on UNIX (not really true on NFS),
+         # but it still reduces the chances of leaving a half-written checkpoint behind.
+         os.rename(tmp_path, self.checkpoint_file)
+
+         # Saving only the latest best model.
+         model = package['model']
+         model['state'] = self.best_state
+         tmp_path = str(self.best_file) + ".tmp"
+         torch.save(model, tmp_path)
+         os.rename(tmp_path, self.best_file)
+
+     def _reset(self):
+         """_reset."""
+         load_from = None
+         load_best = False
+         keep_history = True
+         # Reset
+         if self.checkpoint and self.checkpoint_file.exists() and not self.restart:
+             load_from = self.checkpoint_file
+         elif self.continue_from:
+             load_from = self.continue_from
+             load_best = self.args.continue_best
+             keep_history = False
+
+         if load_from:
+             logger.info(f'Loading checkpoint model: {load_from}')
+             package = torch.load(load_from, 'cpu')
+             if load_best:
+                 self.model.load_state_dict(package['best_state'])
+             else:
+                 self.model.load_state_dict(package['model']['state'])
+             if 'optimizer' in package and not load_best:
+                 self.optimizer.load_state_dict(package['optimizer'])
+             if keep_history:
+                 self.history = package['history']
+             self.best_state = package['best_state']
+         continue_pretrained = self.args.continue_pretrained
+         if continue_pretrained:
+             logger.info("Fine tuning from pre-trained model %s", continue_pretrained)
+             model = getattr(pretrained, self.args.continue_pretrained)()
+             self.model.load_state_dict(model.state_dict())
+
+     def train(self):
+         # Optimizing the model
+         if self.history:
+             logger.info("Replaying metrics from previous run")
+             for epoch, metrics in enumerate(self.history):
+                 info = " ".join(f"{k.capitalize()}={v:.5f}" for k, v in metrics.items())
+                 logger.info(f"Epoch {epoch + 1}: {info}")
+
+         for epoch in range(len(self.history), self.epochs):
+             # Train one epoch
+             self.model.train()
+             start = time.time()
+             logger.info('-' * 70)
+             logger.info("Training...")
+             train_loss = self._run_one_epoch(epoch)
+             logger.info(
+                 bold(f'Train Summary | End of Epoch {epoch + 1} | '
+                      f'Time {time.time() - start:.2f}s | Train Loss {train_loss:.5f}'))
+
+             if self.cv_loader:
+                 # Cross validation
+                 logger.info('-' * 70)
+                 logger.info('Cross validation...')
+                 self.model.eval()
+                 with torch.no_grad():
+                     valid_loss = self._run_one_epoch(epoch, cross_valid=True)
+                 logger.info(
+                     bold(f'Valid Summary | End of Epoch {epoch + 1} | '
+                          f'Time {time.time() - start:.2f}s | Valid Loss {valid_loss:.5f}'))
+             else:
+                 valid_loss = 0
+
+             best_loss = min(pull_metric(self.history, 'valid') + [valid_loss])
+             metrics = {'train': train_loss, 'valid': valid_loss, 'best': best_loss}
+             # Save the best model
+             if valid_loss == best_loss:
+                 logger.info(bold('New best valid loss %.4f'), valid_loss)
+                 self.best_state = copy_state(self.model.state_dict())
+
+             # evaluate and enhance samples every 'eval_every' epochs,
+             # and also on the last epoch
+             if (epoch + 1) % self.eval_every == 0 or epoch == self.epochs - 1:
+                 # Evaluate on the testset
+                 logger.info('-' * 70)
+                 logger.info('Evaluating on the test set...')
+                 # We switch to the best known model for testing
+                 with swap_state(self.model, self.best_state):
+                     pesq, stoi = evaluate(self.args, self.model, self.tt_loader)
+
+                 metrics.update({'pesq': pesq, 'stoi': stoi})
+
+                 # enhance some samples
+                 logger.info('Enhance and save samples...')
+                 enhance(self.args, self.model, self.samples_dir)
+
+             self.history.append(metrics)
+             info = " | ".join(f"{k.capitalize()} {v:.5f}" for k, v in metrics.items())
+             logger.info('-' * 70)
+             logger.info(bold(f"Overall Summary | Epoch {epoch + 1} | {info}"))
+
+             if distrib.rank == 0:
+                 json.dump(self.history, open(self.history_file, "w"), indent=2)
+                 # Save model each epoch
+                 if self.checkpoint:
+                     self._serialize()
+                     logger.debug("Checkpoint saved to %s", self.checkpoint_file.resolve())
+
+     def _run_one_epoch(self, epoch, cross_valid=False):
+         total_loss = 0
+         data_loader = self.tr_loader if not cross_valid else self.cv_loader
+
+         # get a different order for distributed training, otherwise this will get ignored
+         data_loader.epoch = epoch
+
+         label = ["Train", "Valid"][cross_valid]
+         name = label + f" | Epoch {epoch + 1}"
+         logprog = LogProgress(logger, data_loader, updates=self.num_prints, name=name)
+         for i, data in enumerate(logprog):
+             noisy, clean = [x.to(self.device) for x in data]
+             if not cross_valid:
+                 sources = torch.stack([noisy - clean, clean])
+                 sources = self.augment(sources)
+                 noise, clean = sources
+                 noisy = noise + clean
+             estimate = self.dmodel(noisy)
+             # apply a loss function after each layer
+             with torch.autograd.set_detect_anomaly(True):
+                 if self.args.loss == 'l1':
+                     loss = F.l1_loss(clean, estimate)
+                 elif self.args.loss == 'l2':
+                     loss = F.mse_loss(clean, estimate)
+                 elif self.args.loss == 'huber':
+                     loss = F.smooth_l1_loss(clean, estimate)
+                 else:
+                     raise ValueError(f"Invalid loss {self.args.loss}")
+                 # MultiResolution STFT loss
+                 if self.args.stft_loss:
+                     sc_loss, mag_loss = self.mrstftloss(estimate.squeeze(1), clean.squeeze(1))
+                     loss += sc_loss + mag_loss
+
+             # optimize model in training mode
+             if not cross_valid:
+                 self.optimizer.zero_grad()
+                 loss.backward()
+                 self.optimizer.step()
+
+             total_loss += loss.item()
+             logprog.update(loss=format(total_loss / (i + 1), ".5f"))
+             # Just in case, clear some memory
+             del loss, estimate
+         return distrib.average([total_loss / (i + 1)], i + 1)[0]
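
One subtle step in `_run_one_epoch` is that augmentation operates on a stacked (noise, clean) pair and the noisy mixture is rebuilt afterwards, so the pair stays consistent. Restated as a standalone sketch (with random tensors in place of real batches):

# The train-time recombination used above, in isolation.
import torch

noisy = torch.randn(4, 1, 16000)
clean = torch.randn(4, 1, 16000)
sources = torch.stack([noisy - clean, clean])  # (2, B, C, T)
# self.augment(sources) would remix/shift/band-mask here, keeping the stack aligned
noise, clean = sources
noisy = noise + clean  # consistent re-mixed pair fed to the model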
denoiser/stft_loss.py ADDED
@@ -0,0 +1,144 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # Original copyright 2019 Tomoki Hayashi
+ # MIT License (https://opensource.org/licenses/MIT)
+
+ """STFT-based Loss modules."""
+
+ import torch
+ import torch.nn.functional as F
+
+
+ def stft(x, fft_size, hop_size, win_length, window):
+     """Perform STFT and convert to magnitude spectrogram.
+     Args:
+         x (Tensor): Input signal tensor (B, T).
+         fft_size (int): FFT size.
+         hop_size (int): Hop size.
+         win_length (int): Window length.
+         window (Tensor): Window tensor.
+     Returns:
+         Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
+     """
+     x_stft = torch.stft(x, fft_size, hop_size, win_length, window)
+     real = x_stft[..., 0]
+     imag = x_stft[..., 1]
+
+     # NOTE(kan-bayashi): clamp is needed to avoid nan or inf
+     return torch.sqrt(torch.clamp(real ** 2 + imag ** 2, min=1e-7)).transpose(2, 1)
+
+
+ class SpectralConvergengeLoss(torch.nn.Module):
+     """Spectral convergence loss module."""
+
+     def __init__(self):
+         """Initialize spectral convergence loss module."""
+         super(SpectralConvergengeLoss, self).__init__()
+
+     def forward(self, x_mag, y_mag):
+         """Calculate forward propagation.
+         Args:
+             x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
+             y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
+         Returns:
+             Tensor: Spectral convergence loss value.
+         """
+         return torch.norm(y_mag - x_mag, p="fro") / torch.norm(y_mag, p="fro")
+
+
+ class LogSTFTMagnitudeLoss(torch.nn.Module):
+     """Log STFT magnitude loss module."""
+
+     def __init__(self):
+         """Initialize log STFT magnitude loss module."""
+         super(LogSTFTMagnitudeLoss, self).__init__()
+
+     def forward(self, x_mag, y_mag):
+         """Calculate forward propagation.
+         Args:
+             x_mag (Tensor): Magnitude spectrogram of predicted signal (B, #frames, #freq_bins).
+             y_mag (Tensor): Magnitude spectrogram of groundtruth signal (B, #frames, #freq_bins).
+         Returns:
+             Tensor: Log STFT magnitude loss value.
+         """
+         return F.l1_loss(torch.log(y_mag), torch.log(x_mag))
+
+
+ class STFTLoss(torch.nn.Module):
+     """STFT loss module."""
+
+     def __init__(self, fft_size=1024, shift_size=120, win_length=600, window="hann_window"):
+         """Initialize STFT loss module."""
+         super(STFTLoss, self).__init__()
+         self.fft_size = fft_size
+         self.shift_size = shift_size
+         self.win_length = win_length
+         self.window = getattr(torch, window)(win_length)
+         self.spectral_convergenge_loss = SpectralConvergengeLoss()
+         self.log_stft_magnitude_loss = LogSTFTMagnitudeLoss()
+
+     def forward(self, x, y):
+         """Calculate forward propagation.
+         Args:
+             x (Tensor): Predicted signal (B, T).
+             y (Tensor): Groundtruth signal (B, T).
+         Returns:
+             Tensor: Spectral convergence loss value.
+             Tensor: Log STFT magnitude loss value.
+         """
+         x_mag = stft(x, self.fft_size, self.shift_size, self.win_length, self.window)
+         y_mag = stft(y, self.fft_size, self.shift_size, self.win_length, self.window)
+         sc_loss = self.spectral_convergenge_loss(x_mag, y_mag)
+         mag_loss = self.log_stft_magnitude_loss(x_mag, y_mag)
+
+         return sc_loss, mag_loss
+
+
+ class MultiResolutionSTFTLoss(torch.nn.Module):
+     """Multi resolution STFT loss module."""
+
+     def __init__(self,
+                  fft_sizes=[1024, 2048, 512],
+                  hop_sizes=[120, 240, 50],
+                  win_lengths=[600, 1200, 240],
+                  window="hann_window", factor_sc=0.1, factor_mag=0.1):
+         """Initialize Multi resolution STFT loss module.
+         Args:
+             fft_sizes (list): List of FFT sizes.
+             hop_sizes (list): List of hop sizes.
+             win_lengths (list): List of window lengths.
+             window (str): Window function type.
+             factor_sc (float): Balancing factor for the spectral convergence loss.
+             factor_mag (float): Balancing factor for the log magnitude loss.
+         """
+         super(MultiResolutionSTFTLoss, self).__init__()
+         assert len(fft_sizes) == len(hop_sizes) == len(win_lengths)
+         self.stft_losses = torch.nn.ModuleList()
+         for fs, ss, wl in zip(fft_sizes, hop_sizes, win_lengths):
+             self.stft_losses += [STFTLoss(fs, ss, wl, window)]
+         self.factor_sc = factor_sc
+         self.factor_mag = factor_mag
+
+     def forward(self, x, y):
+         """Calculate forward propagation.
+         Args:
+             x (Tensor): Predicted signal (B, T).
+             y (Tensor): Groundtruth signal (B, T).
+         Returns:
+             Tensor: Multi resolution spectral convergence loss value.
+             Tensor: Multi resolution log STFT magnitude loss value.
+         """
+         sc_loss = 0.0
+         mag_loss = 0.0
+         for f in self.stft_losses:
+             sc_l, mag_l = f(x, y)
+             sc_loss += sc_l
+             mag_loss += mag_l
+         sc_loss /= len(self.stft_losses)
+         mag_loss /= len(self.stft_losses)
+
+         return self.factor_sc * sc_loss, self.factor_mag * mag_loss
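
In `solver.py` the two returned terms are simply added on top of the waveform loss. A minimal sketch of that combination (assuming a PyTorch version where `torch.stft` still returns real/imaginary pairs, as the `stft` helper above expects):

# Combine the multi-resolution STFT terms with an L1 waveform loss.
import torch
from denoiser.stft_loss import MultiResolutionSTFTLoss

mrstft = MultiResolutionSTFTLoss(factor_sc=0.1, factor_mag=0.1)
estimate = torch.randn(4, 16000, requires_grad=True)  # (B, T)
clean = torch.randn(4, 16000)
sc_loss, mag_loss = mrstft(estimate, clean)
loss = torch.nn.functional.l1_loss(estimate, clean) + sc_loss + mag_loss
loss.backward()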
denoiser/utils.py ADDED
@@ -0,0 +1,165 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ # author: adefossez
+
+ import functools
+ import logging
+ from contextlib import contextmanager
+ import inspect
+ import time
+
+ logger = logging.getLogger(__name__)
+
+
+ def capture_init(init):
+     """capture_init.
+
+     Decorate `__init__` with this, and you can then
+     recover the *args and **kwargs passed to it in `self._init_args_kwargs`
+     """
+     @functools.wraps(init)
+     def __init__(self, *args, **kwargs):
+         self._init_args_kwargs = (args, kwargs)
+         init(self, *args, **kwargs)
+
+     return __init__
+
+
+ def deserialize_model(package, strict=False):
+     """deserialize_model.
+
+     Rebuild a model from the package saved by `serialize_model`.
+     """
+     klass = package['class']
+     if strict:
+         model = klass(*package['args'], **package['kwargs'])
+     else:
+         sig = inspect.signature(klass)
+         kw = package['kwargs']
+         for key in list(kw):
+             if key not in sig.parameters:
+                 logger.warning("Dropping nonexistent parameter %s", key)
+                 del kw[key]
+         model = klass(*package['args'], **kw)
+     model.load_state_dict(package['state'])
+     return model
+
+
+ def copy_state(state):
+     return {k: v.cpu().clone() for k, v in state.items()}
+
+
+ def serialize_model(model):
+     args, kwargs = model._init_args_kwargs
+     state = copy_state(model.state_dict())
+     return {"class": model.__class__, "args": args, "kwargs": kwargs, "state": state}
+
+
+ @contextmanager
+ def swap_state(model, state):
+     """
+     Context manager that swaps the state of a model, e.g:
+
+         # model is in old state
+         with swap_state(model, new_state):
+             # model in new state
+         # model back to old state
+     """
+     old_state = copy_state(model.state_dict())
+     model.load_state_dict(state)
+     try:
+         yield
+     finally:
+         model.load_state_dict(old_state)
+
+
+ def pull_metric(history, name):
+     out = []
+     for metrics in history:
+         if name in metrics:
+             out.append(metrics[name])
+     return out
+
+
+ class LogProgress:
+     """
+     Sort of like tqdm but using log lines and not as real time.
+     Args:
+         - logger: logger obtained from `logging.getLogger`,
+         - iterable: iterable object to wrap
+         - updates (int): number of lines that will be printed, e.g.
+             if `updates=5`, log every 1/5th of the total length.
+         - total (int): length of the iterable, in case it does not support
+             `len`.
+         - name (str): prefix to use in the log.
+         - level: logging level (like `logging.INFO`).
+     """
+     def __init__(self,
+                  logger,
+                  iterable,
+                  updates=5,
+                  total=None,
+                  name="LogProgress",
+                  level=logging.INFO):
+         self.iterable = iterable
+         self.total = total or len(iterable)
+         self.updates = updates
+         self.name = name
+         self.logger = logger
+         self.level = level
+
+     def update(self, **infos):
+         self._infos = infos
+
+     def __iter__(self):
+         self._iterator = iter(self.iterable)
+         self._index = -1
+         self._infos = {}
+         self._begin = time.time()
+         return self
+
+     def __next__(self):
+         self._index += 1
+         try:
+             value = next(self._iterator)
+         except StopIteration:
+             raise
+         else:
+             return value
+         finally:
+             log_every = max(1, self.total // self.updates)
+             # logging is delayed by 1 it, in order to have the metrics from update
+             if self._index >= 1 and self._index % log_every == 0:
+                 self._log()
+
+     def _log(self):
+         self._speed = (1 + self._index) / (time.time() - self._begin)
+         infos = " | ".join(f"{k.capitalize()} {v}" for k, v in self._infos.items())
+         if self._speed < 1e-4:
+             speed = "oo sec/it"
+         elif self._speed < 0.1:
+             speed = f"{1 / self._speed:.1f} sec/it"
+         else:
+             speed = f"{self._speed:.1f} it/sec"
+         out = f"{self.name} | {self._index}/{self.total} | {speed}"
+         if infos:
+             out += " | " + infos
+         self.logger.log(self.level, out)
+
+
+ def colorize(text, color):
+     """
+     Display text with some ANSI color in the terminal.
+     """
+     code = f"\033[{color}m"
+     restore = "\033[0m"
+     return "".join([code, text, restore])
+
+
+ def bold(text):
+     """
+     Display text in bold in the terminal.
+     """
+     return colorize(text, "1")
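
`LogProgress` and `bold` are used throughout the training and evaluation scripts; a small self-contained sketch of wrapping an arbitrary iterable:

# Periodic log lines instead of a live progress bar.
import logging
from denoiser.utils import LogProgress, bold

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("demo")
logprog = LogProgress(logger, range(100), updates=5, name="Demo")
for step in logprog:
    logprog.update(step=step)  # shown on the next logged line
print(bold("done"))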