Upload 16 files

- DPTNet_eval/DPTNet_quant_sep.py +108 -0
- DPTNet_eval/asteroid_test/__init__.py +19 -0
- DPTNet_eval/asteroid_test/dsp/__init__.py +5 -0
- DPTNet_eval/asteroid_test/dsp/overlap_add.py +317 -0
- DPTNet_eval/asteroid_test/filterbanks/__init__.py +107 -0
- DPTNet_eval/asteroid_test/filterbanks/enc_dec.py +267 -0
- DPTNet_eval/asteroid_test/filterbanks/free_fb.py +33 -0
- DPTNet_eval/asteroid_test/masknn/__init__.py +12 -0
- DPTNet_eval/asteroid_test/masknn/activations.py +82 -0
- DPTNet_eval/asteroid_test/masknn/attention.py +271 -0
- DPTNet_eval/asteroid_test/masknn/norms.py +156 -0
- DPTNet_eval/asteroid_test/models/__init__.py +59 -0
- DPTNet_eval/asteroid_test/models/base_models.py +351 -0
- DPTNet_eval/asteroid_test/models/dptnet.py +96 -0
- DPTNet_eval/asteroid_test/utils/__init__.py +9 -0
- DPTNet_eval/asteroid_test/utils/torch_utils.py +126 -0
DPTNet_eval/DPTNet_quant_sep.py
ADDED
@@ -0,0 +1,108 @@
# DPTNet_quant_sep.py

import os
import torch
import numpy as np
import torchaudio
from huggingface_hub import hf_hub_download
from . import asteroid_test

torchaudio.set_audio_backend("sox_io")

def get_conf():
    conf_filterbank = {
        'n_filters': 64,
        'kernel_size': 16,
        'stride': 8
    }

    conf_masknet = {
        'in_chan': 64,
        'n_src': 2,
        'out_chan': 64,
        'ff_hid': 256,
        'ff_activation': "relu",
        'norm_type': "gLN",
        'chunk_size': 100,
        'hop_size': 50,
        'n_repeats': 2,
        'mask_act': 'sigmoid',
        'bidirectional': True,
        'dropout': 0
    }
    return conf_filterbank, conf_masknet


def load_dpt_model():
    print('Load Separation Model...')

    # Get the Hugging Face token from the environment variable
    HF_TOKEN = os.getenv("HF_TOKEN")
    if not HF_TOKEN:
        raise EnvironmentError("The HF_TOKEN environment variable is not set! Run export HF_TOKEN=xxx first.")

    # Download the model weights from the Hugging Face Hub
    model_path = hf_hub_download(
        repo_id="DeepLearning101/speech-separation",  # <- replace with your own repo name
        filename="train_dptnet_aishell_partOverlap_B2_300epoch_quan-int8.p",
        token=HF_TOKEN
    )

    # Get the model configuration
    conf_filterbank, conf_masknet = get_conf()

    # Build the model architecture
    model_class = getattr(asteroid_test, "DPTNet")
    model = model_class(**conf_filterbank, **conf_masknet)

    # Apply the quantization settings
    model = torch.quantization.quantize_dynamic(
        model,
        {torch.nn.LSTM, torch.nn.Linear},
        dtype=torch.qint8
    )

    # Load the weights (ignore mismatched keys)
    state_dict = torch.load(model_path, map_location="cpu")
    model_state_dict = model.state_dict()
    filtered_state_dict = {k: v for k, v in state_dict.items() if k in model_state_dict}
    model.load_state_dict(filtered_state_dict, strict=False)
    model.eval()

    return model


def dpt_sep_process(wav_path, model=None, outfilename=None):
    if model is None:
        model = load_dpt_model()

    x, sr = torchaudio.load(wav_path)
    x = x.cpu()

    with torch.no_grad():
        est_sources = model(x)  # shape: (1, 2, T)

    est_sources = est_sources.squeeze(0)  # shape: (2, T)
    sep_1, sep_2 = est_sources  # split into two (T,) tensors

    # Normalize to the mixture's peak level
    max_abs = x[0].abs().max().item()
    sep_1 = sep_1 * max_abs / sep_1.abs().max().item()
    sep_2 = sep_2 * max_abs / sep_2.abs().max().item()

    # Add a channel dimension, becoming (1, T)
    sep_1 = sep_1.unsqueeze(0)
    sep_2 = sep_2.unsqueeze(0)

    # Save the results
    if outfilename is not None:
        torchaudio.save(outfilename.replace('.wav', '_sep1.wav'), sep_1, sr)
        torchaudio.save(outfilename.replace('.wav', '_sep2.wav'), sep_2, sr)
        torchaudio.save(outfilename.replace('.wav', '_mix.wav'), x, sr)
    else:
        torchaudio.save(wav_path.replace('.wav', '_sep1.wav'), sep_1, sr)
        torchaudio.save(wav_path.replace('.wav', '_sep2.wav'), sep_2, sr)


if __name__ == '__main__':
    print("This module should be used via Flask or Gradio.")
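For reference, a minimal usage sketch of the module above (not part of the commit; the file names are hypothetical, and a valid HF_TOKEN must already be exported):

# Hypothetical driver; assumes HF_TOKEN is set and mix.wav exists.
from DPTNet_eval.DPTNet_quant_sep import load_dpt_model, dpt_sep_process

model = load_dpt_model()  # downloads and quantizes the checkpoint once
dpt_sep_process("mix.wav", model=model, outfilename="result.wav")
# writes result_sep1.wav, result_sep2.wav and result_mix.wav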
DPTNet_eval/asteroid_test/__init__.py
ADDED
@@ -0,0 +1,19 @@
import pathlib

from .models import DPTNet
from .utils import torch_utils  # noqa

project_root = str(pathlib.Path(__file__).expanduser().absolute().parent.parent)
__version__ = "0.3.4"


def show_available_models():
    from .utils.hub_utils import MODELS_URLS_HASHTABLE

    print(" \n".join(list(MODELS_URLS_HASHTABLE.keys())))


__all__ = [
    "DPTNet",
    "show_available_models",
]
DPTNet_eval/asteroid_test/dsp/__init__.py
ADDED
@@ -0,0 +1,5 @@
from .overlap_add import DualPathProcessing

__all__ = [
    "DualPathProcessing",
]
DPTNet_eval/asteroid_test/dsp/overlap_add.py
ADDED
@@ -0,0 +1,317 @@
import torch
from scipy.signal import get_window
# from asteroid_test.losses import PITLossWrapper
from torch import nn

'''
class LambdaOverlapAdd(torch.nn.Module):
    """Overlap-add with lambda transform on segments.

    Segment input signal, apply lambda function (a neural network for example)
    and combine with OLA.

    Args:
        nnet (callable): Function to apply to each segment.
        n_src (int): Number of sources in the output of nnet.
        window_size (int): Size of segmenting window.
        hop_size (int): Segmentation hop size.
        window (str): Name of the window (see scipy.signal.get_window) used
            for the synthesis.
        reorder_chunks (bool): Whether to reorder each consecutive segment.
            This might be useful when `nnet` is permutation invariant, as
            source assignments might change output channel from one segment
            to the next (in classic speech separation for example).
            Reordering is performed based on the correlation between
            the overlapped part of consecutive segments.

    Examples:
        >>> from asteroid_test import ConvTasNet
        >>> nnet = ConvTasNet(n_src=2)
        >>> continuous_nnet = LambdaOverlapAdd(
        >>>     nnet=nnet,
        >>>     n_src=2,
        >>>     window_size=64000,
        >>>     hop_size=None,
        >>>     window="hanning",
        >>>     reorder_chunks=True,
        >>>     enable_grad=False,
        >>> )
        >>> wav = torch.randn(1, 1, 500000)
        >>> out_wavs = continuous_nnet.forward(wav)
    """

    def __init__(
        self,
        nnet,
        n_src,
        window_size,
        hop_size=None,
        window="hanning",
        reorder_chunks=True,
        enable_grad=False,
    ):
        super().__init__()
        assert window_size % 2 == 0, "Window size must be even"

        self.nnet = nnet
        self.window_size = window_size
        self.hop_size = hop_size if hop_size is not None else window_size // 2
        self.n_src = n_src

        if window:
            window = get_window(window, self.window_size).astype("float32")
            window = torch.from_numpy(window)
            self.use_window = True
        else:
            self.use_window = False

        self.register_buffer("window", window)
        self.reorder_chunks = reorder_chunks
        self.enable_grad = enable_grad

    def ola_forward(self, x):
        """Heart of the class: segment signal, apply func, combine with OLA."""

        assert x.ndim == 3

        batch, channels, n_frames = x.size()
        # Overlap and add:
        # [batch, chans, n_frames] -> [batch, chans, win_size, n_chunks]
        unfolded = torch.nn.functional.unfold(
            x.unsqueeze(-1),
            kernel_size=(self.window_size, 1),
            padding=(self.window_size, 0),
            stride=(self.hop_size, 1),
        )

        out = []
        n_chunks = unfolded.shape[-1]
        for frame_idx in range(n_chunks):  # for loop to spare memory
            frame = self.nnet(unfolded[..., frame_idx])
            # user must handle multichannel by reshaping to batch
            if frame_idx == 0:
                assert frame.ndim == 3, "nnet should return (batch, n_src, time)"
                assert frame.shape[1] == self.n_src, "nnet should return (batch, n_src, time)"
            frame = frame.reshape(batch * self.n_src, -1)

            if frame_idx != 0 and self.reorder_chunks:
                # we determine best perm based on xcorr with previous sources
                frame = _reorder_sources(
                    frame, out[-1], self.n_src, self.window_size, self.hop_size
                )

            if self.use_window:
                frame = frame * self.window
            else:
                frame = frame / (self.window_size / self.hop_size)
            out.append(frame)

        out = torch.stack(out).reshape(n_chunks, batch * self.n_src, self.window_size)
        out = out.permute(1, 2, 0)

        out = torch.nn.functional.fold(
            out,
            (n_frames, 1),
            kernel_size=(self.window_size, 1),
            padding=(self.window_size, 0),
            stride=(self.hop_size, 1),
        )
        return out.squeeze(-1).reshape(batch, self.n_src, -1)

    def forward(self, x):
        """Forward module: segment signal, apply func, combine with OLA.

        Args:
            x (:class:`torch.Tensor`): waveform signal of shape (batch, 1, time).

        Returns:
            :class:`torch.Tensor`: The output of the lambda OLA.
        """
        # Here we can do the reshaping
        with torch.autograd.set_grad_enabled(self.enable_grad):
            olad = self.ola_forward(x)
            return olad


def _reorder_sources(
    current: torch.FloatTensor,
    previous: torch.FloatTensor,
    n_src: int,
    window_size: int,
    hop_size: int,
):
    """
    Reorder sources in current chunk to maximize correlation with previous chunk.
    Used for Continuous Source Separation. Standard dsp correlation is used
    for reordering.


    Args:
        current (:class:`torch.Tensor`): current chunk, tensor
            of shape (batch, n_src, window_size)
        previous (:class:`torch.Tensor`): previous chunk, tensor
            of shape (batch, n_src, window_size)
        n_src (:class:`int`): number of sources.
        window_size (:class:`int`): window_size, equal to last dimension of
            both current and previous.
        hop_size (:class:`int`): hop_size between current and previous tensors.

    Returns:
        current:

    """
    batch, frames = current.size()
    current = current.reshape(-1, n_src, frames)
    previous = previous.reshape(-1, n_src, frames)

    overlap_f = window_size - hop_size

    def reorder_func(x, y):
        x = x[..., :overlap_f]
        y = y[..., -overlap_f:]
        # Mean normalization
        x = x - x.mean(-1, keepdim=True)
        y = y - y.mean(-1, keepdim=True)
        # Negative mean Correlation
        return -torch.sum(x.unsqueeze(1) * y.unsqueeze(2), dim=-1)

    # We maximize correlation-like between previous and current.
    pit = PITLossWrapper(reorder_func)
    current = pit(current, previous, return_est=True)[1]
    return current.reshape(batch, frames)
'''


class DualPathProcessing(nn.Module):
    """Perform Dual-Path processing via overlap-add as in DPRNN [1].

    Args:
        chunk_size (int): Size of segmenting window.
        hop_size (int): segmentation hop size.

    References:
        [1] "Dual-path RNN: efficient long sequence modeling for
        time-domain single-channel speech separation", Yi Luo, Zhuo Chen
        and Takuya Yoshioka. https://arxiv.org/abs/1910.06379
    """

    def __init__(self, chunk_size, hop_size):
        super(DualPathProcessing, self).__init__()
        self.chunk_size = chunk_size
        self.hop_size = hop_size
        self.n_orig_frames = None

    def unfold(self, x):
        """Unfold the feature tensor from

        (batch, channels, time) to (batch, channels, chunk_size, n_chunks).

        Args:
            x: (:class:`torch.Tensor`): feature tensor of shape (batch, channels, time).

        Returns:
            x: (:class:`torch.Tensor`): spliced feature tensor of shape
                (batch, channels, chunk_size, n_chunks).

        """
        # x is (batch, chan, frames)
        batch, chan, frames = x.size()
        assert x.ndim == 3
        self.n_orig_frames = x.shape[-1]
        unfolded = torch.nn.functional.unfold(
            x.unsqueeze(-1),
            kernel_size=(self.chunk_size, 1),
            padding=(self.chunk_size, 0),
            stride=(self.hop_size, 1),
        )

        return unfolded.reshape(
            batch, chan, self.chunk_size, -1
        )  # (batch, chan, chunk_size, n_chunks)

    def fold(self, x, output_size=None):
        """Folds back the spliced feature tensor.

        Input shape (batch, channels, chunk_size, n_chunks) to original shape
        (batch, channels, time) using overlap-add.

        Args:
            x: (:class:`torch.Tensor`): spliced feature tensor of shape
                (batch, channels, chunk_size, n_chunks).
            output_size: (int, optional): sequence length of original feature tensor.
                If None, the original length cached by the previous call of `unfold`
                will be used.

        Returns:
            x: (:class:`torch.Tensor`): feature tensor of shape (batch, channels, time).

        .. note:: `fold` caches the original length of the input.

        """
        output_size = output_size if output_size is not None else self.n_orig_frames
        # x is (batch, chan, chunk_size, n_chunks)
        batch, chan, chunk_size, n_chunks = x.size()
        to_unfold = x.reshape(batch, chan * self.chunk_size, n_chunks)
        x = torch.nn.functional.fold(
            to_unfold,
            (output_size, 1),
            kernel_size=(self.chunk_size, 1),
            padding=(self.chunk_size, 0),
            stride=(self.hop_size, 1),
        )

        x /= self.chunk_size / self.hop_size

        return x.reshape(batch, chan, self.n_orig_frames)

    @staticmethod
    def intra_process(x, module):
        """Performs intra-chunk processing.

        Args:
            x (:class:`torch.Tensor`): spliced feature tensor of shape
                (batch, channels, chunk_size, n_chunks).
            module (:class:`torch.nn.Module`): module one wish to apply to each chunk
                of the spliced feature tensor.


        Returns:
            x (:class:`torch.Tensor`): processed spliced feature tensor of shape
                (batch, channels, chunk_size, n_chunks).

        .. note:: the module should have the channel first convention and accept
            a 3D tensor of shape (batch, channels, time).
        """

        # x is (batch, channels, chunk_size, n_chunks)
        batch, channels, chunk_size, n_chunks = x.size()
        # we reshape to batch*chunk_size, channels, n_chunks
        x = x.transpose(1, -1).reshape(batch * n_chunks, chunk_size, channels).transpose(1, -1)
        x = module(x)
        x = x.reshape(batch, n_chunks, channels, chunk_size).transpose(1, -1).transpose(1, 2)
        return x

    @staticmethod
    def inter_process(x, module):
        """Performs inter-chunk processing.

        Args:
            x (:class:`torch.Tensor`): spliced feature tensor of shape
                (batch, channels, chunk_size, n_chunks).
            module (:class:`torch.nn.Module`): module one wish to apply between
                each chunk of the spliced feature tensor.


        Returns:
            x (:class:`torch.Tensor`): processed spliced feature tensor of shape
                (batch, channels, chunk_size, n_chunks).

        .. note:: the module should have the channel first convention and accept
            a 3D tensor of shape (batch, channels, time).
        """

        batch, channels, chunk_size, n_chunks = x.size()
        x = x.transpose(1, 2).reshape(batch * chunk_size, channels, n_chunks)
        x = module(x)
        x = x.reshape(batch, chunk_size, channels, n_chunks).transpose(1, 2)
        return x
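A minimal round-trip sketch of DualPathProcessing (illustrative only, not part of the commit): with 50% overlap each sample is covered by exactly two chunks, and `fold` divides by that overlap factor, so an identity module recovers the input.

import torch
from DPTNet_eval.asteroid_test.dsp.overlap_add import DualPathProcessing

ola = DualPathProcessing(chunk_size=100, hop_size=50)
feats = torch.randn(1, 64, 500)             # (batch, channels, time)
chunks = ola.unfold(feats)                  # (1, 64, 100, n_chunks)
chunks = ola.intra_process(chunks, torch.nn.Identity())
chunks = ola.inter_process(chunks, torch.nn.Identity())
out = ola.fold(chunks)                      # (1, 64, 500), overlap-added back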
DPTNet_eval/asteroid_test/filterbanks/__init__.py
ADDED
@@ -0,0 +1,107 @@
# from .analytic_free_fb import AnalyticFreeFB
from .free_fb import FreeFB
from .enc_dec import Filterbank, Encoder, Decoder


def make_enc_dec(
    fb_name,
    n_filters,
    kernel_size,
    stride=None,
    who_is_pinv=None,
    padding=0,
    output_padding=0,
    **kwargs,
):
    """Creates congruent encoder and decoder from the same filterbank family.

    Args:
        fb_name (str, className): Filterbank family from which to make encoder
            and decoder. To choose among [``'free'``, ``'analytic_free'``,
            ``'param_sinc'``, ``'stft'``]. Can also be a class defined in a
            submodule in this subpackage (e.g. :class:`~.FreeFB`).
        n_filters (int): Number of filters.
        kernel_size (int): Length of the filters.
        stride (int, optional): Stride of the convolution.
            If None (default), set to ``kernel_size // 2``.
        who_is_pinv (str, optional): If `None`, no pseudo-inverse filters will
            be used. If string (among [``'encoder'``, ``'decoder'``]), decides
            which of ``Encoder`` or ``Decoder`` will be the pseudo inverse of
            the other one.
        padding (int): Zero-padding added to both sides of the input.
            Passed to Encoder and Decoder.
        output_padding (int): Additional size added to one side of the output shape.
            Passed to Decoder.
        **kwargs: Arguments which will be passed to the filterbank class
            additionally to the usual `n_filters`, `kernel_size` and `stride`.
            Depends on the filterbank family.
    Returns:
        :class:`.Encoder`, :class:`.Decoder`
    """
    fb_class = get(fb_name)

    if who_is_pinv in ["dec", "decoder"]:
        fb = fb_class(n_filters, kernel_size, stride=stride, **kwargs)
        enc = Encoder(fb, padding=padding)
        # Decoder filterbank is pseudo inverse of encoder filterbank.
        dec = Decoder.pinv_of(fb)
    elif who_is_pinv in ["enc", "encoder"]:
        fb = fb_class(n_filters, kernel_size, stride=stride, **kwargs)
        dec = Decoder(fb, padding=padding, output_padding=output_padding)
        # Encoder filterbank is pseudo inverse of decoder filterbank.
        enc = Encoder.pinv_of(fb)
    else:
        fb = fb_class(n_filters, kernel_size, stride=stride, **kwargs)
        enc = Encoder(fb, padding=padding)
        # Filters between encoder and decoder should not be shared.
        fb = fb_class(n_filters, kernel_size, stride=stride, **kwargs)
        dec = Decoder(fb, padding=padding, output_padding=output_padding)
    return enc, dec


def register_filterbank(custom_fb):
    """Register a custom filterbank, gettable with `filterbanks.get`.

    Args:
        custom_fb: Custom filterbank to register.

    """
    if custom_fb.__name__ in globals().keys() or custom_fb.__name__.lower() in globals().keys():
        raise ValueError(f"Filterbank {custom_fb.__name__} already exists. Choose another name.")
    globals().update({custom_fb.__name__: custom_fb})


def get(identifier):
    """Returns a filterbank class from a string. Returns its input if it
    is callable (already a :class:`.Filterbank` for example).

    Args:
        identifier (str or Callable or None): the filterbank identifier.

    Returns:
        :class:`.Filterbank` or None
    """
    if identifier is None:
        return None
    elif callable(identifier):
        return identifier
    elif isinstance(identifier, str):
        cls = globals().get(identifier)
        if cls is None:
            raise ValueError("Could not interpret filterbank identifier: " + str(identifier))
        return cls
    else:
        raise ValueError("Could not interpret filterbank identifier: " + str(identifier))


# Aliases.
free = FreeFB

# For the docs
__all__ = [
    "Filterbank",
    "Encoder",
    "Decoder",
    "FreeFB",
    "make_enc_dec",
]
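An illustrative sketch of make_enc_dec with the free filterbank (not part of the commit; shapes follow standard conv1d arithmetic):

import torch
from DPTNet_eval.asteroid_test.filterbanks import make_enc_dec

enc, dec = make_enc_dec("free", n_filters=64, kernel_size=16, stride=8)
wav = torch.randn(1, 1, 8000)   # (batch, 1, time)
tf = enc(wav)                   # (1, 64, 999): conv1d with stride 8
rec = dec(tf)                   # (1, 1, 8000): transposed-conv overlap-add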
DPTNet_eval/asteroid_test/filterbanks/enc_dec.py
ADDED
@@ -0,0 +1,267 @@
import warnings
import torch
from torch import nn
from torch.nn import functional as F


class Filterbank(nn.Module):
    """Base Filterbank class.
    Each subclass has to implement a `filters` property.

    Args:
        n_filters (int): Number of filters.
        kernel_size (int): Length of the filters.
        stride (int, optional): Stride of the conv or transposed conv. (Hop size).
            If None (default), set to ``kernel_size // 2``.

    Attributes:
        n_feats_out (int): Number of output filters.
    """

    def __init__(self, n_filters, kernel_size, stride=None):
        super(Filterbank, self).__init__()
        self.n_filters = n_filters
        self.kernel_size = kernel_size
        self.stride = stride if stride else self.kernel_size // 2
        # If not specified otherwise in the filterbank's init, output
        # number of features is equal to number of required filters.
        self.n_feats_out = n_filters

    @property
    def filters(self):
        """ Abstract method for filters. """
        raise NotImplementedError

    def get_config(self):
        """ Returns dictionary of arguments to re-instantiate the class. """
        config = {
            "fb_name": self.__class__.__name__,
            "n_filters": self.n_filters,
            "kernel_size": self.kernel_size,
            "stride": self.stride,
        }
        return config


class _EncDec(nn.Module):
    """Base private class for Encoder and Decoder.

    Common parameters and methods.

    Args:
        filterbank (:class:`Filterbank`): Filterbank instance. The filterbank
            to use as an encoder or a decoder.
        is_pinv (bool): Whether to be the pseudo inverse of filterbank.

    Attributes:
        filterbank (:class:`Filterbank`)
        stride (int)
        is_pinv (bool)
    """

    def __init__(self, filterbank, is_pinv=False):
        super(_EncDec, self).__init__()
        self.filterbank = filterbank
        self.stride = self.filterbank.stride
        self.is_pinv = is_pinv

    @property
    def filters(self):
        return self.filterbank.filters

    def compute_filter_pinv(self, filters):
        """ Computes pseudo inverse filterbank of given filters."""
        scale = self.filterbank.stride / self.filterbank.kernel_size
        shape = filters.shape
        ifilt = torch.pinverse(filters.squeeze()).transpose(-1, -2).view(shape)
        # Compensate for the overlap-add.
        return ifilt * scale

    def get_filters(self):
        """ Returns filters or pinv filters depending on `is_pinv` attribute """
        if self.is_pinv:
            return self.compute_filter_pinv(self.filters)
        else:
            return self.filters

    def get_config(self):
        """ Returns dictionary of arguments to re-instantiate the class."""
        config = {"is_pinv": self.is_pinv}
        base_config = self.filterbank.get_config()
        return dict(list(base_config.items()) + list(config.items()))


class Encoder(_EncDec):
    """Encoder class.

    Add encoding methods to Filterbank classes.
    Not intended to be subclassed.

    Args:
        filterbank (:class:`Filterbank`): The filterbank to use
            as an encoder.
        is_pinv (bool): Whether to be the pseudo inverse of filterbank.
        as_conv1d (bool): Whether to behave like nn.Conv1d.
            If True (default), forwarding input with shape (batch, 1, time)
            will output a tensor of shape (batch, freq, conv_time).
            If False, will output a tensor of shape (batch, 1, freq, conv_time).
        padding (int): Zero-padding added to both sides of the input.

    """

    def __init__(self, filterbank, is_pinv=False, as_conv1d=True, padding=0):
        super(Encoder, self).__init__(filterbank, is_pinv=is_pinv)
        self.as_conv1d = as_conv1d
        self.n_feats_out = self.filterbank.n_feats_out
        self.padding = padding

    @classmethod
    def pinv_of(cls, filterbank, **kwargs):
        """Returns an :class:`~.Encoder`, pseudo inverse of a
        :class:`~.Filterbank` or :class:`~.Decoder`."""
        if isinstance(filterbank, Filterbank):
            return cls(filterbank, is_pinv=True, **kwargs)
        elif isinstance(filterbank, Decoder):
            return cls(filterbank.filterbank, is_pinv=True, **kwargs)

    def forward(self, waveform):
        """Convolve input waveform with the filters from a filterbank.
        Args:
            waveform (:class:`torch.Tensor`): any tensor with samples along the
                last dimension; any batch/channel etc. dimensions come first.
        Returns:
            :class:`torch.Tensor`: The corresponding TF domain signal.

        Shapes:
            >>> (time, ) --> (freq, conv_time)
            >>> (batch, time) --> (batch, freq, conv_time)  # Avoid
            >>> if as_conv1d:
            >>>     (batch, 1, time) --> (batch, freq, conv_time)
            >>>     (batch, chan, time) --> (batch, chan, freq, conv_time)
            >>> else:
            >>>     (batch, chan, time) --> (batch, chan, freq, conv_time)
            >>> (batch, any, dim, time) --> (batch, any, dim, freq, conv_time)
        """
        filters = self.get_filters()
        if waveform.ndim == 1:
            # Assumes 1D input with shape (time,)
            # Output will be (freq, conv_time)
            return F.conv1d(
                waveform[None, None], filters, stride=self.stride, padding=self.padding
            ).squeeze()
        elif waveform.ndim == 2:
            # Assume 2D input with shape (batch or channels, time)
            # Output will be (batch or channels, freq, conv_time)
            warnings.warn(
                "Input tensor was 2D. Applying the corresponding "
                "Decoder to the current output will result in a 3D "
                "tensor. This behaviour was introduced to match "
                "Conv1D and ConvTranspose1D, please use 3D inputs "
                "to avoid it. For example, this can be done with "
                "input_tensor.unsqueeze(1)."
            )
            return F.conv1d(
                waveform.unsqueeze(1), filters, stride=self.stride, padding=self.padding
            )
        elif waveform.ndim == 3:
            batch, channels, time_len = waveform.shape
            if channels == 1 and self.as_conv1d:
                # That's the common single channel case (batch, 1, time)
                # Output will be (batch, freq, stft_time), behaves as Conv1D
                return F.conv1d(waveform, filters, stride=self.stride, padding=self.padding)
            else:
                # Return batched convolution, input is (batch, 3, time),
                # output will be (batch, 3, freq, conv_time).
                # Useful for multichannel transforms
                # If as_conv1d is false, (batch, 1, time) will output
                # (batch, 1, freq, conv_time), useful for consistency.
                return self.batch_1d_conv(waveform, filters)
        else:  # waveform.ndim > 3
            # This is to compute "multi"multichannel convolution.
            # Input can be (*, time), output will be (*, freq, conv_time)
            return self.batch_1d_conv(waveform, filters)

    def batch_1d_conv(self, inp, filters):
        # Here we perform multichannel / multi-source convolution.
        # Output should be (batch, channels, freq, conv_time)
        batched_conv = F.conv1d(
            inp.view(-1, 1, inp.shape[-1]), filters, stride=self.stride, padding=self.padding
        )
        output_shape = inp.shape[:-1] + batched_conv.shape[-2:]
        return batched_conv.view(output_shape)


class Decoder(_EncDec):
    """Decoder class.

    Add decoding methods to Filterbank classes.
    Not intended to be subclassed.

    Args:
        filterbank (:class:`Filterbank`): The filterbank to use as a decoder.
        is_pinv (bool): Whether to be the pseudo inverse of filterbank.
        padding (int): Zero-padding added to both sides of the input.
        output_padding (int): Additional size added to one side of the
            output shape.

    Notes:
        `padding` and `output_padding` arguments are directly passed to
        F.conv_transpose1d.
    """

    def __init__(self, filterbank, is_pinv=False, padding=0, output_padding=0):
        super().__init__(filterbank, is_pinv=is_pinv)
        self.padding = padding
        self.output_padding = output_padding

    @classmethod
    def pinv_of(cls, filterbank):
        """ Returns a Decoder, pseudo inverse of a filterbank or Encoder."""
        if isinstance(filterbank, Filterbank):
            return cls(filterbank, is_pinv=True)
        elif isinstance(filterbank, Encoder):
            return cls(filterbank.filterbank, is_pinv=True)

    def forward(self, spec):
        """Applies transposed convolution to a TF representation.

        This is equivalent to overlap-add.

        Args:
            spec (:class:`torch.Tensor`): 3D or 4D Tensor. The TF
                representation. (Output of :func:`Encoder.forward`).
        Returns:
            :class:`torch.Tensor`: The corresponding time domain signal.
        """
        filters = self.get_filters()
        if spec.ndim == 2:
            # Input is (freq, conv_time), output is (time)
            return F.conv_transpose1d(
                spec.unsqueeze(0),
                filters,
                stride=self.stride,
                padding=self.padding,
                output_padding=self.output_padding,
            ).squeeze()
        if spec.ndim == 3:
            # Input is (batch, freq, conv_time), output is (batch, 1, time)
            return F.conv_transpose1d(
                spec,
                filters,
                stride=self.stride,
                padding=self.padding,
                output_padding=self.output_padding,
            )
        elif spec.ndim > 3:
            # Multiply all the left dimensions together and group them in the
            # batch. Make the convolution and restore.
            view_as = (-1,) + spec.shape[-2:]
            out = F.conv_transpose1d(
                spec.view(view_as),
                filters,
                stride=self.stride,
                padding=self.padding,
                output_padding=self.output_padding,
            )
            return out.view(spec.shape[:-2] + (-1,))
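And a sketch of the pseudo-inverse pairing handled by pinv_of (illustrative, not part of the commit): the decoder reuses the encoder's filters, pseudo-inverted via torch.pinverse at forward time.

import torch
from DPTNet_eval.asteroid_test.filterbanks import FreeFB
from DPTNet_eval.asteroid_test.filterbanks.enc_dec import Encoder, Decoder

fb = FreeFB(n_filters=64, kernel_size=16, stride=8)
enc = Encoder(fb)               # behaves like nn.Conv1d on (batch, 1, time)
dec = Decoder.pinv_of(fb)       # shares fb's filters, pseudo-inverted

spec = enc(torch.randn(1, 1, 8000))   # (1, 64, 999)
wave = dec(spec)                      # (1, 1, 8000)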
DPTNet_eval/asteroid_test/filterbanks/free_fb.py
ADDED
@@ -0,0 +1,33 @@
import torch
import torch.nn as nn
from .enc_dec import Filterbank


class FreeFB(Filterbank):
    """Free filterbank without any constraints. Equivalent to
    :class:`nn.Conv1d`.

    Args:
        n_filters (int): Number of filters.
        kernel_size (int): Length of the filters.
        stride (int, optional): Stride of the convolution.
            If None (default), set to ``kernel_size // 2``.

    Attributes:
        n_feats_out (int): Number of output filters.

    References:
        [1] : "Filterbank design for end-to-end speech separation".
        Submitted to ICASSP 2020. Manuel Pariente, Samuele Cornell,
        Antoine Deleforge, Emmanuel Vincent.
    """

    def __init__(self, n_filters, kernel_size, stride=None, **kwargs):
        super(FreeFB, self).__init__(n_filters, kernel_size, stride=stride)
        self._filters = nn.Parameter(torch.ones(n_filters, 1, kernel_size))
        for p in self.parameters():
            nn.init.xavier_normal_(p)

    @property
    def filters(self):
        return self._filters
DPTNet_eval/asteroid_test/masknn/__init__.py
ADDED
@@ -0,0 +1,12 @@
# from .convolutional import TDConvNet, TDConvNetpp, SuDORMRF, SuDORMRFImproved
# from .recurrent import DPRNN, LSTMMasker
from .attention import DPTransformer

__all__ = [
    # "TDConvNet",
    # "DPRNN",
    "DPTransformer",
    # "LSTMMasker",
    # "SuDORMRF",
    # "SuDORMRFImproved",
]
DPTNet_eval/asteroid_test/masknn/activations.py
ADDED
@@ -0,0 +1,82 @@
from functools import partial
import torch
from torch import nn


class Swish(nn.Module):
    def __init__(self):
        super(Swish, self).__init__()

    def forward(self, x):
        return x * torch.sigmoid(x)


def linear():
    return nn.Identity()


def relu():
    return nn.ReLU()


def prelu():
    return nn.PReLU()


def leaky_relu():
    return nn.LeakyReLU()


def sigmoid():
    return nn.Sigmoid()


def softmax(dim=None):
    return nn.Softmax(dim=dim)


def tanh():
    return nn.Tanh()


def gelu():
    return nn.GELU()


def swish():
    return Swish()


def register_activation(custom_act):
    """Register a custom activation, gettable with `activation.get`.

    Args:
        custom_act: Custom activation function to register.

    """
    if custom_act.__name__ in globals().keys() or custom_act.__name__.lower() in globals().keys():
        raise ValueError(f"Activation {custom_act.__name__} already exists. Choose another name.")
    globals().update({custom_act.__name__: custom_act})


def get(identifier):
    """Returns an activation function from a string. Returns its input if it
    is callable (already an activation for example).

    Args:
        identifier (str or Callable or None): the activation identifier.

    Returns:
        :class:`nn.Module` or None
    """
    if identifier is None:
        return None
    elif callable(identifier):
        return identifier
    elif isinstance(identifier, str):
        cls = globals().get(identifier)
        if cls is None:
            raise ValueError("Could not interpret activation identifier: " + str(identifier))
        return cls
    else:
        raise ValueError("Could not interpret activation identifier: " + str(identifier))
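A short sketch of the string-to-module lookup above (illustrative, not part of the commit):

from torch import nn
from DPTNet_eval.asteroid_test.masknn import activations

act = activations.get("sigmoid")()   # name -> factory -> nn.Sigmoid()
assert isinstance(act, nn.Sigmoid)
same = activations.get(nn.ReLU)      # callables are returned unchanged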
DPTNet_eval/asteroid_test/masknn/attention.py
ADDED
@@ -0,0 +1,271 @@
| 1 |
+
from math import ceil
|
| 2 |
+
import warnings
|
| 3 |
+
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
from torch.nn.modules.activation import MultiheadAttention
|
| 6 |
+
from ..masknn import activations, norms
|
| 7 |
+
import torch
|
| 8 |
+
from ..dsp.overlap_add import DualPathProcessing
|
| 9 |
+
|
| 10 |
+
import inspect
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class ImprovedTransformedLayer(nn.Module):
|
| 14 |
+
"""
|
| 15 |
+
Improved Transformer module as used in [1].
|
| 16 |
+
It is Multi-Head self-attention followed by LSTM, activation and linear projection layer.
|
| 17 |
+
|
| 18 |
+
Args:
|
| 19 |
+
embed_dim (int): Number of input channels.
|
| 20 |
+
n_heads (int): Number of attention heads.
|
| 21 |
+
dim_ff (int): Number of neurons in the RNNs cell state.
|
| 22 |
+
Defaults to 256. RNN here replaces standard FF linear layer in plain Transformer.
|
| 23 |
+
dropout (float, optional): Dropout ratio, must be in [0,1].
|
| 24 |
+
activation (str, optional): activation function applied at the output of RNN.
|
| 25 |
+
bidirectional (bool, optional): True for bidirectional Inter-Chunk RNN
|
| 26 |
+
(Intra-Chunk is always bidirectional).
|
| 27 |
+
norm_type (str, optional): Type of normalization to use.
|
| 28 |
+
|
| 29 |
+
References:
|
| 30 |
+
[1] Chen, Jingjing, Qirong Mao, and Dong Liu.
|
| 31 |
+
"Dual-Path Transformer Network: Direct Context-Aware Modeling for End-to-End Monaural Speech Separation."
|
| 32 |
+
arXiv preprint arXiv:2007.13975 (2020).
|
| 33 |
+
"""
|
| 34 |
+
|
| 35 |
+
def __init__(
|
| 36 |
+
self,
|
| 37 |
+
embed_dim,
|
| 38 |
+
n_heads,
|
| 39 |
+
dim_ff,
|
| 40 |
+
dropout=0.0,
|
| 41 |
+
activation="relu",
|
| 42 |
+
bidirectional=True,
|
| 43 |
+
norm="gLN",
|
| 44 |
+
):
|
| 45 |
+
super(ImprovedTransformedLayer, self).__init__()
|
| 46 |
+
|
| 47 |
+
self.mha = MultiheadAttention(embed_dim, n_heads, dropout=dropout)
|
| 48 |
+
# self.linear_first = nn.Linear(embed_dim, 2 * dim_ff) # Added by Kay. 20201119
|
| 49 |
+
self.dropout = nn.Dropout(dropout)
|
| 50 |
+
self.recurrent = nn.LSTM(embed_dim, dim_ff, bidirectional=bidirectional, batch_first=True)
|
| 51 |
+
ff_inner_dim = 2 * dim_ff if bidirectional else dim_ff
|
| 52 |
+
self.linear = nn.Linear(ff_inner_dim, embed_dim)
|
| 53 |
+
self.activation = activations.get(activation)()
|
| 54 |
+
self.norm_mha = norms.get(norm)(embed_dim)
|
| 55 |
+
self.norm_ff = norms.get(norm)(embed_dim)
|
| 56 |
+
|
| 57 |
+
def forward(self, x):
|
| 58 |
+
tomha = x.permute(2, 0, 1)
|
| 59 |
+
# x is batch, channels, seq_len
|
| 60 |
+
# mha is seq_len, batch, channels
|
| 61 |
+
# self-attention is applied
|
| 62 |
+
out = self.mha(tomha, tomha, tomha)[0]
|
| 63 |
+
x = self.dropout(out.permute(1, 2, 0)) + x
|
| 64 |
+
x = self.norm_mha(x)
|
| 65 |
+
|
| 66 |
+
# lstm is applied
|
| 67 |
+
out = self.linear(self.dropout(self.activation(self.recurrent(x.transpose(1, -1))[0])))
|
| 68 |
+
x = self.dropout(out.transpose(1, -1)) + x
|
| 69 |
+
return self.norm_ff(x)
|
| 70 |
+
|
| 71 |
+
''' version 0.3.4
|
| 72 |
+
def forward(self, x):
|
| 73 |
+
x = x.transpose(1, -1)
|
| 74 |
+
# x is batch, seq_len, channels
|
| 75 |
+
# self-attention is applied
|
| 76 |
+
out = self.mha(x, x, x)[0]
|
| 77 |
+
x = self.dropout(out) + x
|
| 78 |
+
x = self.norm_mha(x.transpose(1, -1)).transpose(1, -1)
|
| 79 |
+
|
| 80 |
+
# lstm is applied
|
| 81 |
+
out = self.linear(self.dropout(self.activation(self.recurrent(x)[0])))
|
| 82 |
+
# out = self.linear(self.dropout(self.activation(self.linear_first(x)[0])))
|
| 83 |
+
x = self.dropout(out) + x
|
| 84 |
+
return self.norm_ff(x.transpose(1, -1))
|
| 85 |
+
'''
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
class DPTransformer(nn.Module):
|
| 89 |
+
"""Dual-path Transformer introduced in [1].
|
| 90 |
+
|
| 91 |
+
Args:
|
| 92 |
+
in_chan (int): Number of input filters.
|
| 93 |
+
n_src (int): Number of masks to estimate.
|
| 94 |
+
n_heads (int): Number of attention heads.
|
| 95 |
+
ff_hid (int): Number of neurons in the RNNs cell state.
|
| 96 |
+
Defaults to 256.
|
| 97 |
+
chunk_size (int): window size of overlap and add processing.
|
| 98 |
+
Defaults to 100.
|
| 99 |
+
hop_size (int or None): hop size (stride) of overlap and add processing.
|
| 100 |
+
Default to `chunk_size // 2` (50% overlap).
|
| 101 |
+
n_repeats (int): Number of repeats. Defaults to 6.
|
| 102 |
+
norm_type (str, optional): Type of normalization to use.
|
| 103 |
+
ff_activation (str, optional): activation function applied at the output of RNN.
|
| 104 |
+
mask_act (str, optional): Which non-linear function to generate mask.
|
| 105 |
+
bidirectional (bool, optional): True for bidirectional Inter-Chunk RNN
|
| 106 |
+
(Intra-Chunk is always bidirectional).
|
| 107 |
+
dropout (float, optional): Dropout ratio, must be in [0,1].
|
| 108 |
+
|
| 109 |
+
References
|
| 110 |
+
[1] Chen, Jingjing, Qirong Mao, and Dong Liu. "Dual-Path Transformer
|
| 111 |
+
Network: Direct Context-Aware Modeling for End-to-End Monaural Speech Separation."
|
| 112 |
+
arXiv (2020).
|
| 113 |
+
"""
|
| 114 |
+
|
| 115 |
+
def __init__(
|
| 116 |
+
self,
|
| 117 |
+
in_chan,
|
| 118 |
+
n_src,
|
| 119 |
+
n_heads=4,
|
| 120 |
+
ff_hid=256,
|
| 121 |
+
chunk_size=100,
|
| 122 |
+
hop_size=None,
|
| 123 |
+
n_repeats=6,
|
| 124 |
+
norm_type="gLN",
|
| 125 |
+
ff_activation="relu",
|
| 126 |
+
mask_act="relu",
|
| 127 |
+
bidirectional=True,
|
| 128 |
+
dropout=0,
|
| 129 |
+
):
|
| 130 |
+
super(DPTransformer, self).__init__()
|
| 131 |
+
self.in_chan = in_chan
|
| 132 |
+
self.n_src = n_src
|
| 133 |
+
self.n_heads = n_heads
|
| 134 |
+
self.ff_hid = ff_hid
|
| 135 |
+
self.chunk_size = chunk_size
|
| 136 |
+
hop_size = hop_size if hop_size is not None else chunk_size // 2
|
| 137 |
+
self.hop_size = hop_size
|
| 138 |
+
self.n_repeats = n_repeats
|
| 139 |
+
self.n_src = n_src
|
| 140 |
+
self.norm_type = norm_type
|
| 141 |
+
self.ff_activation = ff_activation
|
| 142 |
+
self.mask_act = mask_act
|
| 143 |
+
self.bidirectional = bidirectional
|
| 144 |
+
self.dropout = dropout
|
| 145 |
+
|
| 146 |
+
# version 0.3.4
|
| 147 |
+
# self.in_norm = norms.get(norm_type)(in_chan)
|
| 148 |
+
self.mha_in_dim = ceil(self.in_chan / self.n_heads) * self.n_heads
|
| 149 |
+
if self.in_chan % self.n_heads != 0:
|
| 150 |
+
warnings.warn(
|
| 151 |
+
f"DPTransformer input dim ({self.in_chan}) is not a multiple of the number of "
|
| 152 |
+
f"heads ({self.n_heads}). Adding extra linear layer at input to accomodate "
|
| 153 |
+
f"(size [{self.in_chan} x {self.mha_in_dim}])"
|
| 154 |
+
)
|
| 155 |
+
self.input_layer = nn.Linear(self.in_chan, self.mha_in_dim)
|
| 156 |
+
else:
|
| 157 |
+
self.input_layer = None
|
| 158 |
+
|
| 159 |
+
self.in_norm = norms.get(norm_type)(self.mha_in_dim)
|
| 160 |
+
self.ola = DualPathProcessing(self.chunk_size, self.hop_size)
|
| 161 |
+
|
| 162 |
+
# Succession of DPRNNBlocks.
|
| 163 |
+
self.layers = nn.ModuleList([])
|
| 164 |
+
for x in range(self.n_repeats):
|
| 165 |
+
self.layers.append(
|
| 166 |
+
nn.ModuleList(
|
| 167 |
+
[
|
| 168 |
+
ImprovedTransformedLayer(
|
| 169 |
+
self.mha_in_dim,
|
| 170 |
+
self.n_heads,
|
| 171 |
+
self.ff_hid,
|
| 172 |
+
self.dropout,
|
| 173 |
+
self.ff_activation,
|
| 174 |
+
True,
|
| 175 |
+
self.norm_type,
|
| 176 |
+
),
|
| 177 |
+
ImprovedTransformedLayer(
|
| 178 |
+
self.mha_in_dim,
|
| 179 |
+
self.n_heads,
|
| 180 |
+
self.ff_hid,
|
| 181 |
+
self.dropout,
|
| 182 |
+
self.ff_activation,
|
| 183 |
+
self.bidirectional,
|
| 184 |
+
                            self.norm_type,
                        ),
                    ]
                )
            )
        net_out_conv = nn.Conv2d(self.mha_in_dim, n_src * self.in_chan, 1)
        self.first_out = nn.Sequential(nn.PReLU(), net_out_conv)
        # Gating and masking in 2D space (after fold)
        self.net_out = nn.Sequential(nn.Conv1d(self.in_chan, self.in_chan, 1), nn.Tanh())
        self.net_gate = nn.Sequential(nn.Conv1d(self.in_chan, self.in_chan, 1), nn.Sigmoid())

        # Get activation function.
        mask_nl_class = activations.get(mask_act)
        # For softmax, feed the source dimension.
        if has_arg(mask_nl_class, "dim"):
            self.output_act = mask_nl_class(dim=1)
        else:
            self.output_act = mask_nl_class()

    def forward(self, mixture_w):
        r"""Forward.

        Args:
            mixture_w (:class:`torch.Tensor`): Tensor of shape $(batch, nfilters, nframes)$

        Returns:
            :class:`torch.Tensor`: estimated mask of shape $(batch, nsrc, nfilters, nframes)$
        """
        if self.input_layer is not None:
            mixture_w = self.input_layer(mixture_w.transpose(1, 2)).transpose(1, 2)
        mixture_w = self.in_norm(mixture_w)  # [batch, bn_chan, n_frames]
        n_orig_frames = mixture_w.shape[-1]

        mixture_w = self.ola.unfold(mixture_w)
        batch, n_filters, self.chunk_size, n_chunks = mixture_w.size()

        for layer_idx in range(len(self.layers)):
            intra, inter = self.layers[layer_idx]
            mixture_w = self.ola.intra_process(mixture_w, intra)
            mixture_w = self.ola.inter_process(mixture_w, inter)

        output = self.first_out(mixture_w)
        output = output.reshape(batch * self.n_src, self.in_chan, self.chunk_size, n_chunks)
        output = self.ola.fold(output, output_size=n_orig_frames)

        output = self.net_out(output) * self.net_gate(output)
        # Compute mask
        output = output.reshape(batch, self.n_src, self.in_chan, -1)
        est_mask = self.output_act(output)
        return est_mask

    def get_config(self):
        config = {
            "in_chan": self.in_chan,
            "ff_hid": self.ff_hid,
            "n_heads": self.n_heads,
            "chunk_size": self.chunk_size,
            "hop_size": self.hop_size,
            "n_repeats": self.n_repeats,
            "n_src": self.n_src,
            "norm_type": self.norm_type,
            "ff_activation": self.ff_activation,
            "mask_act": self.mask_act,
            "bidirectional": self.bidirectional,
            "dropout": self.dropout,
        }
        return config


def has_arg(fn, name):
    """Checks if a callable accepts a given keyword argument.

    Args:
        fn (callable): Callable to inspect.
        name (str): Check if `fn` can be called with `name` as a keyword
            argument.

    Returns:
        bool: whether `fn` accepts a `name` keyword argument.
    """
    signature = inspect.signature(fn)
    parameter = signature.parameters.get(name)
    if parameter is None:
        return False
    return parameter.kind in (
        inspect.Parameter.POSITIONAL_OR_KEYWORD,
        inspect.Parameter.KEYWORD_ONLY,
    )
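For illustration, this is what routes a softmax mask activation to the source dimension: `torch.nn.Softmax.__init__` exposes a `dim` keyword while `torch.nn.Sigmoid.__init__` does not. A minimal sketch (not part of the uploaded file):

import torch
from DPTNet_eval.asteroid_test.masknn.attention import has_arg

has_arg(torch.nn.Softmax, "dim")   # True  -> self.output_act = Softmax(dim=1)
has_arg(torch.nn.Sigmoid, "dim")   # False -> self.output_act = Sigmoid()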
DPTNet_eval/asteroid_test/masknn/norms.py
ADDED
@@ -0,0 +1,156 @@
from functools import partial
import torch
from torch import nn
from torch.nn.modules.batchnorm import _BatchNorm

EPS = 1e-8


class _LayerNorm(nn.Module):
    """Layer Normalization base class."""

    def __init__(self, channel_size):
        super(_LayerNorm, self).__init__()
        self.channel_size = channel_size
        self.gamma = nn.Parameter(torch.ones(channel_size), requires_grad=True)
        self.beta = nn.Parameter(torch.zeros(channel_size), requires_grad=True)

    def apply_gain_and_bias(self, normed_x):
        """ Assumes input of size `[batch, channel, *]`. """
        return (self.gamma * normed_x.transpose(1, -1) + self.beta).transpose(1, -1)


class GlobLN(_LayerNorm):
    """Global Layer Normalization (globLN)."""

    def forward(self, x):
        """Applies forward pass.

        Works for any input size > 2D.

        Args:
            x (:class:`torch.Tensor`): Shape `[batch, chan, *]`

        Returns:
            :class:`torch.Tensor`: gLN_x `[batch, chan, *]`
        """
        dims = list(range(1, len(x.shape)))
        mean = x.mean(dim=dims, keepdim=True)
        var = torch.pow(x - mean, 2).mean(dim=dims, keepdim=True)
        return self.apply_gain_and_bias((x - mean) / (var + EPS).sqrt())


class ChanLN(_LayerNorm):
    """Channel-wise Layer Normalization (chanLN)."""

    def forward(self, x):
        """Applies forward pass.

        Works for any input size > 2D.

        Args:
            x (:class:`torch.Tensor`): `[batch, chan, *]`

        Returns:
            :class:`torch.Tensor`: chanLN_x `[batch, chan, *]`
        """
        mean = torch.mean(x, dim=1, keepdim=True)
        var = torch.var(x, dim=1, keepdim=True, unbiased=False)
        return self.apply_gain_and_bias((x - mean) / (var + EPS).sqrt())


class CumLN(_LayerNorm):
    """Cumulative Global layer normalization (cumLN)."""

    def forward(self, x):
        """
        Args:
            x (:class:`torch.Tensor`): Shape `[batch, channels, length]`
        Returns:
            :class:`torch.Tensor`: cumLN_x `[batch, channels, length]`
        """
        batch, chan, spec_len = x.size()
        cum_sum = torch.cumsum(x.sum(1, keepdim=True), dim=-1)
        cum_pow_sum = torch.cumsum(x.pow(2).sum(1, keepdim=True), dim=-1)
        cnt = torch.arange(start=chan, end=chan * (spec_len + 1), step=chan, dtype=x.dtype).view(
            1, 1, -1
        )
        cum_mean = cum_sum / cnt
        # Var = E[x^2] - E[x]^2; the cumulative power sum must also be normalized by cnt.
        cum_var = cum_pow_sum / cnt - cum_mean.pow(2)
        return self.apply_gain_and_bias((x - cum_mean) / (cum_var + EPS).sqrt())


class FeatsGlobLN(_LayerNorm):
    """Feature-wise global Layer Normalization (FeatsGlobLN).
    Applies normalization over frames for each channel."""

    def forward(self, x):
        """Applies forward pass.

        Works for any input size > 2D.

        Args:
            x (:class:`torch.Tensor`): `[batch, chan, time]`

        Returns:
            :class:`torch.Tensor`: chanLN_x `[batch, chan, time]`
        """
        stop = len(x.size())
        dims = list(range(2, stop))

        mean = torch.mean(x, dim=dims, keepdim=True)
        var = torch.var(x, dim=dims, keepdim=True, unbiased=False)
        return self.apply_gain_and_bias((x - mean) / (var + EPS).sqrt())


class BatchNorm(_BatchNorm):
    """Wrapper class for pytorch BatchNorm1D and BatchNorm2D"""

    def _check_input_dim(self, input):
        if input.dim() < 2 or input.dim() > 4:
            raise ValueError("expected 4D or 3D input (got {}D input)".format(input.dim()))


# Aliases.
gLN = GlobLN
fgLN = FeatsGlobLN
cLN = ChanLN
cgLN = CumLN
bN = BatchNorm


def register_norm(custom_norm):
    """Register a custom norm, gettable with `norms.get`.

    Args:
        custom_norm: Custom norm to register.
    """
    if custom_norm.__name__ in globals().keys() or custom_norm.__name__.lower() in globals().keys():
        raise ValueError(f"Norm {custom_norm.__name__} already exists. Choose another name.")
    globals().update({custom_norm.__name__: custom_norm})


def get(identifier):
    """Returns a norm class from a string. Returns its input if it
    is callable (already a :class:`._LayerNorm` for example).

    Args:
        identifier (str or Callable or None): the norm identifier.

    Returns:
        :class:`._LayerNorm` or None
    """
    if identifier is None:
        return None
    elif callable(identifier):
        return identifier
    elif isinstance(identifier, str):
        cls = globals().get(identifier)
        if cls is None:
            raise ValueError("Could not interpret normalization identifier: " + str(identifier))
        return cls
    else:
        raise ValueError("Could not interpret normalization identifier: " + str(identifier))
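As a usage sketch (shapes illustrative; not part of the uploaded file), the string identifiers map straight to the classes through the aliases above:

import torch
from DPTNet_eval.asteroid_test.masknn import norms

norm = norms.get("gLN")(channel_size=64)   # "gLN" resolves to GlobLN via the alias
x = torch.randn(4, 64, 1000)               # [batch, chan, time]
assert norm(x).shape == x.shape            # normalized over all dims except batch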
DPTNet_eval/asteroid_test/models/__init__.py
ADDED
@@ -0,0 +1,59 @@
# Models
# from .conv_tasnet import ConvTasNet
# from .dccrnet import DCCRNet
# from .dcunet import DCUNet
# from .dprnn_tasnet import DPRNNTasNet
# from .sudormrf import SuDORMRFImprovedNet, SuDORMRFNet
from .dptnet import DPTNet
# from .lstm_tasnet import LSTMTasNet
# from .demask import DeMask

# Sharing-related
# from .publisher import save_publishable, upload_publishable

__all__ = [
    # "ConvTasNet",
    # "DPRNNTasNet",
    # "SuDORMRFImprovedNet",
    # "SuDORMRFNet",
    "DPTNet",
    # "LSTMTasNet",
    # "DeMask",
    # "DCUNet",
    # "DCCRNet",
    # "save_publishable",
    # "upload_publishable",
]


def register_model(custom_model):
    """Register a custom model, gettable with `models.get`.

    Args:
        custom_model: Custom model to register.
    """
    if (
        custom_model.__name__ in globals().keys()
        or custom_model.__name__.lower() in globals().keys()
    ):
        raise ValueError(f"Model {custom_model.__name__} already exists. Choose another name.")
    globals().update({custom_model.__name__: custom_model})


def get(identifier):
    """Returns a model class from a string (case-insensitive).

    Args:
        identifier (str): the model name.

    Returns:
        :class:`torch.nn.Module`
    """
    if isinstance(identifier, str):
        to_get = {k.lower(): v for k, v in globals().items()}
        cls = to_get.get(identifier.lower())
        if cls is None:
            raise ValueError(f"Could not interpret model name: {str(identifier)}")
        return cls
    raise ValueError(f"Could not interpret model name: {str(identifier)}")
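The lookup lowercases all module-level names before matching, so capitalization in the identifier doesn't matter; a minimal sketch (not part of the uploaded file):

from DPTNet_eval.asteroid_test import models

assert models.get("DPTNet") is models.get("dptnet")  # both resolve to the DPTNet class
# models.get("DPRNNTasNet") raises ValueError here: its import above is commented out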
DPTNet_eval/asteroid_test/models/base_models.py
ADDED
@@ -0,0 +1,351 @@
import os
import warnings

import numpy as np
import torch
from torch import nn

from ..masknn import activations
from ..utils.torch_utils import pad_x_to_y


def _unsqueeze_to_3d(x):
    if x.ndim == 1:
        return x.reshape(1, 1, -1)
    elif x.ndim == 2:
        return x.unsqueeze(1)
    else:
        return x


class BaseModel(nn.Module):
    def __init__(self):
        print("initialize BaseModel")
        super().__init__()

    def forward(self, *args, **kwargs):
        raise NotImplementedError

    @torch.no_grad()
    def separate(self, wav, output_dir=None, force_overwrite=False, **kwargs):
        """Infer separated sources from input waveforms.
        Also supports filenames.

        Args:
            wav (Union[torch.Tensor, numpy.ndarray, str]): waveform array/tensor.
                Shape: 1D, 2D or 3D tensor, time last.
            output_dir (str): path to save all the wav files. If None,
                estimated sources will be saved next to the original ones.
            force_overwrite (bool): whether to overwrite existing files.
            **kwargs: keyword arguments to be passed to `_separate`.

        Returns:
            Union[torch.Tensor, numpy.ndarray, None], the estimated sources.
                (batch, n_src, time) or (n_src, time) w/o batch dim.

        .. note::
            By default, `separate` calls `_separate` which calls `forward`.
            For models whose `forward` doesn't return waveform tensors,
            overwrite `_separate` to return waveform tensors.
        """
        if isinstance(wav, str):
            self.file_separate(
                wav, output_dir=output_dir, force_overwrite=force_overwrite, **kwargs
            )
        elif isinstance(wav, np.ndarray):
            print("is ndarray")
            # import pdb ; pdb.set_trace()
            return self.numpy_separate(wav, **kwargs)
        elif isinstance(wav, torch.Tensor):
            print("is torch.Tensor")
            return self.torch_separate(wav, **kwargs)
        else:
            raise ValueError(
                f"Only support filenames, numpy arrays and torch tensors, received {type(wav)}"
            )

    def torch_separate(self, wav: torch.Tensor, **kwargs) -> torch.Tensor:
        """ Core logic of `separate`."""
        # Handle device placement
        input_device = wav.device
        model_device = next(self.parameters()).device
        wav = wav.to(model_device)
        # Forward
        out_wavs = self._separate(wav, **kwargs)

        # FIXME: for now this is the best we can do.
        out_wavs *= wav.abs().sum() / (out_wavs.abs().sum())

        # Back to input device (and numpy if necessary)
        out_wavs = out_wavs.to(input_device)
        return out_wavs

    def numpy_separate(self, wav: np.ndarray, **kwargs) -> np.ndarray:
        """ Numpy interface to `separate`."""
        wav = torch.from_numpy(wav)
        out_wav = self.torch_separate(wav, **kwargs)
        out_wav = out_wav.data.numpy()
        return out_wav

    def file_separate(
        self, filename: str, output_dir=None, force_overwrite=False, **kwargs
    ) -> None:
        """ Filename interface to `separate`."""
        import soundfile as sf

        wav, fs = sf.read(filename, dtype="float32", always_2d=True)
        # FIXME: support only single-channel files for now.
        to_save = self.numpy_separate(wav[:, 0], **kwargs)

        # Save wav files to filename_est1.wav etc...
        for src_idx, est_src in enumerate(to_save):
            base = ".".join(filename.split(".")[:-1])
            save_name = base + "_est{}.".format(src_idx + 1) + filename.split(".")[-1]
            if os.path.isfile(save_name) and not force_overwrite:
                warnings.warn(
                    f"File {save_name} already exists, pass `force_overwrite=True` to overwrite it",
                    UserWarning,
                )
                return
            if output_dir is not None:
                save_name = os.path.join(output_dir, save_name.split("/")[-1])
            sf.write(save_name, est_src, fs)

    def _separate(self, wav, *args, **kwargs):
        """Hidden separation method

        Args:
            wav (Union[torch.Tensor, numpy.ndarray, str]): waveform array/tensor.
                Shape: 1D, 2D or 3D tensor, time last.

        Returns:
            The output of self(wav, *args, **kwargs).
        """
        return self(wav, *args, **kwargs)

    @classmethod
    def from_pretrained(cls, pretrained_model_conf_or_path, *args, **kwargs):
        """Instantiate separation model from a model config (file or dict).

        Args:
            pretrained_model_conf_or_path (Union[dict, str]): model conf as
                returned by `serialize`, or path to it. Needs to contain
                `model_args` and `state_dict` keys.
            *args: Positional arguments to be passed to the model.
            **kwargs: Keyword arguments to be passed to the model.
                They overwrite the ones in the model package.

        Returns:
            nn.Module corresponding to the pretrained model conf/URL.

        Raises:
            ValueError if the input config file doesn't contain the keys
            `model_name`, `model_args` or `state_dict`.
        """
        from . import get  # Avoid circular imports

        if isinstance(pretrained_model_conf_or_path, str):
            # cached_model = self.cached_download(pretrained_model_conf_or_path)
            if os.path.isfile(pretrained_model_conf_or_path):
                cached_model = pretrained_model_conf_or_path
            else:
                raise ValueError(
                    "Model {} is not a file or doesn't exist.".format(pretrained_model_conf_or_path)
                )

            conf = torch.load(cached_model, map_location="cpu")
        else:
            conf = pretrained_model_conf_or_path

        if "model_name" not in conf.keys():
            raise ValueError(
                "Expected config dictionary to have field "
                "`model_name`. Found only: {}".format(conf.keys())
            )
        if "state_dict" not in conf.keys():
            raise ValueError(
                "Expected config dictionary to have field "
                "`state_dict`. Found only: {}".format(conf.keys())
            )
        if "model_args" not in conf.keys():
            raise ValueError(
                "Expected config dictionary to have field "
                "`model_args`. Found only: {}".format(conf.keys())
            )
        conf["model_args"].update(kwargs)  # kwargs overwrite config.
        # Attempt to find the model and instantiate it.
        try:
            model_class = get(conf["model_name"])
        except ValueError:  # Couldn't get the model, maybe custom.
            model = cls(*args, **conf["model_args"])  # Child class.
        else:
            model = model_class(*args, **conf["model_args"])
        model.load_state_dict(conf["state_dict"])
        return model

    def serialize(self):
        """Serialize model and output dictionary.

        Returns:
            dict, serialized model with keys `model_args` and `state_dict`.
        """
        import pytorch_lightning as pl  # Not used in torch.hub

        from .. import __version__ as asteroid_version  # Avoid circular imports

        model_conf = dict(
            model_name=self.__class__.__name__,
            state_dict=self.get_state_dict(),
            model_args=self.get_model_args(),
        )
        # Additional infos
        infos = dict()
        infos["software_versions"] = dict(
            torch_version=torch.__version__,
            pytorch_lightning_version=pl.__version__,
            asteroid_version=asteroid_version,
        )
        model_conf["infos"] = infos
        return model_conf

    def get_state_dict(self):
        """ In case the state dict needs to be modified before sharing the model."""
        return self.state_dict()

    def get_model_args(self):
        raise NotImplementedError

    def cached_download(self, filename_or_url):
        if os.path.isfile(filename_or_url):
            print("is file")
            return filename_or_url
        else:
            print("Model {} is not a file or doesn't exist.".format(filename_or_url))


class BaseEncoderMaskerDecoder(BaseModel):
    """Base class for encoder-masker-decoder separation models.

    Args:
        encoder (Encoder): Encoder instance.
        masker (nn.Module): masker network.
        decoder (Decoder): Decoder instance.
        encoder_activation (Optional[str], optional): Activation to apply after encoder.
            See ``asteroid.masknn.activations`` for valid values.
    """

    def __init__(self, encoder, masker, decoder, encoder_activation=None):
        super().__init__()
        self.encoder = encoder
        self.masker = masker
        self.decoder = decoder

        self.encoder_activation = encoder_activation
        self.enc_activation = activations.get(encoder_activation or "linear")()

    def forward(self, wav):
        """Enc/Mask/Dec model forward

        Args:
            wav (torch.Tensor): waveform tensor. 1D, 2D or 3D tensor, time last.

        Returns:
            torch.Tensor, of shape (batch, n_src, time) or (n_src, time).
        """
        # Handle 1D, 2D or n-D inputs
        was_one_d = wav.ndim == 1
        # Reshape to (batch, n_mix, time)
        wav = _unsqueeze_to_3d(wav)

        # Real forward
        tf_rep = self.encoder(wav)
        tf_rep = self.postprocess_encoded(tf_rep)
        tf_rep = self.enc_activation(tf_rep)

        est_masks = self.masker(tf_rep)
        est_masks = self.postprocess_masks(est_masks)

        masked_tf_rep = est_masks * tf_rep.unsqueeze(1)
        masked_tf_rep = self.postprocess_masked(masked_tf_rep)

        decoded = self.decoder(masked_tf_rep)
        decoded = self.postprocess_decoded(decoded)

        reconstructed = pad_x_to_y(decoded, wav)
        if was_one_d:
            return reconstructed.squeeze(0)
        else:
            return reconstructed

    def postprocess_encoded(self, tf_rep):
        """Hook to perform transformations on the encoded, time-frequency domain
        representation (output of the encoder) before encoder activation is applied.

        Args:
            tf_rep (Tensor of shape (batch, freq, time)):
                Output of the encoder, before encoder activation is applied.

        Return:
            Transformed `tf_rep`
        """
        return tf_rep

    def postprocess_masks(self, masks):
        """Hook to perform transformations on the masks (output of the masker) before
        masks are applied.

        Args:
            masks (Tensor of shape (batch, n_src, freq, time)):
                Output of the masker

        Return:
            Transformed `masks`
        """
        return masks

    def postprocess_masked(self, masked_tf_rep):
        """Hook to perform transformations on the masked time-frequency domain
        representation (result of masking in the time-frequency domain) before decoding.

        Args:
            masked_tf_rep (Tensor of shape (batch, n_src, freq, time)):
                Masked time-frequency representation, before decoding.

        Return:
            Transformed `masked_tf_rep`
        """
        return masked_tf_rep

    def postprocess_decoded(self, decoded):
        """Hook to perform transformations on the decoded, time domain representation
        (output of the decoder) before original shape reconstruction.

        Args:
            decoded (Tensor of shape (batch, n_src, time)):
                Output of the decoder, before original shape reconstruction.

        Return:
            Transformed `decoded`
        """
        return decoded

    def get_model_args(self):
        """ Arguments needed to re-instantiate the model. """
        fb_config = self.encoder.filterbank.get_config()
        masknet_config = self.masker.get_config()
        # Assert both dicts are disjoint
        if not all(k not in fb_config for k in masknet_config):
            raise AssertionError(
                "Filterbank and Mask network config share "
                "common keys. Merging them is not safe."
            )
        # Merge all args under model_args.
        model_args = {
            **fb_config,
            **masknet_config,
            "encoder_activation": self.encoder_activation,
        }
        return model_args


# Backwards compatibility
BaseTasNet = BaseEncoderMaskerDecoder
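A minimal usage sketch for `separate` (the checkpoint path is hypothetical; note that `torch_separate` rescales the estimates so their summed magnitude matches the input's):

import numpy as np
from DPTNet_eval.asteroid_test.models import DPTNet

model = DPTNet.from_pretrained("exp/best_model.pth")   # hypothetical serialized conf
mixture = np.random.randn(8000).astype("float32")      # 1D mixture, time last
est_sources = model.separate(mixture)                  # ndarray in -> ndarray out
# est_sources.shape == (n_src, 8000): forward squeezes the batch dim for 1D input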
DPTNet_eval/asteroid_test/models/dptnet.py
ADDED
@@ -0,0 +1,96 @@
from ..filterbanks import make_enc_dec
from ..masknn import DPTransformer
from .base_models import BaseEncoderMaskerDecoder


class DPTNet(BaseEncoderMaskerDecoder):
    """DPTNet separation model, as described in [1].

    Args:
        n_src (int): Number of masks to estimate.
        out_chan (int or None): Number of bins in the estimated masks.
            Defaults to `in_chan`.
        bn_chan (int): Number of channels after the bottleneck.
            Defaults to 128.
        hid_size (int): Number of neurons in the RNNs cell state.
            Defaults to 128.
        chunk_size (int): window size of overlap and add processing.
            Defaults to 100.
        hop_size (int or None): hop size (stride) of overlap and add processing.
            Defaults to `chunk_size // 2` (50% overlap).
        n_repeats (int): Number of repeats. Defaults to 6.
        norm_type (str, optional): Type of normalization to use. To choose from

            - ``'gLN'``: global Layernorm
            - ``'cLN'``: channelwise Layernorm
        mask_act (str, optional): Which non-linear function to generate mask.
        bidirectional (bool, optional): True for bidirectional Inter-Chunk RNN
            (Intra-Chunk is always bidirectional).
        rnn_type (str, optional): Type of RNN used. Choose between ``'RNN'``,
            ``'LSTM'`` and ``'GRU'``.
        num_layers (int, optional): Number of layers in each RNN.
        dropout (float, optional): Dropout ratio, must be in [0, 1].
        in_chan (int, optional): Number of input channels, should be equal to
            n_filters.
        fb_name (str, className): Filterbank family from which to make encoder
            and decoder. To choose among [``'free'``, ``'analytic_free'``,
            ``'param_sinc'``, ``'stft'``].
        n_filters (int): Number of filters / Input dimension of the masker net.
        kernel_size (int): Length of the filters.
        stride (int, optional): Stride of the convolution.
            If None (default), set to ``kernel_size // 2``.
        **fb_kwargs (dict): Additional kwargs to pass to the filterbank
            creation.

    References:
        [1]: Jingjing Chen et al. "Dual-Path Transformer Network: Direct
        Context-Aware Modeling for End-to-End Monaural Speech Separation"
        Interspeech 2020.
    """

    def __init__(
        self,
        n_src,
        ff_hid=256,
        chunk_size=100,
        hop_size=None,
        n_repeats=6,
        norm_type="gLN",
        ff_activation="relu",
        encoder_activation="relu",
        mask_act="relu",
        bidirectional=True,
        dropout=0,
        in_chan=None,
        fb_name="free",
        kernel_size=16,
        n_filters=64,
        stride=8,
        **fb_kwargs,
    ):
        encoder, decoder = make_enc_dec(
            fb_name, kernel_size=kernel_size, n_filters=n_filters, stride=stride, **fb_kwargs
        )
        n_feats = encoder.n_feats_out
        if in_chan is not None:
            assert in_chan == n_feats, (
                "Number of filterbank output channels"
                " and number of input channels should "
                "be the same. Received "
                f"{n_feats} and {in_chan}"
            )
        # Update in_chan
        masker = DPTransformer(
            n_feats,
            n_src,
            ff_hid=ff_hid,
            ff_activation=ff_activation,
            chunk_size=chunk_size,
            hop_size=hop_size,
            n_repeats=n_repeats,
            norm_type=norm_type,
            mask_act=mask_act,
            bidirectional=bidirectional,
            dropout=dropout,
        )
        super().__init__(encoder, masker, decoder, encoder_activation=encoder_activation)
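A construction sketch using the defaults above (shapes illustrative; not part of the uploaded file):

import torch
from DPTNet_eval.asteroid_test.models import DPTNet

model = DPTNet(n_src=2, n_repeats=2, chunk_size=100, hop_size=50)
wav = torch.randn(1, 1, 16000)   # (batch, n_mix, time)
est = model(wav)                 # (batch, n_src, time), padded back to input length via pad_x_to_y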
DPTNet_eval/asteroid_test/utils/__init__.py
ADDED
@@ -0,0 +1,9 @@
from .torch_utils import tensors_to_device, to_cuda

# The functions above were all in asteroid/utils.py before refactoring into
# asteroid/utils/*_utils.py files. They are imported for backward compatibility.

__all__ = [
    "tensors_to_device",
    "to_cuda",
]
DPTNet_eval/asteroid_test/utils/torch_utils.py
ADDED
@@ -0,0 +1,126 @@
import torch
from torch import nn
from collections import OrderedDict


def to_cuda(tensors):  # pragma: no cover (No CUDA on travis)
    """Transfer tensor, dict or list of tensors to GPU.

    Args:
        tensors (:class:`torch.Tensor`, list or dict): May be a single, a
            list or a dictionary of tensors.

    Returns:
        :class:`torch.Tensor`:
            Same as input but transferred to cuda. Goes through lists and dicts
            and transfers the torch.Tensor to cuda. Leaves the rest untouched.
    """
    if isinstance(tensors, torch.Tensor):
        return tensors.cuda()
    if isinstance(tensors, list):
        return [to_cuda(tens) for tens in tensors]
    if isinstance(tensors, dict):
        for key in tensors.keys():
            tensors[key] = to_cuda(tensors[key])
        return tensors
    raise TypeError(
        "tensors must be a tensor or a list or dict of tensors. "
        "Got tensors of type {}".format(type(tensors))
    )


def tensors_to_device(tensors, device):
    """Transfer tensor, dict or list of tensors to device.

    Args:
        tensors (:class:`torch.Tensor`): May be a single, a list or a
            dictionary of tensors.
        device (:class:`torch.device`): the device where to place the tensors.

    Returns:
        Union[:class:`torch.Tensor`, list, tuple, dict]:
            Same as input but transferred to device.
            Goes through lists and dicts and transfers the torch.Tensor to
            device. Leaves the rest untouched.
    """
    if isinstance(tensors, torch.Tensor):
        return tensors.to(device)
    elif isinstance(tensors, (list, tuple)):
        return [tensors_to_device(tens, device) for tens in tensors]
    elif isinstance(tensors, dict):
        for key in tensors.keys():
            tensors[key] = tensors_to_device(tensors[key], device)
        return tensors
    else:
        return tensors


def pad_x_to_y(x, y, axis=-1):
    """Pad first argument to have same size as second argument.

    Args:
        x (torch.Tensor): Tensor to be padded.
        y (torch.Tensor): Tensor to pad x to.
        axis (int): Axis to pad on.

    Returns:
        torch.Tensor, x padded to match y's shape.
    """
    if axis != -1:
        raise NotImplementedError
    inp_len = y.size(axis)
    output_len = x.size(axis)
    return nn.functional.pad(x, [0, inp_len - output_len])


def load_state_dict_in(state_dict, model):
    """Strictly loads state_dict in model, or the next submodel.
    Useful to load a standalone model after training it with System.

    Args:
        state_dict (OrderedDict): the state_dict to load.
        model (torch.nn.Module): the model to load it into.

    Returns:
        torch.nn.Module: model with loaded weights.

    .. note:: Keys in a state_dict look like ``object1.object2.layer_name.weight.etc``.
        We first try to load the model in the classic way.
        If this fails, we remove the first left part of the key to obtain
        ``object2.layer_name.weight.etc``.
        Blindly loading with ``strict=False`` should be done with some logging
        of the missing keys in the state_dict and the model.
    """
    try:
        # This can fail if the model was included into a bigger nn.Module
        # object. For example, into System.
        model.load_state_dict(state_dict, strict=True)
    except RuntimeError:
        # Keys look like object1.object2.layer_name.weight.etc
        # The following removes the first left part of the key to obtain
        # object2.layer_name.weight.etc, then retries a strict load.
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            new_k = k[k.find(".") + 1 :]
            new_state_dict[new_k] = v
        model.load_state_dict(new_state_dict, strict=True)
    return model


def are_models_equal(model1, model2):
    """Check for weights equality between models.

    Args:
        model1 (nn.Module): model instance to be compared.
        model2 (nn.Module): second model instance to be compared.

    Returns:
        bool: Whether all model weights are equal.
    """
    for p1, p2 in zip(model1.parameters(), model2.parameters()):
        if p1.data.ne(p2.data).sum() > 0:
            return False
    return True
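Two quick sketches for the helpers above (shapes illustrative; `pad_x_to_y` is what trims or pads the decoder output back to the mixture length in `BaseEncoderMaskerDecoder.forward`):

import torch
from DPTNet_eval.asteroid_test.utils.torch_utils import pad_x_to_y, tensors_to_device

x = torch.randn(2, 3, 15992)    # e.g. decoder output, slightly short
y = torch.randn(2, 1, 16000)    # mixture to match
pad_x_to_y(x, y).shape          # torch.Size([2, 3, 16000]); a negative pad would trim instead

batch = {"mix": torch.randn(4, 16000), "id": "utt1"}
batch = tensors_to_device(batch, torch.device("cpu"))   # tensors moved, non-tensors untouched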